001    package org.maltparser.core.syntaxgraph.reader;
002    
003    import java.io.BufferedReader;
004    import java.io.FileInputStream;
005    import java.io.FileNotFoundException;
006    import java.io.IOException;
007    import java.io.InputStream;
008    import java.io.InputStreamReader;
009    import java.io.UnsupportedEncodingException;
010    import java.net.URL;
011    import java.util.Iterator;
012    
013    import org.maltparser.core.exception.MaltChainedException;
014    import org.maltparser.core.io.dataformat.ColumnDescription;
015    import org.maltparser.core.io.dataformat.DataFormatException;
016    import org.maltparser.core.io.dataformat.DataFormatInstance;
017    import org.maltparser.core.syntaxgraph.DependencyStructure;
018    import org.maltparser.core.syntaxgraph.Element;
019    import org.maltparser.core.syntaxgraph.TokenStructure;
020    import org.maltparser.core.syntaxgraph.edge.Edge;
021    /**
022    *
023    *
024    * @author Johan Hall
025    */
026    public class TabReader implements SyntaxGraphReader {
027            private BufferedReader reader;
028            private int sentenceCount;
029            private final StringBuilder input;
030            private DataFormatInstance dataFormatInstance;
031            private static final String IGNORE_COLUMN_SIGN = "_";
032            private static final char TAB = '\t';
033            private static final char NEWLINE = '\n';
034            private static final char CARRIAGE_RETURN = '\r';
035            
036            
037            public TabReader() { 
038                    input = new StringBuilder();
039            }
040            
041            public void open(String fileName, String charsetName) throws MaltChainedException {
042                    try {
043                            open(new FileInputStream(fileName), charsetName);
044                    }catch (FileNotFoundException e) {
045                            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
046                    }
047            }
048            
049            public void open(URL url, String charsetName) throws MaltChainedException {
050                    try {
051                            open(url.openStream(), charsetName);
052                    } catch (IOException e) {
053                            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
054                    }
055            }
056            
057            public void open(InputStream is, String charsetName) throws MaltChainedException {
058                    try {
059                            open(new InputStreamReader(is, charsetName));
060                    } catch (UnsupportedEncodingException e) {
061                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
062                    }
063            }
064            
065            public void open(InputStreamReader isr) throws MaltChainedException {
066                    setReader(new BufferedReader(isr));
067                    setSentenceCount(0);
068            }
069            
070            public void readProlog() throws MaltChainedException {
071                    
072            }
073            
074            public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
075                    if (syntaxGraph == null || dataFormatInstance == null) {
076                            return false;
077                    }
078                    
079                    Element node = null;
080                    Edge edge = null;
081                    input.setLength(0);
082                    int i = 0;
083                    int terminalCounter = 0;
084                    int nNewLines = 0;
085    
086                    syntaxGraph.clear();
087                    Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
088                    while (true) {
089                            int c;
090    
091                            try {
092                                    c = reader.read();
093                            } catch (IOException e) {
094                                    close();
095                                    throw new DataFormatException("Error when reading from the input file. ", e);
096                            }
097                            if (c == TAB || c == NEWLINE || c == CARRIAGE_RETURN || c == -1) {
098                                    if (input.length() != 0) {                                      
099                                            if (i == 0) {
100                                                    terminalCounter++;
101                                                    node = syntaxGraph.addTokenNode(terminalCounter);
102                                            }
103                                            ColumnDescription column = null;
104                                            if (columns.hasNext()) {
105                                                    column = columns.next();
106                                                    if (column.getCategory() == ColumnDescription.INPUT && node != null) {
107                                                            syntaxGraph.addLabel(node, column.getName(), input.toString());
108                                                    } else if (column.getCategory() == ColumnDescription.HEAD) {
109                                                            if (syntaxGraph instanceof DependencyStructure) {
110                                                                    if (!input.toString().equals(IGNORE_COLUMN_SIGN)) {
111                                                                            edge = ((DependencyStructure)syntaxGraph).addDependencyEdge(Integer.parseInt(input.toString()), terminalCounter);
112                                                                    }
113                                                            } 
114                                                            else {
115                                                                    close();
116                                                                    throw new DataFormatException("The input graph is not a dependency graph and therefore it is not possible to add dependncy edges. ");
117                                                            }
118                                                    } else if (column.getCategory() == ColumnDescription.DEPENDENCY_EDGE_LABEL && edge != null) {
119                                                            syntaxGraph.addLabel(edge, column.getName(), input.toString());
120                                                    }
121                                            }
122                                            input.setLength(0);
123                                            nNewLines = 0;
124                                            i++;
125                                    }
126                                    if (c == NEWLINE) {
127                                            nNewLines++;
128                                            i = 0;
129                                            columns = dataFormatInstance.iterator();
130                                    }
131                            } else {
132                                    input.append((char)c);
133                            }
134                            
135                            if (nNewLines == 2 && c == NEWLINE) {
136                                    if (syntaxGraph.hasTokens()) {
137                                            sentenceCount++;
138                                    }
139                                    return true;
140                            } else if (c == -1) {
141                                    if (syntaxGraph.hasTokens()) {
142                                            sentenceCount++;
143                                    }
144                                    return false;                                   
145                            }
146                    }
147            }
148            
149            public void readEpilog() throws MaltChainedException {
150                    
151            }
152            
153            public BufferedReader getReader() {
154                    return reader;
155            }
156    
157            public void setReader(BufferedReader reader) throws MaltChainedException {
158                    close();
159                    this.reader = reader;
160            }
161            
162            public DataFormatInstance getDataFormatInstance() {
163                    return dataFormatInstance;
164            }
165    
166            public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
167                    this.dataFormatInstance = dataFormatInstance;
168            }
169    
170            public int getSentenceCount() throws MaltChainedException {
171                    return sentenceCount;
172            }
173            
174            public void setSentenceCount(int sentenceCount) {
175                    this.sentenceCount = sentenceCount;
176            }
177            
178            public String getOptions() {
179                    return null;
180            }
181            
182            public void setOptions(String optionString) throws MaltChainedException {
183                    
184            }
185            
186            public void close() throws MaltChainedException {
187                    try {
188                            if (reader != null) {
189                                    reader.close();
190                                    reader = null;
191                            }
192                    } catch (IOException e) {
193                            throw new DataFormatException("Error when closing the input file. ", e);
194                    } 
195            }
196            
197            public void clear() throws MaltChainedException {
198                    close();
199                    input.setLength(0);
200                    dataFormatInstance = null;
201                    sentenceCount = 0;
202            }
203    }