001    package org.maltparser.core.syntaxgraph.reader;
002    
003    import java.io.BufferedReader;
004    import java.io.FileInputStream;
005    import java.io.FileNotFoundException;
006    import java.io.IOException;
007    import java.io.InputStream;
008    import java.io.InputStreamReader;
009    import java.io.UnsupportedEncodingException;
010    import java.net.URL;
011    import java.util.Iterator;
012    
013    import org.maltparser.core.exception.MaltChainedException;
014    import org.maltparser.core.io.dataformat.ColumnDescription;
015    import org.maltparser.core.io.dataformat.DataFormatException;
016    import org.maltparser.core.io.dataformat.DataFormatInstance;
017    import org.maltparser.core.syntaxgraph.DependencyStructure;
018    import org.maltparser.core.syntaxgraph.Element;
019    import org.maltparser.core.syntaxgraph.TokenStructure;
020    import org.maltparser.core.syntaxgraph.edge.Edge;
021    /**
022    *
023    *
024    * @author Johan Hall
025    */
026    public class TabReader implements SyntaxGraphReader {
027            private BufferedReader reader;
028            private int sentenceCount;
029            private final StringBuilder input;
030            private DataFormatInstance dataFormatInstance;
031            private static final String IGNORE_COLUMN_SIGN = "_";
032            private static final char TAB = '\t';
033            private static final char NEWLINE = '\n';
034            private static final char CARRIAGE_RETURN = '\r';
035            private String fileName = null;
036            private URL url = null;
037            private String charsetName;
038            private int nIterations;
039            private int cIterations;
040            private boolean closeStream = true;
041            
042            public TabReader() { 
043                    input = new StringBuilder();
044                    nIterations = 1;
045                    cIterations = 1;
046            }
047            
048            private void reopen() throws MaltChainedException {
049                    close();
050                    if (fileName != null) {
051                            open(fileName, charsetName);
052                    } else if (url != null) {
053                            open(url, charsetName);
054                    } else {
055                            throw new DataFormatException("The input stream cannot be reopen. ");
056                    }
057            }
058            
059            public void open(String fileName, String charsetName) throws MaltChainedException {
060                    setFileName(fileName);
061                    setCharsetName(charsetName);
062                    try {
063                            open(new FileInputStream(fileName), charsetName);
064                    } catch (FileNotFoundException e) {
065                            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
066                    }
067            }
068            
069            public void open(URL url, String charsetName) throws MaltChainedException {
070                    setUrl(url);
071                    setCharsetName(charsetName);
072                    if (url == null) {
073                            throw new DataFormatException("The input file cannot be found. ");
074                    }
075                    try {
076                            open(url.openStream(), charsetName);
077                    } catch (IOException e) {
078                            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
079                    }
080            }
081            
082            public void open(InputStream is, String charsetName) throws MaltChainedException {
083                    try {
084                            if (is == System.in) {
085                                    closeStream = false;
086                            }
087                            open(new InputStreamReader(is, charsetName));
088                    } catch (UnsupportedEncodingException e) {
089                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
090                    }
091            }
092            
093            private void open(InputStreamReader isr) throws MaltChainedException {
094                    setReader(new BufferedReader(isr));
095                    setSentenceCount(0);
096            }
097            
098            public void readProlog() throws MaltChainedException {
099                    
100            }
101            
102            public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
103                    if (syntaxGraph == null || dataFormatInstance == null) {
104                            return false;
105                    }
106                    
107                    Element node = null;
108                    Edge edge = null;
109                    input.setLength(0);
110                    int i = 0;
111                    int terminalCounter = 0;
112                    int nNewLines = 0;
113                    syntaxGraph.clear();
114                    syntaxGraph.getSymbolTables().cleanUp();
115                    Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
116                    while (true) {
117                            int c;
118    
119                            try {
120                                    c = reader.read();
121                            } catch (IOException e) {
122                                    close();
123                                    throw new DataFormatException("Error when reading from the input file. ", e);
124                            }
125                            if (c == TAB || c == NEWLINE || c == CARRIAGE_RETURN || c == -1) {
126                                    if (input.length() != 0) {                                      
127                                            if (i == 0) {
128                                                    terminalCounter++;
129                                                    node = syntaxGraph.addTokenNode(terminalCounter);
130                                            }
131                                            if (columns.hasNext()) {
132                                                    ColumnDescription column = columns.next();
133                                                    if (column.getCategory() == ColumnDescription.INPUT && node != null) {
134                                                            syntaxGraph.addLabel(node, column.getName(), input.toString());
135                                                    } else if (column.getCategory() == ColumnDescription.HEAD) {
136                                                            if (syntaxGraph instanceof DependencyStructure) {
137                                                                    if (column.getCategory() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) {
138    //                                                              if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix
139                                                                    //if (!input.toString().equals(IGNORE_COLUMN_SIGN)) {
140                                                                            edge = ((DependencyStructure)syntaxGraph).addDependencyEdge(Integer.parseInt(input.toString()), terminalCounter);
141                                                                    }
142                                                            } 
143                                                            else {
144                                                                    close();
145                                                                    throw new DataFormatException("The input graph is not a dependency graph and therefore it is not possible to add dependncy edges. ");
146                                                            }
147                                                    } else if (column.getCategory() == ColumnDescription.DEPENDENCY_EDGE_LABEL && edge != null) {
148                                                            //if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix not working for everybody
149                                                                    syntaxGraph.addLabel(edge, column.getName(), input.toString());
150                                                            //} // bugfix
151                                                    }
152                                            }
153                                            input.setLength(0);
154                                            nNewLines = 0;
155                                            i++;
156                                    } else if (c == TAB) {
157                                            throw new MaltChainedException("The input file '"+fileName+"' contains a column where the value is an empty string. Please check your input file. ");
158                                    }
159                                    if (c == NEWLINE) {
160                                            nNewLines++;
161                                            i = 0;
162                                            columns = dataFormatInstance.iterator();
163                                    }
164                            } else {
165                                    input.append((char)c);
166                            }
167                            
168                            if (nNewLines == 2 && c == NEWLINE) {
169                                    if (syntaxGraph.hasTokens()) {
170                                            sentenceCount++;
171                                    }
172                                    return true;
173                            } else if (c == -1) {
174                                    if (syntaxGraph.hasTokens()) {
175                                            sentenceCount++;
176                                    }
177                                    if (cIterations < nIterations) {
178                                            cIterations++;
179                                            reopen();
180                                            return true;
181                                    }
182                                    
183                                    return false;                                   
184                            }
185                    }
186            }
187            
188            public void readEpilog() throws MaltChainedException {
189                    
190            }
191            
192            public BufferedReader getReader() {
193                    return reader;
194            }
195    
196            public void setReader(BufferedReader reader) throws MaltChainedException {
197                    close();
198                    this.reader = reader;
199            }
200            
201            public DataFormatInstance getDataFormatInstance() {
202                    return dataFormatInstance;
203            }
204    
205            public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
206                    this.dataFormatInstance = dataFormatInstance;
207            }
208    
209            public int getSentenceCount() throws MaltChainedException {
210                    return sentenceCount;
211            }
212            
213            public void setSentenceCount(int sentenceCount) {
214                    this.sentenceCount = sentenceCount;
215            }
216            
217            public String getOptions() {
218                    return null;
219            }
220            
221            public void setOptions(String optionString) throws MaltChainedException {
222                    
223            }
224            
225            public String getFileName() {
226                    return fileName;
227            }
228    
229            public void setFileName(String fileName) {
230                    this.fileName = fileName;
231            }
232    
233            public URL getUrl() {
234                    return url;
235            }
236    
237            public void setUrl(URL url) {
238                    this.url = url;
239            }
240    
241            public String getCharsetName() {
242                    return charsetName;
243            }
244    
245            public void setCharsetName(String charsetName) {
246                    this.charsetName = charsetName;
247            }
248    
249            public int getNIterations() {
250                    return nIterations;
251            }
252    
253            public void setNIterations(int iterations) {
254                    nIterations = iterations;
255            }
256    
257            public int getIterationCounter() {
258                    return cIterations;
259            }
260    
261            public void close() throws MaltChainedException {
262                    try {
263                            if (reader != null) {
264                                    if (closeStream) {
265                                            reader.close();
266                                    }
267                                    reader = null;
268                            }
269                    } catch (IOException e) {
270                            throw new DataFormatException("Error when closing the input file. ", e);
271                    } 
272            }
273            
274            public void clear() throws MaltChainedException {
275                    close();
276                    input.setLength(0);
277                    dataFormatInstance = null;
278                    sentenceCount = 0;
279            }
280    }