001package org.maltparser.core.syntaxgraph.reader;
002
003import java.io.BufferedReader;
004import java.io.FileInputStream;
005import java.io.FileNotFoundException;
006import java.io.IOException;
007import java.io.InputStream;
008import java.io.InputStreamReader;
009import java.io.UnsupportedEncodingException;
010import java.net.URL;
011import java.util.ArrayList;
012import java.util.Iterator;
013
014import org.maltparser.core.exception.MaltChainedException;
015import org.maltparser.core.io.dataformat.ColumnDescription;
016import org.maltparser.core.io.dataformat.DataFormatException;
017import org.maltparser.core.io.dataformat.DataFormatInstance;
018import org.maltparser.core.syntaxgraph.DependencyStructure;
019import org.maltparser.core.syntaxgraph.Element;
020import org.maltparser.core.syntaxgraph.TokenStructure;
021import org.maltparser.core.syntaxgraph.edge.Edge;
022/**
023*
024*
025* @author Johan Hall
026*/
027public class TabReader implements SyntaxGraphReader {
028        private BufferedReader reader;
029        private int sentenceCount;
030//      private final StringBuilder input;
031        private DataFormatInstance dataFormatInstance;
032        private static final String IGNORE_COLUMN_SIGN = "_";
033//      private static final char TAB = '\t';
034//      private static final char NEWLINE = '\n';
035//      private static final char CARRIAGE_RETURN = '\r';
036        private String fileName = null;
037        private URL url = null;
038        private String charsetName;
039        private int nIterations;
040        private int cIterations;
041        private boolean closeStream = true;
042
043        public TabReader() {
044//              input = new StringBuilder();
045                nIterations = 1;
046                cIterations = 1;
047        }
048
049//      private void reopen() throws MaltChainedException {
050//              close();
051//              if (fileName != null) {
052//                      open(fileName, charsetName);
053//              } else if (url != null) {
054//                      open(url, charsetName);
055//              } else {
056//                      throw new DataFormatException("The input stream cannot be reopen. ");
057//              }
058//      }
059
060        public void open(String fileName, String charsetName) throws MaltChainedException {
061                setFileName(fileName);
062                setCharsetName(charsetName);
063                try {
064                        open(new FileInputStream(fileName), charsetName);
065                } catch (FileNotFoundException e) {
066                        throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
067                }
068        }
069
070        public void open(URL url, String charsetName) throws MaltChainedException {
071                setUrl(url);
072                setCharsetName(charsetName);
073                if (url == null) {
074                        throw new DataFormatException("The input file cannot be found. ");
075                }
076                try {
077                        open(url.openStream(), charsetName);
078                } catch (IOException e) {
079                        throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
080                }
081        }
082
083        public void open(InputStream is, String charsetName) throws MaltChainedException {
084                try {
085                        if (is == System.in) {
086                                closeStream = false;
087                        }
088                        open(new InputStreamReader(is, charsetName));
089                } catch (UnsupportedEncodingException e) {
090                        throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
091                }
092        }
093
094        private void open(InputStreamReader isr) throws MaltChainedException {
095                setReader(new BufferedReader(isr));
096                setSentenceCount(0);
097        }
098
099        public void readProlog() throws MaltChainedException {
100
101        }
102
103        public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
104                if (syntaxGraph == null || dataFormatInstance == null) {
105                        return false;
106                }
107                syntaxGraph.clear();
108                syntaxGraph.getSymbolTables().cleanUp();
109                Element node = null;
110                Edge edge = null;
111
112
113        ArrayList<String> tokens = new ArrayList<String>();
114        try {
115                String line;
116                        while ((line = reader.readLine()) != null) {
117                                if (line.trim().length() == 0) {
118                                        break;
119                                } else {
120                                        tokens.add(line.trim());
121                                }
122                        }
123                } catch (IOException e) {
124                        close();
125                        throw new DataFormatException("Error when reading from the input file. ", e);
126                }
127
128        int terminalCounter = 0;
129                for (int i = 0; i < tokens.size(); i++) {
130                        String token = tokens.get(i);
131
132                        if (token.charAt(0) == '#') {
133                                syntaxGraph.addComment(token, terminalCounter+1);
134                                continue;
135                        }
136                        String[] columns = token.split("\t");
137                        if (columns[0].contains("-") || columns[0].contains(".")) {
138                                syntaxGraph.addComment(token, terminalCounter+1);
139                                continue;
140                        }
141                        terminalCounter++;
142                        node = syntaxGraph.addTokenNode(terminalCounter);
143
144                        Iterator<ColumnDescription> columnDescriptions = dataFormatInstance.iterator();
145                        for (int j = 0; j < columns.length; j++) {
146                                ColumnDescription columnDescription = columnDescriptions.next();
147
148                                if (columnDescription.getCategory() == ColumnDescription.INPUT && node != null) {
149                                        syntaxGraph.addLabel(node, columnDescription.getName(), columns[j]);
150                                } else if (columnDescription.getCategory() == ColumnDescription.HEAD) {
151                                        if (syntaxGraph instanceof DependencyStructure) {
152                                                if (columnDescription.getCategory() != ColumnDescription.IGNORE && !columns[j].equals(IGNORE_COLUMN_SIGN)) {
153                                                        edge = ((DependencyStructure)syntaxGraph).addDependencyEdge(Integer.parseInt(columns[j]), terminalCounter);
154                                                }
155                                        }
156                                        else {
157                                                close();
158                                                throw new DataFormatException("The input graph is not a dependency graph and therefore it is not possible to add dependncy edges. ");
159                                        }
160                                } else if (columnDescription.getCategory() == ColumnDescription.DEPENDENCY_EDGE_LABEL && edge != null) {
161                                        syntaxGraph.addLabel(edge, columnDescription.getName(), columns[j]);
162                                }
163                        }
164                }
165
166                if (!syntaxGraph.hasTokens()) {
167                        return false;
168                }
169                sentenceCount++;
170                return true;
171        }
172
173//      public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
174//              if (syntaxGraph == null || dataFormatInstance == null) {
175//                      return false;
176//              }
177//
178//              Element node = null;
179//              Edge edge = null;
180//              input.setLength(0);
181//              int i = 0;
182//              int terminalCounter = 0;
183//              int nNewLines = 0;
184//              syntaxGraph.clear();
185//              syntaxGraph.getSymbolTables().cleanUp();
186//              Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
187//              while (true) {
188//                      int c;
189//
190//                      try {
191//                              c = reader.read();
192//                      } catch (IOException e) {
193//                              close();
194//                              throw new DataFormatException("Error when reading from the input file. ", e);
195//                      }
196//                      if (c == TAB || c == NEWLINE || c == CARRIAGE_RETURN || c == -1) {
197//                              if (input.length() != 0) {
198//                                      if (i == 0) {
199//                                              terminalCounter++;
200//                                              node = syntaxGraph.addTokenNode(terminalCounter);
201//                                      }
202//                                      if (columns.hasNext()) {
203//                                              ColumnDescription column = columns.next();
204//                                              if (column.getCategory() == ColumnDescription.INPUT && node != null) {
205//                                                      syntaxGraph.addLabel(node, column.getName(), input.toString());
206//                                              } else if (column.getCategory() == ColumnDescription.HEAD) {
207//                                                      if (syntaxGraph instanceof DependencyStructure) {
208//                                                              if (column.getCategory() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) {
209////                                                            if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix
210//                                                              //if (!input.toString().equals(IGNORE_COLUMN_SIGN)) {
211//                                                                      edge = ((DependencyStructure)syntaxGraph).addDependencyEdge(Integer.parseInt(input.toString()), terminalCounter);
212//                                                              }
213//                                                      }
214//                                                      else {
215//                                                              close();
216//                                                              throw new DataFormatException("The input graph is not a dependency graph and therefore it is not possible to add dependncy edges. ");
217//                                                      }
218//                                              } else if (column.getCategory() == ColumnDescription.DEPENDENCY_EDGE_LABEL && edge != null) {
219//                                                      //if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix not working for everybody
220//                                                              syntaxGraph.addLabel(edge, column.getName(), input.toString());
221//                                                      //} // bugfix
222//                                              }
223//                                      }
224//                                      input.setLength(0);
225//                                      nNewLines = 0;
226//                                      i++;
227//                              } else if (c == TAB) {
228//                                      throw new MaltChainedException("The input file '"+fileName+"' contains a column where the value is an empty string. Please check your input file. ");
229//                              }
230//                              if (c == NEWLINE) {
231//                                      nNewLines++;
232//                                      i = 0;
233//                                      columns = dataFormatInstance.iterator();
234//                              }
235//                      } else {
236//                              input.append((char)c);
237//                      }
238//
239//                      if (nNewLines == 2 && c == NEWLINE) {
240//                              if (syntaxGraph.hasTokens()) {
241//                                      sentenceCount++;
242//                              }
243//                              return true;
244//                      } else if (c == -1) {
245//                              if (syntaxGraph.hasTokens()) {
246//                                      sentenceCount++;
247//                              }
248//                              if (cIterations < nIterations) {
249//                                      cIterations++;
250//                                      reopen();
251//                                      return true;
252//                              }
253//
254//                              return false;
255//                      }
256//              }
257//      }
258
259        public void readEpilog() throws MaltChainedException {
260
261        }
262
263        public BufferedReader getReader() {
264                return reader;
265        }
266
267        public void setReader(BufferedReader reader) throws MaltChainedException {
268                close();
269                this.reader = reader;
270        }
271
272        public DataFormatInstance getDataFormatInstance() {
273                return dataFormatInstance;
274        }
275
276        public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
277                this.dataFormatInstance = dataFormatInstance;
278        }
279
280        public int getSentenceCount() throws MaltChainedException {
281                return sentenceCount;
282        }
283
284        public void setSentenceCount(int sentenceCount) {
285                this.sentenceCount = sentenceCount;
286        }
287
288        public String getOptions() {
289                return null;
290        }
291
292        public void setOptions(String optionString) throws MaltChainedException {
293
294        }
295
296        public String getFileName() {
297                return fileName;
298        }
299
300        public void setFileName(String fileName) {
301                this.fileName = fileName;
302        }
303
304        public URL getUrl() {
305                return url;
306        }
307
308        public void setUrl(URL url) {
309                this.url = url;
310        }
311
312        public String getCharsetName() {
313                return charsetName;
314        }
315
316        public void setCharsetName(String charsetName) {
317                this.charsetName = charsetName;
318        }
319
320        public int getNIterations() {
321                return nIterations;
322        }
323
324        public void setNIterations(int iterations) {
325                nIterations = iterations;
326        }
327
328        public int getIterationCounter() {
329                return cIterations;
330        }
331
332        public void close() throws MaltChainedException {
333                try {
334                        if (reader != null) {
335                                if (closeStream) {
336                                        reader.close();
337                                }
338                                reader = null;
339                        }
340                } catch (IOException e) {
341                        throw new DataFormatException("Error when closing the input file. ", e);
342                }
343        }
344
345        public void clear() throws MaltChainedException {
346                close();
347//              input.setLength(0);
348                dataFormatInstance = null;
349                sentenceCount = 0;
350        }
351}