001package org.maltparser.core.syntaxgraph.reader;
002
003import java.io.BufferedReader;
004import java.io.FileInputStream;
005import java.io.FileNotFoundException;
006import java.io.IOException;
007import java.io.InputStream;
008import java.io.InputStreamReader;
009import java.io.UnsupportedEncodingException;
010import java.net.URL;
011import java.util.Iterator;
012import java.util.SortedMap;
013import java.util.TreeMap;
014import java.util.regex.PatternSyntaxException;
015
016import org.maltparser.core.exception.MaltChainedException;
017import org.maltparser.core.io.dataformat.ColumnDescription;
018import org.maltparser.core.io.dataformat.DataFormatException;
019import org.maltparser.core.io.dataformat.DataFormatInstance;
020import org.maltparser.core.symbol.SymbolTableHandler;
021import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
022import org.maltparser.core.syntaxgraph.PhraseStructure;
023import org.maltparser.core.syntaxgraph.TokenStructure;
024import org.maltparser.core.syntaxgraph.edge.Edge;
025import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
026
027/**
028*
029*
030* @author Johan Hall
031*/
032public class NegraReader implements SyntaxGraphReader {
033        private enum NegraTables {
034                ORIGIN, EDITOR, WORDTAG, MORPHTAG, NODETAG, EDGETAG, SECEDGETAG, SENTENCE, UNDEF
035        };
036        private BufferedReader reader;
037        private DataFormatInstance dataFormatInstance;
038        private int sentenceCount;
039        private String optionString;
040        private int formatVersion;
041        private NegraTables currentHeaderTable;
042        private int currentTerminalSize;
043        private int currentNonTerminalSize;
044        private SortedMap<Integer,PhraseStructureNode> nonterminals; 
045        private StringBuilder edgelabelSymbol;
046        private StringBuilder edgelabelTableName;
047        private int START_ID_OF_NONTERMINALS = 500;
048        private String fileName = null;
049        private URL url = null;
050        private String charsetName;
051        private int nIterations;
052        private int cIterations;
053        private boolean closeStream = true;
054        
055        public NegraReader() {
056                currentHeaderTable = NegraTables.UNDEF;
057                edgelabelSymbol = new StringBuilder();
058                edgelabelTableName = new StringBuilder();
059                nonterminals = new TreeMap<Integer,PhraseStructureNode>();
060                nIterations = 1;
061                cIterations = 1;
062        }
063        
064        private void reopen() throws MaltChainedException {
065                close();
066                if (fileName != null) {
067                        open(fileName, charsetName);
068                } else if (url != null) {
069                        open(url, charsetName);
070                } else {
071                        throw new DataFormatException("The input stream cannot be reopen. ");
072                }
073        }
074        
075        public void open(String fileName, String charsetName) throws MaltChainedException {
076                setFileName(fileName);
077                setCharsetName(charsetName);
078                try {
079                        open(new FileInputStream(fileName), charsetName);
080                } catch (FileNotFoundException e) {
081                        throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
082                }
083        }
084        public void open(URL url, String charsetName) throws MaltChainedException {
085                setUrl(url);
086                setCharsetName(charsetName);
087                try {
088                        open(url.openStream(), charsetName);
089                } catch (IOException e) {
090                        throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
091                }
092        }
093        
094        public void open(InputStream is, String charsetName) throws MaltChainedException {
095                try {
096                        if (is == System.in) {
097                                closeStream = false;
098                        }
099                        open(new InputStreamReader(is, charsetName));
100                } catch (UnsupportedEncodingException e) {
101                        throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
102                }
103        }
104        
105        private void open(InputStreamReader isr) throws MaltChainedException {
106                setReader(new BufferedReader(isr));
107                setSentenceCount(0);
108        }
109        
110        public void readProlog() throws MaltChainedException {
111                
112        }
113        
114        public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
115                if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) {
116                        return false;
117                }
118                syntaxGraph.clear();
119                final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
120                final SymbolTableHandler symbolTables = phraseStructure.getSymbolTables();
121                PhraseStructureNode parent = null;
122                PhraseStructureNode child = null;
123                currentHeaderTable = NegraTables.UNDEF;
124                String line = null;
125                syntaxGraph.clear();
126                nonterminals.clear();
127                try {
128                        while (true) {
129                                line = reader.readLine();
130                                if (line == null) {
131                                        if (syntaxGraph.hasTokens()) {
132                                                sentenceCount++;
133                                                if (syntaxGraph instanceof MappablePhraseStructureGraph) {
134                                                        ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
135                                                }
136                                        }
137                                        if (cIterations < nIterations) {
138                                                cIterations++;
139                                                reopen();
140                                                return true;
141                                        }
142                                        return false;
143                                } else if (line.startsWith("#EOS")) {
144                                        currentTerminalSize = 0;
145                                        currentNonTerminalSize = 0;
146                                        currentHeaderTable = NegraTables.UNDEF;
147                                        if (syntaxGraph instanceof MappablePhraseStructureGraph) {
148                                                ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
149                                        }
150                                        return true;
151                                } else if (line.startsWith("#BOS")) {
152                                        currentHeaderTable = NegraTables.SENTENCE;
153                                        int s = -1, e = -1;
154                                        for (int i = 5, n = line.length(); i < n; i++) {
155                                                if (Character.isDigit(line.charAt(i)) && s == -1) {
156                                                        s = i;
157                                                }
158                                                if (line.charAt(i) == ' ') {
159                                                        e = i;
160                                                        break;
161                                                }
162                                        }
163                                        if (s != e && s != -1 && e != -1) {
164                                                phraseStructure.setSentenceID(Integer.parseInt(line.substring(s,e)));
165                                        }
166                                        sentenceCount++;
167                                } else if (currentHeaderTable == NegraTables.SENTENCE) {
168                                        if (line.length() >= 2 && line.charAt(0) == '#' && Character.isDigit(line.charAt(1))) { // Non-terminal
169                                                Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
170                                                ColumnDescription column = null;
171                                                currentNonTerminalSize++;
172                                                char[] lineChars = line.toCharArray();
173                                                int start = 0;
174                                                int secedgecounter = 0;
175                                                for (int i = 0, n = lineChars.length; i < n; i++) {
176                                                        if (lineChars[i] == '\t' && start == i) {
177                                                                start++;
178                                                        } else if (lineChars[i] == '\t' || i == n - 1) {
179                                                                if (columns.hasNext()) {
180                                                                        column = columns.next();
181                                                                }
182                                                                if (column.getPosition() == 0) {
183                                                                        int index = Integer.parseInt((i == n - 1)?line.substring(start+1):line.substring(start+1, i));
184                                                                        child = nonterminals.get(index);
185                                                                        if (child == null) {
186                                                                                if (index != 0) {
187                                                                                        child = ((PhraseStructure)syntaxGraph).addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
188                                                                                }
189                                                                                nonterminals.put(index,child);
190                                                                        }
191                                                                } else if (column.getPosition() == 2 && child != null) {
192                                                                        syntaxGraph.addLabel(child, "CAT", (i == n - 1)?line.substring(start):line.substring(start, i));
193                                                                } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) { 
194                                                                        edgelabelSymbol.setLength(0);
195                                                                        edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
196                                                                        edgelabelTableName.setLength(0);
197                                                                        edgelabelTableName.append(column.getName());
198                                                                } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
199                                                                        int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
200                                                                        parent = nonterminals.get(index);
201                                                                        if (parent == null) {
202                                                                                if (index == 0) {
203                                                                                        parent = phraseStructure.getPhraseStructureRoot();      
204                                                                                } else {
205                                                                                        parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
206                                                                                }
207                                                                                nonterminals.put(index,parent);
208                                                                        }
209                                                                        Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
210                                                                        syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
211                                                                } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
212                                                                        if (secedgecounter % 2 == 0) {
213                                                                                edgelabelSymbol.setLength(0);
214                                                                                edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
215                                                                                secedgecounter++;
216                                                                        } else {
217                                                                                int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
218                                                                                if (index == 0) {
219                                                                                        parent = phraseStructure.getPhraseStructureRoot();
220                                                                                } else if (index < START_ID_OF_NONTERMINALS) {
221                                                                                        parent = phraseStructure.getTokenNode(index);
222                                                                                } else {
223                                                                                        parent = nonterminals.get(index);
224                                                                                        if (parent == null) {
225                                                                                                parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
226                                                                                                nonterminals.put(index,parent);
227                                                                                        }
228                                                                                }
229                                                                                Edge e = phraseStructure.addSecondaryEdge(parent, child);
230                                                                                e.addLabel(symbolTables.getSymbolTable(column.getName()), edgelabelSymbol.toString());
231                                                                                secedgecounter++;
232                                                                        }
233                                                                }
234                                                                start = i + 1;
235                                                        }
236                                                }
237                                        } else { // Terminal
238                                                Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
239                                                ColumnDescription column = null;
240                                                
241                                                currentTerminalSize++;
242                                                child = syntaxGraph.addTokenNode(currentTerminalSize);
243                                                char[] lineChars = line.toCharArray();
244                                                int start = 0;
245                                                int secedgecounter = 0;
246                                                for (int i = 0, n = lineChars.length; i < n; i++) {
247                                                        if (lineChars[i] == '\t' && start == i) {
248                                                                start++;
249                                                        } else if (lineChars[i] == '\t' || i == n - 1) {
250                                                                if (columns.hasNext()) {
251                                                                        column = columns.next();
252                                                                }
253                                                                if (column.getCategory() == ColumnDescription.INPUT && child != null) {
254                                                                        syntaxGraph.addLabel(child, column.getName(), (i == n - 1)?line.substring(start):line.substring(start, i));
255                                                                } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL && child != null) { // && column.getName().equals("EDGELABEL")) {
256                                                                        edgelabelSymbol.setLength(0);
257                                                                        edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
258                                                                        edgelabelTableName.setLength(0);
259                                                                        edgelabelTableName.append(column.getName());
260                                                                } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
261                                                                        int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
262                                                                        parent = nonterminals.get(index);
263                                                                        if (parent == null) {
264                                                                                if (index == 0) {
265                                                                                        parent = phraseStructure.getPhraseStructureRoot();      
266                                                                                } else {
267                                                                                        parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
268                                                                                }
269                                                                                nonterminals.put(index,parent);
270                                                                        }
271
272                                                                        Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
273                                                                        syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
274                                                                } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
275                                                                        if (secedgecounter % 2 == 0) {
276                                                                                edgelabelSymbol.setLength(0);
277                                                                                edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
278                                                                                secedgecounter++;
279                                                                        } else {
280                                                                                int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
281                                                                                if (index == 0) {
282                                                                                        parent = phraseStructure.getPhraseStructureRoot();
283                                                                                } else if (index < START_ID_OF_NONTERMINALS) {
284                                                                                        parent = phraseStructure.getTokenNode(index);
285                                                                                } else {
286                                                                                        parent = nonterminals.get(index);
287                                                                                        if (parent == null) {
288                                                                                                parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
289                                                                                                nonterminals.put(index,parent);
290                                                                                        }
291                                                                                }
292                                                                                Edge e = phraseStructure.addSecondaryEdge(parent, child);
293                                                                                e.addLabel(symbolTables.getSymbolTable(column.getName()), edgelabelSymbol.toString());
294                                                                                secedgecounter++;
295                                                                        }
296                                                                }
297                                                                start = i + 1;
298                                                        }
299                                                }
300                                        }
301                                } else if (line.startsWith("%%")) { // comment skip
302                                
303                                } else if (line.startsWith("#FORMAT")) {
304//                              int index = line.indexOf(' ');
305//                              if (index > -1) {
306//                                      try {
307//                                              formatVersion = Integer.parseInt(line.substring(index+1));
308//                                      } catch (NumberFormatException e) {
309//                                              
310//                                      }
311//                              }
312                                } else if (line.startsWith("#BOT")) {
313//                              int index = line.indexOf(' ');
314//                              if (index > -1) {
315//                                      if (line.substring(index+1).equals("ORIGIN")) {
316//                                              currentHeaderTable = NegraTables.ORIGIN;
317//                                      } else if (line.substring(index+1).equals("EDITOR")) {
318//                                              currentHeaderTable = NegraTables.EDITOR;
319//                                      } else if (line.substring(index+1).equals("WORDTAG")) {
320//                                              currentHeaderTable = NegraTables.WORDTAG;
321//                                      } else if (line.substring(index+1).equals("MORPHTAG")) {
322//                                              currentHeaderTable = NegraTables.MORPHTAG;
323//                                      } else if (line.substring(index+1).equals("NODETAG")) {
324//                                              currentHeaderTable = NegraTables.NODETAG;
325//                                      } else if (line.substring(index+1).equals("EDGETAG")) {
326//                                              currentHeaderTable = NegraTables.EDGETAG;
327//                                      } else if (line.substring(index+1).equals("SECEDGETAG")) {
328//                                              currentHeaderTable = NegraTables.SECEDGETAG;
329//                                      } else {
330//                                              currentHeaderTable = NegraTables.UNDEF;
331//                                      }
332//                              }
333                                } else if (line.startsWith("#EOT")) {
334                                        currentHeaderTable = NegraTables.UNDEF;
335                                }
336                        }
337                }  catch (IOException e) {
338                        throw new DataFormatException("Error when reading from the input file. ", e);
339                }
340        }
341        
342        public void readEpilog() throws MaltChainedException {
343                
344        }
345        
346        public BufferedReader getReader() {
347                return reader;
348        }
349
350        public void setReader(BufferedReader reader) {
351                this.reader = reader;
352        }
353
354        public int getSentenceCount() {
355                return sentenceCount;
356        }
357
358        public void setSentenceCount(int sentenceCount) {
359                this.sentenceCount = sentenceCount;
360        }
361        
362        public int getFormatVersion() {
363                return formatVersion;
364        }
365
366        public void setFormatVersion(int formatVersion) {
367                this.formatVersion = formatVersion;
368        }
369
370        public DataFormatInstance getDataFormatInstance() {
371                return dataFormatInstance;
372        }
373        
374        public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
375                this.dataFormatInstance = inputDataFormatInstance;
376        }
377        
378        public String getOptions() {
379                return optionString;
380        }
381        
382        public void setOptions(String optionString) throws MaltChainedException {
383                this.optionString = optionString;
384
385                String[] argv;
386                try {
387                        argv = optionString.split("[_\\p{Blank}]");
388                } catch (PatternSyntaxException e) {
389                        throw new DataFormatException("Could not split the penn writer option '"+optionString+"'. ", e);
390                }
391                for (int i=0; i < argv.length-1; i++) {
392                        if(argv[i].charAt(0) != '-') {
393                                throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
394                        }
395                        if(++i>=argv.length) {
396                                throw new DataFormatException("The last argument does not have any value. ");
397                        }
398                        switch(argv[i-1].charAt(1)) {
399                        case 's': 
400                                try {
401                                        START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
402                                } catch (NumberFormatException e){
403                                        throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
404                                }
405                                break;
406                        default:
407                                throw new DataFormatException("Unknown NegraReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");             
408                        }
409                }
410        }
411        
412        public String getFileName() {
413                return fileName;
414        }
415
416        public void setFileName(String fileName) {
417                this.fileName = fileName;
418        }
419
420        public URL getUrl() {
421                return url;
422        }
423
424        public void setUrl(URL url) {
425                this.url = url;
426        }
427
428        public String getCharsetName() {
429                return charsetName;
430        }
431
432        public void setCharsetName(String charsetName) {
433                this.charsetName = charsetName;
434        }
435
436        public int getNIterations() {
437                return nIterations;
438        }
439
440        public void setNIterations(int iterations) {
441                nIterations = iterations;
442        }
443
444        public int getIterationCounter() {
445                return cIterations;
446        }
447        
448        public void close() throws MaltChainedException {
449                try {
450                        if (reader != null) {
451                                if (closeStream) {
452                                        reader.close();
453                                }
454                                reader = null;
455                        }
456                }   catch (IOException e) {
457                        throw new DataFormatException("Error when closing the input file.", e);
458                } 
459        }
460}