001    package org.maltparser.core.syntaxgraph.reader;
002    
003    import java.io.BufferedReader;
004    import java.io.FileInputStream;
005    import java.io.FileNotFoundException;
006    import java.io.IOException;
007    import java.io.InputStream;
008    import java.io.InputStreamReader;
009    import java.io.UnsupportedEncodingException;
010    import java.net.URL;
011    import java.util.Iterator;
012    import java.util.SortedMap;
013    import java.util.TreeMap;
014    import java.util.regex.PatternSyntaxException;
015    
016    import org.maltparser.core.exception.MaltChainedException;
017    import org.maltparser.core.io.dataformat.ColumnDescription;
018    import org.maltparser.core.io.dataformat.DataFormatException;
019    import org.maltparser.core.io.dataformat.DataFormatInstance;
020    import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
021    import org.maltparser.core.syntaxgraph.PhraseStructure;
022    import org.maltparser.core.syntaxgraph.TokenStructure;
023    import org.maltparser.core.syntaxgraph.edge.Edge;
024    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
025    
026    /**
027    *
028    *
029    * @author Johan Hall
030    */
031    public class NegraReader implements SyntaxGraphReader {
032            private enum NegraTables {
033                    ORIGIN, EDITOR, WORDTAG, MORPHTAG, NODETAG, EDGETAG, SECEDGETAG, SENTENCE, UNDEF
034            };
035            private BufferedReader reader;
036            private DataFormatInstance dataFormatInstance;
037            private int sentenceCount;
038            private String optionString;
039            private int formatVersion;
040            private NegraTables currentHeaderTable;
041            private int currentTerminalSize;
042            private int currentNonTerminalSize;
043            private SortedMap<Integer,PhraseStructureNode> nonterminals; 
044            private StringBuilder edgelabelSymbol;
045            private StringBuilder edgelabelTableName;
046            private int START_ID_OF_NONTERMINALS = 500;
047            private String fileName = null;
048            private URL url = null;
049            private String charsetName;
050            private int nIterations;
051            private int cIterations;
052            private boolean closeStream = true;
053            
054            public NegraReader() {
055                    currentHeaderTable = NegraTables.UNDEF;
056                    edgelabelSymbol = new StringBuilder();
057                    edgelabelTableName = new StringBuilder();
058                    nonterminals = new TreeMap<Integer,PhraseStructureNode>();
059                    nIterations = 1;
060                    cIterations = 1;
061            }
062            
063            private void reopen() throws MaltChainedException {
064                    close();
065                    if (fileName != null) {
066                            open(fileName, charsetName);
067                    } else if (url != null) {
068                            open(url, charsetName);
069                    } else {
070                            throw new DataFormatException("The input stream cannot be reopen. ");
071                    }
072            }
073            
074            public void open(String fileName, String charsetName) throws MaltChainedException {
075                    setFileName(fileName);
076                    setCharsetName(charsetName);
077                    try {
078                            open(new FileInputStream(fileName), charsetName);
079                    } catch (FileNotFoundException e) {
080                            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
081                    }
082            }
083            public void open(URL url, String charsetName) throws MaltChainedException {
084                    setUrl(url);
085                    setCharsetName(charsetName);
086                    try {
087                            open(url.openStream(), charsetName);
088                    } catch (IOException e) {
089                            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
090                    }
091            }
092            
093            public void open(InputStream is, String charsetName) throws MaltChainedException {
094                    try {
095                            if (is == System.in) {
096                                    closeStream = false;
097                            }
098                            open(new InputStreamReader(is, charsetName));
099                    } catch (UnsupportedEncodingException e) {
100                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
101                    }
102            }
103            
104            private void open(InputStreamReader isr) throws MaltChainedException {
105                    setReader(new BufferedReader(isr));
106                    setSentenceCount(0);
107            }
108            
109            public void readProlog() throws MaltChainedException {
110                    
111            }
112            
113            public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
114                    if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) {
115                            return false;
116                    }
117                    syntaxGraph.clear();
118                    final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
119                    PhraseStructureNode parent = null;
120                    PhraseStructureNode child = null;
121                    currentHeaderTable = NegraTables.UNDEF;
122                    String line = null;
123                    syntaxGraph.clear();
124                    syntaxGraph.getSymbolTables().cleanUp();
125                    nonterminals.clear();
126                    try {
127                            while (true) {
128                                    line = reader.readLine();
129                                    if (line == null) {
130                                            if (syntaxGraph.hasTokens()) {
131                                                    sentenceCount++;
132                                                    if (syntaxGraph instanceof MappablePhraseStructureGraph) {
133                                                            ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
134                                                    }
135                                            }
136                                            if (cIterations < nIterations) {
137                                                    cIterations++;
138                                                    reopen();
139                                                    return true;
140                                            }
141                                            return false;
142                                    } else if (line.startsWith("#EOS")) {
143                                            currentTerminalSize = 0;
144                                            currentNonTerminalSize = 0;
145                                            currentHeaderTable = NegraTables.UNDEF;
146                                            if (syntaxGraph instanceof MappablePhraseStructureGraph) {
147                                                    ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
148                                            }
149                                            return true;
150                                    } else if (line.startsWith("#BOS")) {
151                                            currentHeaderTable = NegraTables.SENTENCE;
152                                            int s = -1, e = -1;
153                                            for (int i = 5, n = line.length(); i < n; i++) {
154                                                    if (Character.isDigit(line.charAt(i)) && s == -1) {
155                                                            s = i;
156                                                    }
157                                                    if (line.charAt(i) == ' ') {
158                                                            e = i;
159                                                            break;
160                                                    }
161                                            }
162                                            if (s != e && s != -1 && e != -1) {
163                                                    phraseStructure.setSentenceID(Integer.parseInt(line.substring(s,e)));
164                                            }
165                                            sentenceCount++;
166                                    } else if (currentHeaderTable == NegraTables.SENTENCE) {
167                                            if (line.length() >= 2 && line.charAt(0) == '#' && Character.isDigit(line.charAt(1))) { // Non-terminal
168                                                    Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
169                                                    ColumnDescription column = null;
170                                                    currentNonTerminalSize++;
171                                                    char[] lineChars = line.toCharArray();
172                                                    int start = 0;
173                                                    int secedgecounter = 0;
174                                                    for (int i = 0, n = lineChars.length; i < n; i++) {
175                                                            if (lineChars[i] == '\t' && start == i) {
176                                                                    start++;
177                                                            } else if (lineChars[i] == '\t' || i == n - 1) {
178                                                                    if (columns.hasNext()) {
179                                                                            column = columns.next();
180                                                                    }
181                                                                    if (column.getPosition() == 0) {
182                                                                            int index = Integer.parseInt((i == n - 1)?line.substring(start+1):line.substring(start+1, i));
183                                                                            child = nonterminals.get(index);
184                                                                            if (child == null) {
185                                                                                    if (index != 0) {
186                                                                                            child = ((PhraseStructure)syntaxGraph).addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
187                                                                                    }
188                                                                                    nonterminals.put(index,child);
189                                                                            }
190                                                                    } else if (column.getPosition() == 2 && child != null) {
191                                                                            syntaxGraph.addLabel(child, "CAT", (i == n - 1)?line.substring(start):line.substring(start, i));
192                                                                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) { 
193                                                                            edgelabelSymbol.setLength(0);
194                                                                            edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
195                                                                            edgelabelTableName.setLength(0);
196                                                                            edgelabelTableName.append(column.getName());
197                                                                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
198                                                                            int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
199                                                                            parent = nonterminals.get(index);
200                                                                            if (parent == null) {
201                                                                                    if (index == 0) {
202                                                                                            parent = phraseStructure.getPhraseStructureRoot();      
203                                                                                    } else {
204                                                                                            parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
205                                                                                    }
206                                                                                    nonterminals.put(index,parent);
207                                                                            }
208                                                                            Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
209                                                                            syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
210                                                                    } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
211                                                                            if (secedgecounter % 2 == 0) {
212                                                                                    edgelabelSymbol.setLength(0);
213                                                                                    edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
214                                                                                    secedgecounter++;
215                                                                            } else {
216                                                                                    int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
217                                                                                    if (index == 0) {
218                                                                                            parent = phraseStructure.getPhraseStructureRoot();
219                                                                                    } else if (index < START_ID_OF_NONTERMINALS) {
220                                                                                            parent = phraseStructure.getTokenNode(index);
221                                                                                    } else {
222                                                                                            parent = nonterminals.get(index);
223                                                                                            if (parent == null) {
224                                                                                                    parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
225                                                                                                    nonterminals.put(index,parent);
226                                                                                            }
227                                                                                    }
228                                                                                    Edge e = phraseStructure.addSecondaryEdge(parent, child);
229                                                                                    e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString());
230                                                                                    secedgecounter++;
231                                                                            }
232                                                                    }
233                                                                    start = i + 1;
234                                                            }
235                                                    }
236                                            } else { // Terminal
237                                                    Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
238                                                    ColumnDescription column = null;
239                                                    
240                                                    currentTerminalSize++;
241                                                    child = syntaxGraph.addTokenNode(currentTerminalSize);
242                                                    char[] lineChars = line.toCharArray();
243                                                    int start = 0;
244                                                    int secedgecounter = 0;
245                                                    for (int i = 0, n = lineChars.length; i < n; i++) {
246                                                            if (lineChars[i] == '\t' && start == i) {
247                                                                    start++;
248                                                            } else if (lineChars[i] == '\t' || i == n - 1) {
249                                                                    if (columns.hasNext()) {
250                                                                            column = columns.next();
251                                                                    }
252                                                                    if (column.getCategory() == ColumnDescription.INPUT && child != null) {
253                                                                            syntaxGraph.addLabel(child, column.getName(), (i == n - 1)?line.substring(start):line.substring(start, i));
254                                                                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL && child != null) { // && column.getName().equals("EDGELABEL")) {
255                                                                            edgelabelSymbol.setLength(0);
256                                                                            edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
257                                                                            edgelabelTableName.setLength(0);
258                                                                            edgelabelTableName.append(column.getName());
259                                                                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
260                                                                            int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
261                                                                            parent = nonterminals.get(index);
262                                                                            if (parent == null) {
263                                                                                    if (index == 0) {
264                                                                                            parent = phraseStructure.getPhraseStructureRoot();      
265                                                                                    } else {
266                                                                                            parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
267                                                                                    }
268                                                                                    nonterminals.put(index,parent);
269                                                                            }
270    
271                                                                            Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
272                                                                            syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
273                                                                    } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
274                                                                            if (secedgecounter % 2 == 0) {
275                                                                                    edgelabelSymbol.setLength(0);
276                                                                                    edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
277                                                                                    secedgecounter++;
278                                                                            } else {
279                                                                                    int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
280                                                                                    if (index == 0) {
281                                                                                            parent = phraseStructure.getPhraseStructureRoot();
282                                                                                    } else if (index < START_ID_OF_NONTERMINALS) {
283                                                                                            parent = phraseStructure.getTokenNode(index);
284                                                                                    } else {
285                                                                                            parent = nonterminals.get(index);
286                                                                                            if (parent == null) {
287                                                                                                    parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
288                                                                                                    nonterminals.put(index,parent);
289                                                                                            }
290                                                                                    }
291                                                                                    Edge e = phraseStructure.addSecondaryEdge(parent, child);
292                                                                                    e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString());
293                                                                                    secedgecounter++;
294                                                                            }
295                                                                    }
296                                                                    start = i + 1;
297                                                            }
298                                                    }
299                                            }
300                                    } else if (line.startsWith("%%")) { // comment skip
301                                    
302                                    } else if (line.startsWith("#FORMAT")) {
303    //                              int index = line.indexOf(' ');
304    //                              if (index > -1) {
305    //                                      try {
306    //                                              formatVersion = Integer.parseInt(line.substring(index+1));
307    //                                      } catch (NumberFormatException e) {
308    //                                              
309    //                                      }
310    //                              }
311                                    } else if (line.startsWith("#BOT")) {
312    //                              int index = line.indexOf(' ');
313    //                              if (index > -1) {
314    //                                      if (line.substring(index+1).equals("ORIGIN")) {
315    //                                              currentHeaderTable = NegraTables.ORIGIN;
316    //                                      } else if (line.substring(index+1).equals("EDITOR")) {
317    //                                              currentHeaderTable = NegraTables.EDITOR;
318    //                                      } else if (line.substring(index+1).equals("WORDTAG")) {
319    //                                              currentHeaderTable = NegraTables.WORDTAG;
320    //                                      } else if (line.substring(index+1).equals("MORPHTAG")) {
321    //                                              currentHeaderTable = NegraTables.MORPHTAG;
322    //                                      } else if (line.substring(index+1).equals("NODETAG")) {
323    //                                              currentHeaderTable = NegraTables.NODETAG;
324    //                                      } else if (line.substring(index+1).equals("EDGETAG")) {
325    //                                              currentHeaderTable = NegraTables.EDGETAG;
326    //                                      } else if (line.substring(index+1).equals("SECEDGETAG")) {
327    //                                              currentHeaderTable = NegraTables.SECEDGETAG;
328    //                                      } else {
329    //                                              currentHeaderTable = NegraTables.UNDEF;
330    //                                      }
331    //                              }
332                                    } else if (line.startsWith("#EOT")) {
333                                            currentHeaderTable = NegraTables.UNDEF;
334                                    }
335                            }
336                    }  catch (IOException e) {
337                            throw new DataFormatException("Error when reading from the input file. ", e);
338                    }
339            }
340            
341            public void readEpilog() throws MaltChainedException {
342                    
343            }
344            
345            public BufferedReader getReader() {
346                    return reader;
347            }
348    
349            public void setReader(BufferedReader reader) {
350                    this.reader = reader;
351            }
352    
353            public int getSentenceCount() {
354                    return sentenceCount;
355            }
356    
357            public void setSentenceCount(int sentenceCount) {
358                    this.sentenceCount = sentenceCount;
359            }
360            
361            public int getFormatVersion() {
362                    return formatVersion;
363            }
364    
365            public void setFormatVersion(int formatVersion) {
366                    this.formatVersion = formatVersion;
367            }
368    
369            public DataFormatInstance getDataFormatInstance() {
370                    return dataFormatInstance;
371            }
372            
373            public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
374                    this.dataFormatInstance = inputDataFormatInstance;
375            }
376            
377            public String getOptions() {
378                    return optionString;
379            }
380            
381            public void setOptions(String optionString) throws MaltChainedException {
382                    this.optionString = optionString;
383    
384                    String[] argv;
385                    try {
386                            argv = optionString.split("[_\\p{Blank}]");
387                    } catch (PatternSyntaxException e) {
388                            throw new DataFormatException("Could not split the penn writer option '"+optionString+"'. ", e);
389                    }
390                    for (int i=0; i < argv.length-1; i++) {
391                            if(argv[i].charAt(0) != '-') {
392                                    throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
393                            }
394                            if(++i>=argv.length) {
395                                    throw new DataFormatException("The last argument does not have any value. ");
396                            }
397                            switch(argv[i-1].charAt(1)) {
398                            case 's': 
399                                    try {
400                                            START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
401                                    } catch (NumberFormatException e){
402                                            throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
403                                    }
404                                    break;
405                            default:
406                                    throw new DataFormatException("Unknown NegraReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");             
407                            }
408                    }
409            }
410            
411            public String getFileName() {
412                    return fileName;
413            }
414    
415            public void setFileName(String fileName) {
416                    this.fileName = fileName;
417            }
418    
419            public URL getUrl() {
420                    return url;
421            }
422    
423            public void setUrl(URL url) {
424                    this.url = url;
425            }
426    
427            public String getCharsetName() {
428                    return charsetName;
429            }
430    
431            public void setCharsetName(String charsetName) {
432                    this.charsetName = charsetName;
433            }
434    
435            public int getNIterations() {
436                    return nIterations;
437            }
438    
439            public void setNIterations(int iterations) {
440                    nIterations = iterations;
441            }
442    
443            public int getIterationCounter() {
444                    return cIterations;
445            }
446            
447            public void close() throws MaltChainedException {
448                    try {
449                            if (reader != null) {
450                                    if (closeStream) {
451                                            reader.close();
452                                    }
453                                    reader = null;
454                            }
455                    }   catch (IOException e) {
456                            throw new DataFormatException("Error when closing the input file.", e);
457                    } 
458            }
459    }