001    package org.maltparser.core.syntaxgraph.reader;
002    
003    import java.io.BufferedReader;
004    import java.io.FileInputStream;
005    import java.io.FileNotFoundException;
006    import java.io.IOException;
007    import java.io.InputStream;
008    import java.io.InputStreamReader;
009    import java.io.UnsupportedEncodingException;
010    import java.net.URL;
011    import java.util.Iterator;
012    import java.util.SortedMap;
013    import java.util.TreeMap;
014    import java.util.regex.PatternSyntaxException;
015    
016    import org.maltparser.core.exception.MaltChainedException;
017    import org.maltparser.core.io.dataformat.ColumnDescription;
018    import org.maltparser.core.io.dataformat.DataFormatException;
019    import org.maltparser.core.io.dataformat.DataFormatInstance;
020    import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
021    import org.maltparser.core.syntaxgraph.PhraseStructure;
022    import org.maltparser.core.syntaxgraph.TokenStructure;
023    import org.maltparser.core.syntaxgraph.edge.Edge;
024    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
025    
026    /**
027    *
028    *
029    * @author Johan Hall
030    */
031    public class NegraReader implements SyntaxGraphReader {
032            private enum NegraTables {
033                    ORIGIN, EDITOR, WORDTAG, MORPHTAG, NODETAG, EDGETAG, SECEDGETAG, SENTENCE, UNDEF
034            };
035            private BufferedReader reader;
036            private DataFormatInstance dataFormatInstance;
037            private int sentenceCount;
038            private String optionString;
039            private int formatVersion;
040            private NegraTables currentHeaderTable;
041            private int currentTerminalSize;
042            private int currentNonTerminalSize;
043            private SortedMap<Integer,PhraseStructureNode> nonterminals; 
044            private StringBuilder edgelabelSymbol;
045            private StringBuilder edgelabelTableName;
046            private int START_ID_OF_NONTERMINALS = 500;
047            private String fileName = null;
048            private URL url = null;
049            private String charsetName;
050            private int nIterations;
051            private int cIterations;
052            
053            public NegraReader() {
054                    currentHeaderTable = NegraTables.UNDEF;
055                    edgelabelSymbol = new StringBuilder();
056                    edgelabelTableName = new StringBuilder();
057                    nonterminals = new TreeMap<Integer,PhraseStructureNode>();
058                    nIterations = 1;
059                    cIterations = 1;
060            }
061            
062            private void reopen() throws MaltChainedException {
063                    close();
064                    if (fileName != null) {
065                            open(fileName, charsetName);
066                    } else if (url != null) {
067                            open(url, charsetName);
068                    } else {
069                            throw new DataFormatException("The input stream cannot be reopen. ");
070                    }
071            }
072            
073            public void open(String fileName, String charsetName) throws MaltChainedException {
074                    setFileName(fileName);
075                    setCharsetName(charsetName);
076                    try {
077                            open(new FileInputStream(fileName), charsetName);
078                    } catch (FileNotFoundException e) {
079                            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
080                    }
081            }
082            public void open(URL url, String charsetName) throws MaltChainedException {
083                    setUrl(url);
084                    setCharsetName(charsetName);
085                    try {
086                            open(url.openStream(), charsetName);
087                    } catch (IOException e) {
088                            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
089                    }
090            }
091            
092            public void open(InputStream is, String charsetName) throws MaltChainedException {
093                    try {
094                            open(new InputStreamReader(is, charsetName));
095                    } catch (UnsupportedEncodingException e) {
096                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
097                    }
098            }
099            
100            public void open(InputStreamReader isr) throws MaltChainedException {
101                    setReader(new BufferedReader(isr));
102                    setSentenceCount(0);
103            }
104            
105            public void readProlog() throws MaltChainedException {
106                    
107            }
108            
109            public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
110                    if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) {
111                            return false;
112                    }
113                    syntaxGraph.clear();
114                    final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
115                    PhraseStructureNode parent = null;
116                    PhraseStructureNode child = null;
117                    currentHeaderTable = NegraTables.UNDEF;
118                    String line = null;
119                    syntaxGraph.clear();
120                    nonterminals.clear();
121                    try {
122                            while (true) {
123                                    line = reader.readLine();
124                                    if (line == null) {
125                                            if (syntaxGraph.hasTokens()) {
126                                                    sentenceCount++;
127                                                    if (syntaxGraph instanceof MappablePhraseStructureGraph) {
128                                                            ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
129                                                    }
130                                            }
131                                            if (cIterations < nIterations) {
132                                                    cIterations++;
133                                                    reopen();
134                                                    return true;
135                                            }
136                                            return false;
137                                    } else if (line.startsWith("#EOS")) {
138                                            currentTerminalSize = 0;
139                                            currentNonTerminalSize = 0;
140                                            currentHeaderTable = NegraTables.UNDEF;
141                                            if (syntaxGraph instanceof MappablePhraseStructureGraph) {
142                                                    ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
143                                            }
144                                            return true;
145                                    } else if (line.startsWith("#BOS")) {
146                                            currentHeaderTable = NegraTables.SENTENCE;
147                                            int s = -1, e = -1;
148                                            for (int i = 5, n = line.length(); i < n; i++) {
149                                                    if (Character.isDigit(line.charAt(i)) && s == -1) {
150                                                            s = i;
151                                                    }
152                                                    if (line.charAt(i) == ' ') {
153                                                            e = i;
154                                                            break;
155                                                    }
156                                            }
157                                            if (s != e && s != -1 && e != -1) {
158                                                    phraseStructure.setSentenceID(Integer.parseInt(line.substring(s,e)));
159                                            }
160                                            sentenceCount++;
161                                    } else if (currentHeaderTable == NegraTables.SENTENCE) {
162                                            if (line.length() >= 2 && line.charAt(0) == '#' && Character.isDigit(line.charAt(1))) { // Non-terminal
163                                                    Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
164                                                    ColumnDescription column = null;
165                                                    currentNonTerminalSize++;
166                                                    char[] lineChars = line.toCharArray();
167                                                    int start = 0;
168                                                    int secedgecounter = 0;
169                                                    for (int i = 0, n = lineChars.length; i < n; i++) {
170                                                            if (lineChars[i] == '\t' && start == i) {
171                                                                    start++;
172                                                            } else if (lineChars[i] == '\t' || i == n - 1) {
173                                                                    if (columns.hasNext()) {
174                                                                            column = columns.next();
175                                                                    }
176                                                                    if (column.getPosition() == 0) {
177                                                                            int index = Integer.parseInt((i == n - 1)?line.substring(start+1):line.substring(start+1, i));
178                                                                            child = nonterminals.get(index);
179                                                                            if (child == null) {
180                                                                                    if (index != 0) {
181                                                                                            child = ((PhraseStructure)syntaxGraph).addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
182                                                                                    }
183                                                                                    nonterminals.put(index,child);
184                                                                            }
185                                                                    } else if (column.getPosition() == 2 && child != null) {
186                                                                            syntaxGraph.addLabel(child, "CAT", (i == n - 1)?line.substring(start):line.substring(start, i));
187                                                                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) { 
188                                                                            edgelabelSymbol.setLength(0);
189                                                                            edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
190                                                                            edgelabelTableName.setLength(0);
191                                                                            edgelabelTableName.append(column.getName());
192                                                                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
193                                                                            int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
194                                                                            parent = nonterminals.get(index);
195                                                                            if (parent == null) {
196                                                                                    if (index == 0) {
197                                                                                            parent = phraseStructure.getPhraseStructureRoot();      
198                                                                                    } else {
199                                                                                            parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
200                                                                                    }
201                                                                                    nonterminals.put(index,parent);
202                                                                            }
203                                                                            Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
204                                                                            syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
205                                                                    } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
206                                                                            if (secedgecounter % 2 == 0) {
207                                                                                    edgelabelSymbol.setLength(0);
208                                                                                    edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
209                                                                                    secedgecounter++;
210                                                                            } else {
211                                                                                    int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
212                                                                                    if (index == 0) {
213                                                                                            parent = phraseStructure.getPhraseStructureRoot();
214                                                                                    } else if (index < START_ID_OF_NONTERMINALS) {
215                                                                                            parent = phraseStructure.getTokenNode(index);
216                                                                                    } else {
217                                                                                            parent = nonterminals.get(index);
218                                                                                            if (parent == null) {
219                                                                                                    parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
220                                                                                                    nonterminals.put(index,parent);
221                                                                                            }
222                                                                                    }
223                                                                                    Edge e = phraseStructure.addSecondaryEdge(parent, child);
224                                                                                    e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString());
225                                                                                    secedgecounter++;
226                                                                            }
227                                                                    }
228                                                                    start = i + 1;
229                                                            }
230                                                    }
231                                            } else { // Terminal
232                                                    Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
233                                                    ColumnDescription column = null;
234                                                    
235                                                    currentTerminalSize++;
236                                                    child = syntaxGraph.addTokenNode(currentTerminalSize);
237                                                    char[] lineChars = line.toCharArray();
238                                                    int start = 0;
239                                                    int secedgecounter = 0;
240                                                    for (int i = 0, n = lineChars.length; i < n; i++) {
241                                                            if (lineChars[i] == '\t' && start == i) {
242                                                                    start++;
243                                                            } else if (lineChars[i] == '\t' || i == n - 1) {
244                                                                    if (columns.hasNext()) {
245                                                                            column = columns.next();
246                                                                    }
247                                                                    if (column.getCategory() == ColumnDescription.INPUT && child != null) {
248                                                                            syntaxGraph.addLabel(child, column.getName(), (i == n - 1)?line.substring(start):line.substring(start, i));
249                                                                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL && child != null) { // && column.getName().equals("EDGELABEL")) {
250                                                                            edgelabelSymbol.setLength(0);
251                                                                            edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
252                                                                            edgelabelTableName.setLength(0);
253                                                                            edgelabelTableName.append(column.getName());
254                                                                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
255                                                                            int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
256                                                                            parent = nonterminals.get(index);
257                                                                            if (parent == null) {
258                                                                                    if (index == 0) {
259                                                                                            parent = phraseStructure.getPhraseStructureRoot();      
260                                                                                    } else {
261                                                                                            parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
262                                                                                    }
263                                                                                    nonterminals.put(index,parent);
264                                                                            }
265    
266                                                                            Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
267                                                                            syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
268                                                                    } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
269                                                                            if (secedgecounter % 2 == 0) {
270                                                                                    edgelabelSymbol.setLength(0);
271                                                                                    edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
272                                                                                    secedgecounter++;
273                                                                            } else {
274                                                                                    int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
275                                                                                    if (index == 0) {
276                                                                                            parent = phraseStructure.getPhraseStructureRoot();
277                                                                                    } else if (index < START_ID_OF_NONTERMINALS) {
278                                                                                            parent = phraseStructure.getTokenNode(index);
279                                                                                    } else {
280                                                                                            parent = nonterminals.get(index);
281                                                                                            if (parent == null) {
282                                                                                                    parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
283                                                                                                    nonterminals.put(index,parent);
284                                                                                            }
285                                                                                    }
286                                                                                    Edge e = phraseStructure.addSecondaryEdge(parent, child);
287                                                                                    e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString());
288                                                                                    secedgecounter++;
289                                                                            }
290                                                                    }
291                                                                    start = i + 1;
292                                                            }
293                                                    }
294                                            }
295                                    } else if (line.startsWith("%%")) { // comment skip
296                                    
297                                    } else if (line.startsWith("#FORMAT")) {
298    //                              int index = line.indexOf(' ');
299    //                              if (index > -1) {
300    //                                      try {
301    //                                              formatVersion = Integer.parseInt(line.substring(index+1));
302    //                                      } catch (NumberFormatException e) {
303    //                                              
304    //                                      }
305    //                              }
306                                    } else if (line.startsWith("#BOT")) {
307    //                              int index = line.indexOf(' ');
308    //                              if (index > -1) {
309    //                                      if (line.substring(index+1).equals("ORIGIN")) {
310    //                                              currentHeaderTable = NegraTables.ORIGIN;
311    //                                      } else if (line.substring(index+1).equals("EDITOR")) {
312    //                                              currentHeaderTable = NegraTables.EDITOR;
313    //                                      } else if (line.substring(index+1).equals("WORDTAG")) {
314    //                                              currentHeaderTable = NegraTables.WORDTAG;
315    //                                      } else if (line.substring(index+1).equals("MORPHTAG")) {
316    //                                              currentHeaderTable = NegraTables.MORPHTAG;
317    //                                      } else if (line.substring(index+1).equals("NODETAG")) {
318    //                                              currentHeaderTable = NegraTables.NODETAG;
319    //                                      } else if (line.substring(index+1).equals("EDGETAG")) {
320    //                                              currentHeaderTable = NegraTables.EDGETAG;
321    //                                      } else if (line.substring(index+1).equals("SECEDGETAG")) {
322    //                                              currentHeaderTable = NegraTables.SECEDGETAG;
323    //                                      } else {
324    //                                              currentHeaderTable = NegraTables.UNDEF;
325    //                                      }
326    //                              }
327                                    } else if (line.startsWith("#EOT")) {
328                                            currentHeaderTable = NegraTables.UNDEF;
329                                    }
330                            }
331                    }  catch (IOException e) {
332                            throw new DataFormatException("Error when reading from the input file. ", e);
333                    }
334            }
335            
336            public void readEpilog() throws MaltChainedException {
337                    
338            }
339            
340            public BufferedReader getReader() {
341                    return reader;
342            }
343    
344            public void setReader(BufferedReader reader) {
345                    this.reader = reader;
346            }
347    
348            public int getSentenceCount() {
349                    return sentenceCount;
350            }
351    
352            public void setSentenceCount(int sentenceCount) {
353                    this.sentenceCount = sentenceCount;
354            }
355            
356            public int getFormatVersion() {
357                    return formatVersion;
358            }
359    
360            public void setFormatVersion(int formatVersion) {
361                    this.formatVersion = formatVersion;
362            }
363    
364            public DataFormatInstance getDataFormatInstance() {
365                    return dataFormatInstance;
366            }
367            
368            public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
369                    this.dataFormatInstance = inputDataFormatInstance;
370            }
371            
372            public String getOptions() {
373                    return optionString;
374            }
375            
376            public void setOptions(String optionString) throws MaltChainedException {
377                    this.optionString = optionString;
378    
379                    String[] argv;
380                    try {
381                            argv = optionString.split("[_\\p{Blank}]");
382                    } catch (PatternSyntaxException e) {
383                            throw new DataFormatException("Could not split the penn writer option '"+optionString+"'. ", e);
384                    }
385                    for (int i=0; i < argv.length-1; i++) {
386                            if(argv[i].charAt(0) != '-') {
387                                    throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
388                            }
389                            if(++i>=argv.length) {
390                                    throw new DataFormatException("The last argument does not have any value. ");
391                            }
392                            switch(argv[i-1].charAt(1)) {
393                            case 's': 
394                                    try {
395                                            START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
396                                    } catch (NumberFormatException e){
397                                            throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
398                                    }
399                                    break;
400                            default:
401                                    throw new DataFormatException("Unknown NegraReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");             
402                            }
403                    }
404            }
405            
406            public String getFileName() {
407                    return fileName;
408            }
409    
410            public void setFileName(String fileName) {
411                    this.fileName = fileName;
412            }
413    
414            public URL getUrl() {
415                    return url;
416            }
417    
418            public void setUrl(URL url) {
419                    this.url = url;
420            }
421    
422            public String getCharsetName() {
423                    return charsetName;
424            }
425    
426            public void setCharsetName(String charsetName) {
427                    this.charsetName = charsetName;
428            }
429    
430            public int getNIterations() {
431                    return nIterations;
432            }
433    
434            public void setNIterations(int iterations) {
435                    nIterations = iterations;
436            }
437    
438            public int getIterationCounter() {
439                    return cIterations;
440            }
441            
442            public void close() throws MaltChainedException {
443                    try {
444                            if (reader != null) {
445                                    reader.close();
446                                    reader = null;
447                            }
448                    }   catch (IOException e) {
449                            throw new DataFormatException("Error when closing the input file.", e);
450                    } 
451            }
452    }