001    package org.maltparser.core.syntaxgraph.reader;
002    
003    import java.io.BufferedReader;
004    import java.io.FileInputStream;
005    import java.io.FileNotFoundException;
006    import java.io.IOException;
007    import java.io.InputStream;
008    import java.io.InputStreamReader;
009    import java.io.UnsupportedEncodingException;
010    import java.net.URL;
011    import java.util.Iterator;
012    import java.util.SortedMap;
013    import java.util.TreeMap;
014    import java.util.regex.PatternSyntaxException;
015    
016    import org.maltparser.core.exception.MaltChainedException;
017    import org.maltparser.core.io.dataformat.ColumnDescription;
018    import org.maltparser.core.io.dataformat.DataFormatException;
019    import org.maltparser.core.io.dataformat.DataFormatInstance;
020    import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
021    import org.maltparser.core.syntaxgraph.PhraseStructure;
022    import org.maltparser.core.syntaxgraph.TokenStructure;
023    import org.maltparser.core.syntaxgraph.edge.Edge;
024    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
025    
026    /**
027    *
028    *
029    * @author Johan Hall
030    */
031    public class NegraReader implements SyntaxGraphReader {
032            private enum NegraTables {
033                    ORIGIN, EDITOR, WORDTAG, MORPHTAG, NODETAG, EDGETAG, SECEDGETAG, SENTENCE, UNDEF
034            };
035            private BufferedReader reader;
036            private DataFormatInstance dataFormatInstance;
037            private int sentenceCount;
038            private String optionString;
039            private int formatVersion;
040            private NegraTables currentHeaderTable;
041            private int currentTerminalSize;
042            private int currentNonTerminalSize;
043            private SortedMap<Integer,PhraseStructureNode> nonterminals; 
044            private StringBuilder edgelabelSymbol;
045            private StringBuilder edgelabelTableName;
046            private int START_ID_OF_NONTERMINALS = 500;
047            
048            public NegraReader() {
049                    currentHeaderTable = NegraTables.UNDEF;
050                    edgelabelSymbol = new StringBuilder();
051                    edgelabelTableName = new StringBuilder();
052                    nonterminals = new TreeMap<Integer,PhraseStructureNode>();
053            }
054            
055            public void open(String fileName, String charsetName) throws MaltChainedException {
056                    try {
057                            open(new FileInputStream(fileName), charsetName);
058                    } catch (FileNotFoundException e) {
059                            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
060                    }
061            }
062            public void open(URL url, String charsetName) throws MaltChainedException {
063                    try {
064                            open(url.openStream(), charsetName);
065                    } catch (IOException e) {
066                            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
067                    }
068            }
069            
070            public void open(InputStream is, String charsetName) throws MaltChainedException {
071                    try {
072                            open(new InputStreamReader(is, charsetName));
073                    } catch (UnsupportedEncodingException e) {
074                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
075                    }
076            }
077            
078            public void open(InputStreamReader isr) throws MaltChainedException {
079                    setReader(new BufferedReader(isr));
080                    setSentenceCount(0);
081            }
082            
083            public void readProlog() throws MaltChainedException {
084                    
085            }
086            
087            public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
088                    if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) {
089                            return false;
090                    }
091                    syntaxGraph.clear();
092                    final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
093                    PhraseStructureNode parent = null;
094                    PhraseStructureNode child = null;
095                    currentHeaderTable = NegraTables.UNDEF;
096                    String line = null;
097                    syntaxGraph.clear();
098                    nonterminals.clear();
099                    try {
100                            while (true) {
101                                    line = reader.readLine();
102                                    if (line == null) {
103                                            if (syntaxGraph.hasTokens()) {
104                                                    sentenceCount++;
105                                                    if (syntaxGraph instanceof MappablePhraseStructureGraph) {
106                                                            ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
107                                                    }
108                                            }
109                                            return false;
110                                    } else if (line.startsWith("#EOS")) {
111                                            currentTerminalSize = 0;
112                                            currentNonTerminalSize = 0;
113                                            currentHeaderTable = NegraTables.UNDEF;
114                                            if (syntaxGraph instanceof MappablePhraseStructureGraph) {
115                                                    ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
116                                            }
117                                            return true;
118                                    } else if (line.startsWith("#BOS")) {
119                                            currentHeaderTable = NegraTables.SENTENCE;
120                                            int s = -1, e = -1;
121                                            for (int i = 5, n = line.length(); i < n; i++) {
122                                                    if (Character.isDigit(line.charAt(i)) && s == -1) {
123                                                            s = i;
124                                                    }
125                                                    if (line.charAt(i) == ' ') {
126                                                            e = i;
127                                                            break;
128                                                    }
129                                            }
130                                            if (s != e && s != -1 && e != -1) {
131                                                    phraseStructure.setSentenceID(Integer.parseInt(line.substring(s,e)));
132                                            }
133                                            sentenceCount++;
134                                    } else if (currentHeaderTable == NegraTables.SENTENCE) {
135                                            if (line.length() >= 2 && line.charAt(0) == '#' && Character.isDigit(line.charAt(1))) { // Non-terminal
136                                                    Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
137                                                    ColumnDescription column = null;
138                                                    currentNonTerminalSize++;
139                                                    char[] lineChars = line.toCharArray();
140                                                    int start = 0;
141                                                    int secedgecounter = 0;
142                                                    for (int i = 0, n = lineChars.length; i < n; i++) {
143                                                            if (lineChars[i] == '\t' && start == i) {
144                                                                    start++;
145                                                            } else if (lineChars[i] == '\t' || i == n - 1) {
146                                                                    if (columns.hasNext()) {
147                                                                            column = columns.next();
148                                                                    }
149                                                                    if (column.getPosition() == 0) {
150                                                                            int index = Integer.parseInt((i == n - 1)?line.substring(start+1):line.substring(start+1, i));
151                                                                            child = nonterminals.get(index);
152                                                                            if (child == null) {
153                                                                                    if (index != 0) {
154                                                                                            child = ((PhraseStructure)syntaxGraph).addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
155                                                                                    }
156                                                                                    nonterminals.put(index,child);
157                                                                            }
158                                                                    } else if (column.getPosition() == 2 && child != null) {
159                                                                            syntaxGraph.addLabel(child, "CAT", (i == n - 1)?line.substring(start):line.substring(start, i));
160                                                                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) { 
161                                                                            edgelabelSymbol.setLength(0);
162                                                                            edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
163                                                                            edgelabelTableName.setLength(0);
164                                                                            edgelabelTableName.append(column.getName());
165                                                                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
166                                                                            int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
167                                                                            parent = nonterminals.get(index);
168                                                                            if (parent == null) {
169                                                                                    if (index == 0) {
170                                                                                            parent = phraseStructure.getPhraseStructureRoot();      
171                                                                                    } else {
172                                                                                            parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
173                                                                                    }
174                                                                                    nonterminals.put(index,parent);
175                                                                            }
176                                                                            Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
177                                                                            syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
178                                                                    } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
179                                                                            if (secedgecounter % 2 == 0) {
180                                                                                    edgelabelSymbol.setLength(0);
181                                                                                    edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
182                                                                                    secedgecounter++;
183                                                                            } else {
184                                                                                    int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
185                                                                                    if (index == 0) {
186                                                                                            parent = phraseStructure.getPhraseStructureRoot();
187                                                                                    } else if (index < START_ID_OF_NONTERMINALS) {
188                                                                                            parent = phraseStructure.getTokenNode(index);
189                                                                                    } else {
190                                                                                            parent = nonterminals.get(index);
191                                                                                            if (parent == null) {
192                                                                                                    parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
193                                                                                                    nonterminals.put(index,parent);
194                                                                                            }
195                                                                                    }
196                                                                                    Edge e = phraseStructure.addSecondaryEdge(parent, child);
197                                                                                    e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString());
198                                                                                    secedgecounter++;
199                                                                            }
200                                                                    }
201                                                                    start = i + 1;
202                                                            }
203                                                    }
204                                            } else { // Terminal
205                                                    Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
206                                                    ColumnDescription column = null;
207                                                    
208                                                    currentTerminalSize++;
209                                                    child = syntaxGraph.addTokenNode(currentTerminalSize);
210                                                    char[] lineChars = line.toCharArray();
211                                                    int start = 0;
212                                                    int secedgecounter = 0;
213                                                    for (int i = 0, n = lineChars.length; i < n; i++) {
214                                                            if (lineChars[i] == '\t' && start == i) {
215                                                                    start++;
216                                                            } else if (lineChars[i] == '\t' || i == n - 1) {
217                                                                    if (columns.hasNext()) {
218                                                                            column = columns.next();
219                                                                    }
220                                                                    if (column.getCategory() == ColumnDescription.INPUT && child != null) {
221                                                                            syntaxGraph.addLabel(child, column.getName(), (i == n - 1)?line.substring(start):line.substring(start, i));
222                                                                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL && child != null) { // && column.getName().equals("EDGELABEL")) {
223                                                                            edgelabelSymbol.setLength(0);
224                                                                            edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
225                                                                            edgelabelTableName.setLength(0);
226                                                                            edgelabelTableName.append(column.getName());
227                                                                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
228                                                                            int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
229                                                                            parent = nonterminals.get(index);
230                                                                            if (parent == null) {
231                                                                                    if (index == 0) {
232                                                                                            parent = phraseStructure.getPhraseStructureRoot();      
233                                                                                    } else {
234                                                                                            parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
235                                                                                    }
236                                                                                    nonterminals.put(index,parent);
237                                                                            }
238    
239                                                                            Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
240                                                                            syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
241                                                                    } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
242                                                                            if (secedgecounter % 2 == 0) {
243                                                                                    edgelabelSymbol.setLength(0);
244                                                                                    edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
245                                                                                    secedgecounter++;
246                                                                            } else {
247                                                                                    int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
248                                                                                    if (index == 0) {
249                                                                                            parent = phraseStructure.getPhraseStructureRoot();
250                                                                                    } else if (index < START_ID_OF_NONTERMINALS) {
251                                                                                            parent = phraseStructure.getTokenNode(index);
252                                                                                    } else {
253                                                                                            parent = nonterminals.get(index);
254                                                                                            if (parent == null) {
255                                                                                                    parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
256                                                                                                    nonterminals.put(index,parent);
257                                                                                            }
258                                                                                    }
259                                                                                    Edge e = phraseStructure.addSecondaryEdge(parent, child);
260                                                                                    e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString());
261                                                                                    secedgecounter++;
262                                                                            }
263                                                                    }
264                                                                    start = i + 1;
265                                                            }
266                                                    }
267                                            }
268                                    } else if (line.startsWith("%%")) { // comment skip
269                                    
270                                    } else if (line.startsWith("#FORMAT")) {
271    //                              int index = line.indexOf(' ');
272    //                              if (index > -1) {
273    //                                      try {
274    //                                              formatVersion = Integer.parseInt(line.substring(index+1));
275    //                                      } catch (NumberFormatException e) {
276    //                                              
277    //                                      }
278    //                              }
279                                    } else if (line.startsWith("#BOT")) {
280    //                              int index = line.indexOf(' ');
281    //                              if (index > -1) {
282    //                                      if (line.substring(index+1).equals("ORIGIN")) {
283    //                                              currentHeaderTable = NegraTables.ORIGIN;
284    //                                      } else if (line.substring(index+1).equals("EDITOR")) {
285    //                                              currentHeaderTable = NegraTables.EDITOR;
286    //                                      } else if (line.substring(index+1).equals("WORDTAG")) {
287    //                                              currentHeaderTable = NegraTables.WORDTAG;
288    //                                      } else if (line.substring(index+1).equals("MORPHTAG")) {
289    //                                              currentHeaderTable = NegraTables.MORPHTAG;
290    //                                      } else if (line.substring(index+1).equals("NODETAG")) {
291    //                                              currentHeaderTable = NegraTables.NODETAG;
292    //                                      } else if (line.substring(index+1).equals("EDGETAG")) {
293    //                                              currentHeaderTable = NegraTables.EDGETAG;
294    //                                      } else if (line.substring(index+1).equals("SECEDGETAG")) {
295    //                                              currentHeaderTable = NegraTables.SECEDGETAG;
296    //                                      } else {
297    //                                              currentHeaderTable = NegraTables.UNDEF;
298    //                                      }
299    //                              }
300                                    } else if (line.startsWith("#EOT")) {
301                                            currentHeaderTable = NegraTables.UNDEF;
302                                    }
303                            }
304                    }  catch (IOException e) {
305                            throw new DataFormatException("Error when reading from the input file. ", e);
306                    }
307            }
308            
309            public void readEpilog() throws MaltChainedException {
310                    
311            }
312            
313            public BufferedReader getReader() {
314                    return reader;
315            }
316    
317            public void setReader(BufferedReader reader) {
318                    this.reader = reader;
319            }
320    
321            public int getSentenceCount() {
322                    return sentenceCount;
323            }
324    
325            public void setSentenceCount(int sentenceCount) {
326                    this.sentenceCount = sentenceCount;
327            }
328            
329            public int getFormatVersion() {
330                    return formatVersion;
331            }
332    
333            public void setFormatVersion(int formatVersion) {
334                    this.formatVersion = formatVersion;
335            }
336    
337            public DataFormatInstance getDataFormatInstance() {
338                    return dataFormatInstance;
339            }
340            
341            public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
342                    this.dataFormatInstance = inputDataFormatInstance;
343            }
344            
345            public String getOptions() {
346                    return optionString;
347            }
348            
349            public void setOptions(String optionString) throws MaltChainedException {
350                    this.optionString = optionString;
351    
352                    String[] argv;
353                    try {
354                            argv = optionString.split("[_\\p{Blank}]");
355                    } catch (PatternSyntaxException e) {
356                            throw new DataFormatException("Could not split the penn writer option '"+optionString+"'. ", e);
357                    }
358                    for (int i=0; i < argv.length-1; i++) {
359                            if(argv[i].charAt(0) != '-') {
360                                    throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
361                            }
362                            if(++i>=argv.length) {
363                                    throw new DataFormatException("The last argument does not have any value. ");
364                            }
365                            switch(argv[i-1].charAt(1)) {
366                            case 's': 
367                                    try {
368                                            START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
369                                    } catch (NumberFormatException e){
370                                            throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
371                                    }
372                                    break;
373                            default:
374                                    throw new DataFormatException("Unknown NegraReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");             
375                            }
376                    }
377            }
378            
379            public void close() throws MaltChainedException {
380                    try {
381                            if (reader != null) {
382                                    reader.close();
383                                    reader = null;
384                            }
385                    }   catch (IOException e) {
386                            throw new DataFormatException("Error when closing the input file.", e);
387                    } 
388            }
389    }