001    package org.maltparser.core.syntaxgraph.reader;
002    
003    import java.io.BufferedReader;
004    import java.io.FileInputStream;
005    import java.io.FileNotFoundException;
006    import java.io.IOException;
007    import java.io.InputStream;
008    import java.io.InputStreamReader;
009    import java.io.UnsupportedEncodingException;
010    import java.net.URL;
011    import java.util.SortedMap;
012    import java.util.regex.PatternSyntaxException;
013    
014    import javax.xml.stream.XMLInputFactory;
015    import javax.xml.stream.XMLStreamConstants;
016    import javax.xml.stream.XMLStreamException;
017    import javax.xml.stream.XMLStreamReader;
018    
019    import org.maltparser.core.exception.MaltChainedException;
020    import org.maltparser.core.io.dataformat.DataFormatException;
021    import org.maltparser.core.io.dataformat.DataFormatInstance;
022    import org.maltparser.core.symbol.SymbolTable;
023    import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
024    import org.maltparser.core.syntaxgraph.PhraseStructure;
025    import org.maltparser.core.syntaxgraph.SyntaxGraphException;
026    import org.maltparser.core.syntaxgraph.TokenStructure;
027    import org.maltparser.core.syntaxgraph.edge.Edge;
028    import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
029    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
030    
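/**
 * Reads phrase structure graphs from a TigerXML document with a StAX
 * {@link XMLStreamReader}, delivering one sentence ({@code <s>} element) per
 * call to {@link #readSentence(TokenStructure)}.
 *
 * @author Johan Hall
 */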
036    public class TigerXMLReader implements SyntaxGraphReader {
037    //      private TigerXMLHeader header;
038            private XMLStreamReader reader;
039            private int sentenceCount;
040            private DataFormatInstance dataFormatInstance;
041            private StringBuffer ntid;
042            private final StringBuilder graphRootID;
043    //      private StringBuilder elementContent; 
044    //      private StringBuilder valueName;
045    //      private StringBuilder currentFeatureName;
046    //      private Domain domain;
047    //      private boolean collectChar = false;
048            private String optionString;
049            private String fileName = null;
050            private URL url = null;
051            private String charsetName;
052            private int nIterations;
053            private int cIterations;
054            private int START_ID_OF_NONTERMINALS = 500;
055            private boolean closeStream = true;
056            
057            public TigerXMLReader() {
058                    this.ntid = new StringBuffer();
059    //              elementContent = new StringBuilder();
060    //              valueName = new StringBuilder();
061    //              currentFeatureName = new StringBuilder(); 
062                    graphRootID = new StringBuilder(); 
063                    nIterations = 1;
064                    cIterations = 1;
065            }
066            
067            private void reopen() throws MaltChainedException {
068                    close();
069                    if (fileName != null) {
070                            open(fileName, charsetName);
071                    } else if (url != null) {
072                            open(url, charsetName);
073                    } else {
074                            throw new DataFormatException("The input stream cannot be reopen. ");
075                    }
076            }
077            
078            public void open(String fileName, String charsetName) throws MaltChainedException {
079                    setFileName(fileName);
080                    setCharsetName(charsetName);
081                    try {
082                            open(new FileInputStream(fileName), charsetName);
083                    }catch (FileNotFoundException e) {
084                            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
085                    }
086            }
087            public void open(URL url, String charsetName) throws MaltChainedException {
088                    setUrl(url);
089                    setCharsetName(charsetName);
090                    try {
091                            open(url.openStream(), charsetName);
092                    } catch (IOException e) {
093                            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
094                    }
095            }
096            
097            public void open(InputStream is, String charsetName) throws MaltChainedException {
098                    try {
099                            if (is == System.in) {
100                                    closeStream = false;
101                            }
102                            open(new InputStreamReader(is, charsetName));
103                    } catch (UnsupportedEncodingException e) {
104                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
105                    }
106            }
107            
108            private void open(InputStreamReader isr) throws MaltChainedException {
109                    try {
110                            XMLInputFactory factory = XMLInputFactory.newInstance();
111                            setReader(factory.createXMLStreamReader(new BufferedReader(isr)));
112                    } catch (XMLStreamException e) {
113                            throw new DataFormatException("XML input file could be opened. ", e);
114                    } 
115                    setSentenceCount(0);
116            }
117            
118            public void readProlog() throws MaltChainedException {
119                    
120            }
121            
122            public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
123                    if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) {
124                            return false;
125                    }
126                    syntaxGraph.clear();
127                    syntaxGraph.getSymbolTables().cleanUp();
128                    final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
129                    PhraseStructureNode parent = null;
130                    PhraseStructureNode child = null;
131    //              if (header == null) {
132    //                      header = new TigerXMLHeader(syntaxGraph.getSymbolTables());
133    //              }
134    
135                    try {
136                            while (true) {
137                                    int event = reader.next();
138                                    if (event == XMLStreamConstants.START_ELEMENT) {
139                                            if (reader.getLocalName().length() == 0) {
140                                                    continue;
141                                            }
142                                            if (reader.getLocalName().charAt(0) == 'e') {
143                                                    // e -> edge, edgelabel
144                                                    if (reader.getLocalName().length() == 4) { //edge
145                                                            int childid = -1;
146                                                            int indexSep = reader.getAttributeValue(null, "idref").indexOf('_');
147                                                            
148                                                            try {
149                                                                    if (indexSep != -1) {
150                                                                            childid = Integer.parseInt(reader.getAttributeValue(null, "idref").substring(indexSep+1));
151                                                                    } else {
152                                                                            childid = Integer.parseInt(reader.getAttributeValue(null, "idref"));
153                                                                    }
154                                                                    if (childid == -1) {
155                                                                            throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+reader.getAttributeValue(null, "idref")+"' of the edge element. ");
156                                                                    }
157                                                            } catch (NumberFormatException e) {
158                                                                    throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+reader.getAttributeValue(null, "idref")+"' of the edge element. ");
159                                                            }
160    
161                                                            if (childid < START_ID_OF_NONTERMINALS) {
162                                                                    child = phraseStructure.getTokenNode(childid);
163                                                            } else {
164    
165                                                                    child = phraseStructure.getNonTerminalNode(childid-START_ID_OF_NONTERMINALS+1);
166                                                            }
167    
168                                                            Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
169                                                            SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
170                                                            for (String name : inputTables.keySet()) {
171                                                                    e.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
172                                                            }
173                                                    } else if (reader.getLocalName().equals("edgelabel")) { // edgelabel
174    //                                                      domain = Domain.EL;
175                                                    }
176                                            } else if (reader.getLocalName().charAt(0) == 'n') {
177                                                    // n -> nt, nonterminals, name
178                                                    if (reader.getLocalName().length() == 2) { // nt
179                                                            final String id = reader.getAttributeValue(null, "id");
180                                                            if (graphRootID.length() == id.length() && graphRootID.toString().equals(id)) {
181                                                                    parent = phraseStructure.getPhraseStructureRoot();
182                                                            } else {
183                                                                    int index = id.indexOf('_');
184                                                                    if (index != -1) {
185                                                                            parent = phraseStructure.addNonTerminalNode(Integer.parseInt(id.substring(index+1))-START_ID_OF_NONTERMINALS+1);
186                                                                    }
187                                                            }
188                                                            SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureNodeLabelSymbolTables();
189                                                            for (String name : inputTables.keySet()) {
190                                                                    parent.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
191                                                            }
192                                                    } else if (reader.getLocalName().equals("name")) { // name
193    //                                                      elementContent.setLength(0);
194    //                                                      collectChar = true;
195                                                    }
196                                            } else if (reader.getLocalName().charAt(0) == 't') {
197                                                    // t -> t, terminals
198                                                    if (reader.getLocalName().length() == 1) { // t
199                                                            SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getInputSymbolTables();
200                                                            child = syntaxGraph.addTokenNode();
201                                                            for (String name : inputTables.keySet()) {
202                                                                    child.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
203                                                            }
204                                                    }
205                                            } else if (reader.getLocalName().charAt(0) == 's') {
206                                                    // s -> subcorpus, secedge, s, secedgelabel
207                                                    if (reader.getLocalName().length() == 1) { // s
208                                                            String id = reader.getAttributeValue(null, "id");
209                                                            boolean indexable = false;
210                                                            int index = -1;
211                                                            if (id != null && id.length() > 0) {
212                                                                    for (int i = 0, n = id.length(); i < n; i++) {
213                                                                            if (Character.isDigit(id.charAt(i))) {
214                                                                                    if (index == -1) { 
215                                                                                            index = i;
216                                                                                    }
217                                                                                    indexable = true;
218                                                                            }
219                                                                    }
220                                                            }
221                                                            if (indexable) {
222                                                                    phraseStructure.setSentenceID(Integer.parseInt(id.substring(index)));
223                                                            } else {
224                                                                    phraseStructure.setSentenceID(sentenceCount+1);
225                                                            }
226                                                    }
227                                            } else if (reader.getLocalName().charAt(0) == 'v') {
228                                                    // v -> variable, value
229    //                                              if (reader.getLocalName().equals("value")) {
230    //                                                      valueName.setLength(0);
231    //                                                      valueName.append(reader.getAttributeValue(null, "name"));
232    //                                                      elementContent.setLength(0);
233    //                                                      collectChar = true;
234    //                                              }
235                                            } else {
236    //                                               a -> annotation, author
237    //                                               b -> body
238    //                                               c -> corpus
239    //                                               d -> date, description,
240    //                                               f -> feature, format
241    //                                               g -> graph
242    //                                               h -> head, history
243    //                                               m -> matches, match
244                                                    if (reader.getLocalName().equals("graph")) {
245                                                            graphRootID.setLength(0);
246                                                            graphRootID.append(reader.getAttributeValue(null, "root"));
247                                                    } else  if (reader.getLocalName().equals("corpus")) {
248    //                                                      header.setCorpusID(reader.getAttributeValue(null, "id"));
249    //                                                      header.setCorpusID(reader.getAttributeValue(null, "version"));
250                                                    } else if (reader.getLocalName().equals("feature")) {
251    //                                                      if (header != null) {
252    //                                                              currentFeatureName.setLength(0);
253    //                                                              currentFeatureName.append(reader.getAttributeValue(null, "name"));
254    //                                                              header.addFeature(reader.getAttributeValue(null, "name"), reader.getAttributeValue(null, "domain"));
255    //                                                      }
256    //                                                      domain = Domain.valueOf(reader.getAttributeValue(null, "domain"));
257                                                    } else if (reader.getLocalName().equals("secedgelabel")) {
258    //                                                      domain = Domain.SEL;
259                                                    } else if (reader.getLocalName().equals("author")) {
260    //                                                      elementContent.setLength(0);
261    //                                                      collectChar = true;
262                                                    } else if (reader.getLocalName().equals("date")) {
263    //                                                      elementContent.setLength(0);
264    //                                                      collectChar = true;
265                                                    } else if (reader.getLocalName().equals("description")) {
266    //                                                      elementContent.setLength(0);
267    //                                                      collectChar = true;
268                                                    } else if (reader.getLocalName().equals("format")) {
269    //                                                      elementContent.setLength(0);
270    //                                                      collectChar = true;
271                                                    } else if (reader.getLocalName().equals("history")) {
272    //                                                      elementContent.setLength(0);
273    //                                                      collectChar = true;
274                                                    } 
275                                            }
276                                    } else if (event == XMLStreamConstants.END_ELEMENT) {
277                                            if (reader.getLocalName().length() == 0) {
278                                                    continue;
279                                            }
280                                            if (reader.getLocalName().charAt(0) == 'e') {
281                                                    // e -> edge, edgelabel
282                                            } else if (reader.getLocalName().charAt(0) == 'n') {
283                                                    // n -> nt, nonterminals, name
284                                                    if (reader.getLocalName().equals("nt")) {
285                                                            ntid.setLength(0);
286                                                    }
287                                                    else if (reader.getLocalName().equals("nonterminals")) {
288                                                            if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 &&((NonTerminalNode)phraseStructure.getPhraseStructureRoot()).nChildren() == 0) {
289                                                                    Edge e = phraseStructure.addPhraseStructureEdge(phraseStructure.getPhraseStructureRoot(), phraseStructure.getTokenNode(1));
290                                                                    SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
291                                                                    for (String name : inputTables.keySet()) {
292                                                                            e.addLabel(inputTables.get(name), "--");
293                                                                    }
294                                                            }
295                                                    }
296    //                                              else if (reader.getLocalName().equals("name")) {
297    //                                                      if (header != null) {
298    //                                                              header.setMetaName(elementContent.toString());
299    //                                                      }
300    //                                                      collectChar = false;
301    //                                              }
302                                            } else if (reader.getLocalName().charAt(0) == 't') {
303                                                    // t -> t, terminals
304                                            } else if (reader.getLocalName().charAt(0) == 's') {
305                                                    // s -> subcorpus, secedge, s, secedgelabel
306                                                    if (reader.getLocalName().equals("s")) {
307                                                            if (syntaxGraph.hasTokens()) {
308                                                                    sentenceCount++;
309                                                            }
310                                                            if (syntaxGraph instanceof MappablePhraseStructureGraph) {
311                                                                    ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
312                                                            }
313                                                            return true;
314                                                    }
315                                            } else if (reader.getLocalName().charAt(0) == 'v') {
316                                                    // v -> variable, value
317    //                                              if (reader.getLocalName().equals("value")) {
318    //                                                      if (header != null) {
319    //                                                              if (domain == Domain.T || domain == Domain.NT || domain == Domain.FREC) {
320    //                                                                      header.addFeatureValue(currentFeatureName.toString(), valueName.toString(), elementContent.toString());
321    //                                                              } else if (domain == Domain.EL) {
322    //                                                                      header.addEdgeLabelValue(valueName.toString(), elementContent.toString());
323    //                                                              } else if (domain == Domain.SEL) {
324    //                                                                      header.addSecEdgeLabelValue(valueName.toString(), elementContent.toString());
325    //                                                              }
326    //                                                      }
327    //                                                      collectChar = false;
328    //                                              }
329                                            } else {
330    //                                               a -> annotation, author
331    //                                               b -> body
332    //                                               c -> corpus
333    //                                               d -> date, description,
334    //                                               f -> feature, format
335    //                                               g -> graph
336    //                                               h -> head, history
337    //                                               m -> matches, match
338                                                    if (reader.getLocalName().equals("body")) {
339                                                            //sentence = dataStructures.getSentence();
340                                                            //phraseTree = dataStructures.getInPhraseTree();
341                                                            //sentence.clear();
342                                                            //phraseTree.clear();
343                                                            //dataStructures.setLastProcessObject(true);
344                                                    }  else if (reader.getLocalName().equals("author")) {
345    //                                                      if (header != null) {
346    //                                                              header.setMetaAuthor(elementContent.toString());
347    //                                                      }
348    //                                                      collectChar = false;
349                                                    } else if (reader.getLocalName().equals("date")) {
350    //                                                      if (header != null) {
351    //                                                              header.setMetaInDate(elementContent.toString());
352    //                                                      }
353    //                                                      collectChar = false;
354                                                    } else if (reader.getLocalName().equals("description")) {
355    //                                                      if (header != null) {
356    //                                                              header.setMetaDescription(elementContent.toString());
357    //                                                      }
358    //                                                      collectChar = false;
359                                                    } else if (reader.getLocalName().equals("format")) {
360    //                                                      if (header != null) {
361    //                                                              header.setMetaFormat(elementContent.toString());
362    //                                                      }
363    //                                                      collectChar = false;
364                                                    } else if (reader.getLocalName().equals("history")) {
365    //                                                      if (header != null) {
366    //                                                              header.setMetaHistory(elementContent.toString());
367    //                                                      }
368    //                                                      collectChar = false;
369                                                    } /* else if (reader.getLocalName().equals("annotation")) {
370                                                            if (header != null) {
371                                                                    System.out.println(header.toTigerXML());
372                                                            }
373                                                            collectChar = false;
374                                                    } */
375                                            }                               
376                                    } else if (event == XMLStreamConstants.END_DOCUMENT) {
377                                            if (syntaxGraph.hasTokens()) {
378                                                    sentenceCount++;
379                                            }
380                                            if (cIterations < nIterations) {
381                                                    cIterations++;
382                                                    reopen();
383                                                    return true;
384                                            }
385                                            return false;
386                                    } else if (event == XMLStreamConstants.CHARACTERS) {
387    //                                      if (collectChar) {
388    //                                              char[] ch = reader.getTextCharacters();
389    //                                              final int size = reader.getTextStart()+reader.getTextLength();
390    //                                              for (int i = reader.getTextStart(); i < size; i++) {
391    //                                                      elementContent.append(ch[i]);
392    //                                              }
393    //                                      }
394                                    }
395                            }
396                    } catch (XMLStreamException e) {
397                            throw new DataFormatException("", e);
398                    }
399            }
400            
401            public int getSentenceCount() {
402                    return sentenceCount;
403            }
404    
405            public void setSentenceCount(int sentenceCount) {
406                    this.sentenceCount = sentenceCount;
407            }
408            
409            public XMLStreamReader getReader() {
410                    return reader;
411            }
412    
413            public void setReader(XMLStreamReader reader) {
414                    this.reader = reader;
415            }
416            
417            public void readEpilog() throws MaltChainedException {
418                    
419            }
420            
421            public void close() throws MaltChainedException {
422                    try {
423                            if (reader != null) {
424                                    if (closeStream) {
425                                            reader.close();
426                                    }
427                                    reader = null;
428                            }
429                    } catch (XMLStreamException e) {
430                            throw new DataFormatException("The XML input file could be closed. ", e);
431                    }
432            }
433    
434            public DataFormatInstance getDataFormatInstance() {
435                    return dataFormatInstance;
436            }
437            
438            public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
439                    this.dataFormatInstance = inputDataFormatInstance;
440            }
441            
442            public String getOptions() {
443                    return optionString;
444            }
445            
446            public void setOptions(String optionString) throws MaltChainedException {
447                    this.optionString = optionString;
448                    String[] argv;
449                    try {
450                            argv = optionString.split("[_\\p{Blank}]");
451                    } catch (PatternSyntaxException e) {
452                            throw new DataFormatException("Could not split the TigerXML reader option '"+optionString+"'. ", e);
453                    }
454                    for (int i=0; i < argv.length-1; i++) {
455                            if(argv[i].charAt(0) != '-') {
456                                    throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
457                            }
458                            if(++i>=argv.length) {
459                                    throw new DataFormatException("The last argument does not have any value. ");
460                            }
461                            switch(argv[i-1].charAt(1)) {
462                            case 's': 
463                                    try {
464                                            START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
465                                    } catch (NumberFormatException e){
466                                            throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
467                                    }
468                                    break;
469                            default:
470                                    throw new DataFormatException("Unknown TigerXMLReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");          
471                            }
472                    }
473            }
474            
475            public String getFileName() {
476                    return fileName;
477            }
478    
479            public void setFileName(String fileName) {
480                    this.fileName = fileName;
481            }
482    
483            public URL getUrl() {
484                    return url;
485            }
486    
487            public void setUrl(URL url) {
488                    this.url = url;
489            }
490    
491            public String getCharsetName() {
492                    return charsetName;
493            }
494    
495            public void setCharsetName(String charsetName) {
496                    this.charsetName = charsetName;
497            }
498    
499            public int getNIterations() {
500                    return nIterations;
501            }
502    
503            public void setNIterations(int iterations) {
504                    nIterations = iterations;
505            }
506    
507            public int getIterationCounter() {
508                    return cIterations;
509            }
510    //      public TigerXMLHeader getHeader() {
511    //              return header;
512    //      }
513    //      
514    //      public void setHeader(TigerXMLHeader header) {
515    //              this.header = header;
516    //      }
517    }