001    package org.maltparser.core.syntaxgraph.reader;
002    
003    import java.io.BufferedReader;
004    import java.io.FileInputStream;
005    import java.io.FileNotFoundException;
006    import java.io.IOException;
007    import java.io.InputStream;
008    import java.io.InputStreamReader;
009    import java.io.UnsupportedEncodingException;
010    import java.net.URL;
011    import java.util.SortedMap;
012    import java.util.regex.PatternSyntaxException;
013    
014    import javax.xml.stream.XMLInputFactory;
015    import javax.xml.stream.XMLStreamConstants;
016    import javax.xml.stream.XMLStreamException;
017    import javax.xml.stream.XMLStreamReader;
018    
019    import org.maltparser.core.exception.MaltChainedException;
020    import org.maltparser.core.io.dataformat.DataFormatException;
021    import org.maltparser.core.io.dataformat.DataFormatInstance;
022    import org.maltparser.core.symbol.SymbolTable;
023    import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
024    import org.maltparser.core.syntaxgraph.PhraseStructure;
025    import org.maltparser.core.syntaxgraph.SyntaxGraphException;
026    import org.maltparser.core.syntaxgraph.TokenStructure;
027    import org.maltparser.core.syntaxgraph.edge.Edge;
028    import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
029    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
030    
031    /**
032    *
033    *
034    * @author Johan Hall
035    */
036    public class TigerXMLReader implements SyntaxGraphReader {
037    //      private TigerXMLHeader header;
038            private XMLStreamReader reader;
039            private int sentenceCount;
040            private DataFormatInstance dataFormatInstance;
041            private StringBuffer ntid;
042            private final StringBuilder graphRootID;
043    //      private StringBuilder elementContent; 
044    //      private StringBuilder valueName;
045    //      private StringBuilder currentFeatureName;
046    //      private Domain domain;
047    //      private boolean collectChar = false;
048            private String optionString;
049            private String fileName = null;
050            private URL url = null;
051            private String charsetName;
052            private int nIterations;
053            private int cIterations;
054            private int START_ID_OF_NONTERMINALS = 500;
055            
056            
057            public TigerXMLReader() {
058                    this.ntid = new StringBuffer();
059    //              elementContent = new StringBuilder();
060    //              valueName = new StringBuilder();
061    //              currentFeatureName = new StringBuilder(); 
062                    graphRootID = new StringBuilder(); 
063                    nIterations = 1;
064                    cIterations = 1;
065            }
066            
067            private void reopen() throws MaltChainedException {
068                    close();
069                    if (fileName != null) {
070                            open(fileName, charsetName);
071                    } else if (url != null) {
072                            open(url, charsetName);
073                    } else {
074                            throw new DataFormatException("The input stream cannot be reopen. ");
075                    }
076            }
077            
078            public void open(String fileName, String charsetName) throws MaltChainedException {
079                    setFileName(fileName);
080                    setCharsetName(charsetName);
081                    try {
082                            open(new FileInputStream(fileName), charsetName);
083                    }catch (FileNotFoundException e) {
084                            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
085                    }
086            }
087            public void open(URL url, String charsetName) throws MaltChainedException {
088                    setUrl(url);
089                    setCharsetName(charsetName);
090                    try {
091                            open(url.openStream(), charsetName);
092                    } catch (IOException e) {
093                            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
094                    }
095            }
096            
097            public void open(InputStream is, String charsetName) throws MaltChainedException {
098                    try {
099                            open(new InputStreamReader(is, charsetName));
100                    } catch (UnsupportedEncodingException e) {
101                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
102                    }
103            }
104            
105            public void open(InputStreamReader isr) throws MaltChainedException {
106                    try {
107                            XMLInputFactory factory = XMLInputFactory.newInstance();
108                            setReader(factory.createXMLStreamReader(new BufferedReader(isr)));
109                    } catch (XMLStreamException e) {
110                            throw new DataFormatException("XML input file could be opened. ", e);
111                    } 
112                    setSentenceCount(0);
113            }
114            
115            public void readProlog() throws MaltChainedException {
116                    
117            }
118            
119            public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
120                    if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) {
121                            return false;
122                    }
123                    syntaxGraph.clear();
124                    final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
125                    PhraseStructureNode parent = null;
126                    PhraseStructureNode child = null;
127    //              if (header == null) {
128    //                      header = new TigerXMLHeader(syntaxGraph.getSymbolTables());
129    //              }
130    
131                    try {
132                            while (true) {
133                                    int event = reader.next();
134                                    if (event == XMLStreamConstants.START_ELEMENT) {
135                                            if (reader.getLocalName().length() == 0) {
136                                                    continue;
137                                            }
138                                            if (reader.getLocalName().charAt(0) == 'e') {
139                                                    // e -> edge, edgelabel
140                                                    if (reader.getLocalName().length() == 4) { //edge
141                                                            int childid = -1;
142                                                            int indexSep = reader.getAttributeValue(null, "idref").indexOf('_');
143                                                            
144                                                            try {
145                                                                    if (indexSep != -1) {
146                                                                            childid = Integer.parseInt(reader.getAttributeValue(null, "idref").substring(indexSep+1));
147                                                                    } else {
148                                                                            childid = Integer.parseInt(reader.getAttributeValue(null, "idref"));
149                                                                    }
150                                                                    if (childid == -1) {
151                                                                            throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+reader.getAttributeValue(null, "idref")+"' of the edge element. ");
152                                                                    }
153                                                            } catch (NumberFormatException e) {
154                                                                    throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+reader.getAttributeValue(null, "idref")+"' of the edge element. ");
155                                                            }
156    
157                                                            if (childid < START_ID_OF_NONTERMINALS) {
158                                                                    child = phraseStructure.getTokenNode(childid);
159                                                            } else {
160    
161                                                                    child = phraseStructure.getNonTerminalNode(childid-START_ID_OF_NONTERMINALS+1);
162                                                            }
163    
164                                                            Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
165                                                            SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
166                                                            for (String name : inputTables.keySet()) {
167                                                                    e.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
168                                                            }
169                                                    } else if (reader.getLocalName().equals("edgelabel")) { // edgelabel
170    //                                                      domain = Domain.EL;
171                                                    }
172                                            } else if (reader.getLocalName().charAt(0) == 'n') {
173                                                    // n -> nt, nonterminals, name
174                                                    if (reader.getLocalName().length() == 2) { // nt
175                                                            final String id = reader.getAttributeValue(null, "id");
176                                                            if (graphRootID.length() == id.length() && graphRootID.toString().equals(id)) {
177                                                                    parent = phraseStructure.getPhraseStructureRoot();
178                                                            } else {
179                                                                    int index = id.indexOf('_');
180                                                                    if (index != -1) {
181                                                                            parent = phraseStructure.addNonTerminalNode(Integer.parseInt(id.substring(index+1))-START_ID_OF_NONTERMINALS+1);
182                                                                    }
183                                                            }
184                                                            SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureNodeLabelSymbolTables();
185                                                            for (String name : inputTables.keySet()) {
186                                                                    parent.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
187                                                            }
188                                                    } else if (reader.getLocalName().equals("name")) { // name
189    //                                                      elementContent.setLength(0);
190    //                                                      collectChar = true;
191                                                    }
192                                            } else if (reader.getLocalName().charAt(0) == 't') {
193                                                    // t -> t, terminals
194                                                    if (reader.getLocalName().length() == 1) { // t
195                                                            SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getInputSymbolTables();
196                                                            child = syntaxGraph.addTokenNode();
197                                                            for (String name : inputTables.keySet()) {
198                                                                    child.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
199                                                            }
200                                                    }
201                                            } else if (reader.getLocalName().charAt(0) == 's') {
202                                                    // s -> subcorpus, secedge, s, secedgelabel
203                                                    if (reader.getLocalName().length() == 1) { // s
204                                                            String id = reader.getAttributeValue(null, "id");
205                                                            boolean indexable = false;
206                                                            int index = -1;
207                                                            if (id != null && id.length() > 0) {
208                                                                    for (int i = 0, n = id.length(); i < n; i++) {
209                                                                            if (Character.isDigit(id.charAt(i))) {
210                                                                                    if (index == -1) { 
211                                                                                            index = i;
212                                                                                    }
213                                                                                    indexable = true;
214                                                                            }
215                                                                    }
216                                                            }
217                                                            if (indexable) {
218                                                                    phraseStructure.setSentenceID(Integer.parseInt(id.substring(index)));
219                                                            } else {
220                                                                    phraseStructure.setSentenceID(sentenceCount+1);
221                                                            }
222                                                    }
223                                            } else if (reader.getLocalName().charAt(0) == 'v') {
224                                                    // v -> variable, value
225    //                                              if (reader.getLocalName().equals("value")) {
226    //                                                      valueName.setLength(0);
227    //                                                      valueName.append(reader.getAttributeValue(null, "name"));
228    //                                                      elementContent.setLength(0);
229    //                                                      collectChar = true;
230    //                                              }
231                                            } else {
232    //                                               a -> annotation, author
233    //                                               b -> body
234    //                                               c -> corpus
235    //                                               d -> date, description,
236    //                                               f -> feature, format
237    //                                               g -> graph
238    //                                               h -> head, history
239    //                                               m -> matches, match
240                                                    if (reader.getLocalName().equals("graph")) {
241                                                            graphRootID.setLength(0);
242                                                            graphRootID.append(reader.getAttributeValue(null, "root"));
243                                                    } else  if (reader.getLocalName().equals("corpus")) {
244    //                                                      header.setCorpusID(reader.getAttributeValue(null, "id"));
245    //                                                      header.setCorpusID(reader.getAttributeValue(null, "version"));
246                                                    } else if (reader.getLocalName().equals("feature")) {
247    //                                                      if (header != null) {
248    //                                                              currentFeatureName.setLength(0);
249    //                                                              currentFeatureName.append(reader.getAttributeValue(null, "name"));
250    //                                                              header.addFeature(reader.getAttributeValue(null, "name"), reader.getAttributeValue(null, "domain"));
251    //                                                      }
252    //                                                      domain = Domain.valueOf(reader.getAttributeValue(null, "domain"));
253                                                    } else if (reader.getLocalName().equals("secedgelabel")) {
254    //                                                      domain = Domain.SEL;
255                                                    } else if (reader.getLocalName().equals("author")) {
256    //                                                      elementContent.setLength(0);
257    //                                                      collectChar = true;
258                                                    } else if (reader.getLocalName().equals("date")) {
259    //                                                      elementContent.setLength(0);
260    //                                                      collectChar = true;
261                                                    } else if (reader.getLocalName().equals("description")) {
262    //                                                      elementContent.setLength(0);
263    //                                                      collectChar = true;
264                                                    } else if (reader.getLocalName().equals("format")) {
265    //                                                      elementContent.setLength(0);
266    //                                                      collectChar = true;
267                                                    } else if (reader.getLocalName().equals("history")) {
268    //                                                      elementContent.setLength(0);
269    //                                                      collectChar = true;
270                                                    } 
271                                            }
272                                    } else if (event == XMLStreamConstants.END_ELEMENT) {
273                                            if (reader.getLocalName().length() == 0) {
274                                                    continue;
275                                            }
276                                            if (reader.getLocalName().charAt(0) == 'e') {
277                                                    // e -> edge, edgelabel
278                                            } else if (reader.getLocalName().charAt(0) == 'n') {
279                                                    // n -> nt, nonterminals, name
280                                                    if (reader.getLocalName().equals("nt")) {
281                                                            ntid.setLength(0);
282                                                    }
283                                                    else if (reader.getLocalName().equals("nonterminals")) {
284                                                            if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 &&((NonTerminalNode)phraseStructure.getPhraseStructureRoot()).nChildren() == 0) {
285                                                                    Edge e = phraseStructure.addPhraseStructureEdge(phraseStructure.getPhraseStructureRoot(), phraseStructure.getTokenNode(1));
286                                                                    SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
287                                                                    for (String name : inputTables.keySet()) {
288                                                                            e.addLabel(inputTables.get(name), "--");
289                                                                    }
290                                                            }
291                                                    }
292    //                                              else if (reader.getLocalName().equals("name")) {
293    //                                                      if (header != null) {
294    //                                                              header.setMetaName(elementContent.toString());
295    //                                                      }
296    //                                                      collectChar = false;
297    //                                              }
298                                            } else if (reader.getLocalName().charAt(0) == 't') {
299                                                    // t -> t, terminals
300                                            } else if (reader.getLocalName().charAt(0) == 's') {
301                                                    // s -> subcorpus, secedge, s, secedgelabel
302                                                    if (reader.getLocalName().equals("s")) {
303                                                            if (syntaxGraph.hasTokens()) {
304                                                                    sentenceCount++;
305                                                            }
306                                                            if (syntaxGraph instanceof MappablePhraseStructureGraph) {
307                                                                    ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
308                                                            }
309                                                            return true;
310                                                    }
311                                            } else if (reader.getLocalName().charAt(0) == 'v') {
312                                                    // v -> variable, value
313    //                                              if (reader.getLocalName().equals("value")) {
314    //                                                      if (header != null) {
315    //                                                              if (domain == Domain.T || domain == Domain.NT || domain == Domain.FREC) {
316    //                                                                      header.addFeatureValue(currentFeatureName.toString(), valueName.toString(), elementContent.toString());
317    //                                                              } else if (domain == Domain.EL) {
318    //                                                                      header.addEdgeLabelValue(valueName.toString(), elementContent.toString());
319    //                                                              } else if (domain == Domain.SEL) {
320    //                                                                      header.addSecEdgeLabelValue(valueName.toString(), elementContent.toString());
321    //                                                              }
322    //                                                      }
323    //                                                      collectChar = false;
324    //                                              }
325                                            } else {
326    //                                               a -> annotation, author
327    //                                               b -> body
328    //                                               c -> corpus
329    //                                               d -> date, description,
330    //                                               f -> feature, format
331    //                                               g -> graph
332    //                                               h -> head, history
333    //                                               m -> matches, match
334                                                    if (reader.getLocalName().equals("body")) {
335                                                            //sentence = dataStructures.getSentence();
336                                                            //phraseTree = dataStructures.getInPhraseTree();
337                                                            //sentence.clear();
338                                                            //phraseTree.clear();
339                                                            //dataStructures.setLastProcessObject(true);
340                                                    }  else if (reader.getLocalName().equals("author")) {
341    //                                                      if (header != null) {
342    //                                                              header.setMetaAuthor(elementContent.toString());
343    //                                                      }
344    //                                                      collectChar = false;
345                                                    } else if (reader.getLocalName().equals("date")) {
346    //                                                      if (header != null) {
347    //                                                              header.setMetaInDate(elementContent.toString());
348    //                                                      }
349    //                                                      collectChar = false;
350                                                    } else if (reader.getLocalName().equals("description")) {
351    //                                                      if (header != null) {
352    //                                                              header.setMetaDescription(elementContent.toString());
353    //                                                      }
354    //                                                      collectChar = false;
355                                                    } else if (reader.getLocalName().equals("format")) {
356    //                                                      if (header != null) {
357    //                                                              header.setMetaFormat(elementContent.toString());
358    //                                                      }
359    //                                                      collectChar = false;
360                                                    } else if (reader.getLocalName().equals("history")) {
361    //                                                      if (header != null) {
362    //                                                              header.setMetaHistory(elementContent.toString());
363    //                                                      }
364    //                                                      collectChar = false;
365                                                    } /* else if (reader.getLocalName().equals("annotation")) {
366                                                            if (header != null) {
367                                                                    System.out.println(header.toTigerXML());
368                                                            }
369                                                            collectChar = false;
370                                                    } */
371                                            }                               
372                                    } else if (event == XMLStreamConstants.END_DOCUMENT) {
373                                            if (syntaxGraph.hasTokens()) {
374                                                    sentenceCount++;
375                                            }
376                                            if (cIterations < nIterations) {
377                                                    cIterations++;
378                                                    reopen();
379                                                    return true;
380                                            }
381                                            return false;
382                                    } else if (event == XMLStreamConstants.CHARACTERS) {
383    //                                      if (collectChar) {
384    //                                              char[] ch = reader.getTextCharacters();
385    //                                              final int size = reader.getTextStart()+reader.getTextLength();
386    //                                              for (int i = reader.getTextStart(); i < size; i++) {
387    //                                                      elementContent.append(ch[i]);
388    //                                              }
389    //                                      }
390                                    }
391                            }
392                    } catch (XMLStreamException e) {
393                            throw new DataFormatException("", e);
394                    }
395            }
396            
397            public int getSentenceCount() {
398                    return sentenceCount;
399            }
400    
401            public void setSentenceCount(int sentenceCount) {
402                    this.sentenceCount = sentenceCount;
403            }
404            
405            public XMLStreamReader getReader() {
406                    return reader;
407            }
408    
409            public void setReader(XMLStreamReader reader) {
410                    this.reader = reader;
411            }
412            
413            public void readEpilog() throws MaltChainedException {
414                    
415            }
416            
417            public void close() throws MaltChainedException {
418                    try {
419                            if (reader != null) {
420                                    reader.close();
421                                    reader = null;
422                            }
423                    } catch (XMLStreamException e) {
424                            throw new DataFormatException("The XML input file could be closed. ", e);
425                    }
426            }
427    
428            public DataFormatInstance getDataFormatInstance() {
429                    return dataFormatInstance;
430            }
431            
432            public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
433                    this.dataFormatInstance = inputDataFormatInstance;
434            }
435            
436            public String getOptions() {
437                    return optionString;
438            }
439            
440            public void setOptions(String optionString) throws MaltChainedException {
441                    this.optionString = optionString;
442                    String[] argv;
443                    try {
444                            argv = optionString.split("[_\\p{Blank}]");
445                    } catch (PatternSyntaxException e) {
446                            throw new DataFormatException("Could not split the TigerXML reader option '"+optionString+"'. ", e);
447                    }
448                    for (int i=0; i < argv.length-1; i++) {
449                            if(argv[i].charAt(0) != '-') {
450                                    throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
451                            }
452                            if(++i>=argv.length) {
453                                    throw new DataFormatException("The last argument does not have any value. ");
454                            }
455                            switch(argv[i-1].charAt(1)) {
456                            case 's': 
457                                    try {
458                                            START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
459                                    } catch (NumberFormatException e){
460                                            throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
461                                    }
462                                    break;
463                            default:
464                                    throw new DataFormatException("Unknown TigerXMLReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");          
465                            }
466                    }
467            }
468            
469            public String getFileName() {
470                    return fileName;
471            }
472    
473            public void setFileName(String fileName) {
474                    this.fileName = fileName;
475            }
476    
477            public URL getUrl() {
478                    return url;
479            }
480    
481            public void setUrl(URL url) {
482                    this.url = url;
483            }
484    
485            public String getCharsetName() {
486                    return charsetName;
487            }
488    
489            public void setCharsetName(String charsetName) {
490                    this.charsetName = charsetName;
491            }
492    
493            public int getNIterations() {
494                    return nIterations;
495            }
496    
497            public void setNIterations(int iterations) {
498                    nIterations = iterations;
499            }
500    
501            public int getIterationCounter() {
502                    return cIterations;
503            }
504    //      public TigerXMLHeader getHeader() {
505    //              return header;
506    //      }
507    //      
508    //      public void setHeader(TigerXMLHeader header) {
509    //              this.header = header;
510    //      }
511    }