001    package org.maltparser.core.syntaxgraph.reader;
002    
003    import java.io.BufferedReader;
004    import java.io.FileInputStream;
005    import java.io.FileNotFoundException;
006    import java.io.IOException;
007    import java.io.InputStream;
008    import java.io.InputStreamReader;
009    import java.io.UnsupportedEncodingException;
010    import java.net.URL;
011    import java.util.SortedMap;
012    import java.util.regex.PatternSyntaxException;
013    
014    import javax.xml.stream.XMLInputFactory;
015    import javax.xml.stream.XMLStreamConstants;
016    import javax.xml.stream.XMLStreamException;
017    import javax.xml.stream.XMLStreamReader;
018    
019    import org.maltparser.core.exception.MaltChainedException;
020    import org.maltparser.core.io.dataformat.DataFormatException;
021    import org.maltparser.core.io.dataformat.DataFormatInstance;
022    import org.maltparser.core.symbol.SymbolTable;
023    import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
024    import org.maltparser.core.syntaxgraph.PhraseStructure;
025    import org.maltparser.core.syntaxgraph.SyntaxGraphException;
026    import org.maltparser.core.syntaxgraph.TokenStructure;
027    import org.maltparser.core.syntaxgraph.edge.Edge;
028    import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
029    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
030    
031    /**
032    *
033    *
034    * @author Johan Hall
035    */
036    public class TigerXMLReader implements SyntaxGraphReader {
037    //      private TigerXMLHeader header;
038            private XMLStreamReader reader;
039            private int sentenceCount;
040            private DataFormatInstance dataFormatInstance;
041            private StringBuffer ntid;
042            private final StringBuilder graphRootID;
043    //      private StringBuilder elementContent; 
044    //      private StringBuilder valueName;
045    //      private StringBuilder currentFeatureName;
046    //      private Domain domain;
047    //      private boolean collectChar = false;
048            private String optionString;
049            private int START_ID_OF_NONTERMINALS = 500;
050            
051            
052            public TigerXMLReader() {
053                    this.ntid = new StringBuffer();
054    //              elementContent = new StringBuilder();
055    //              valueName = new StringBuilder();
056    //              currentFeatureName = new StringBuilder(); 
057                    graphRootID = new StringBuilder(); 
058            }
059            
060            public void open(String fileName, String charsetName) throws MaltChainedException {
061                    try {
062                            open(new FileInputStream(fileName), charsetName);
063                    }catch (FileNotFoundException e) {
064                            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
065                    }
066            }
067            public void open(URL url, String charsetName) throws MaltChainedException {
068                    try {
069                            open(url.openStream(), charsetName);
070                    } catch (IOException e) {
071                            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
072                    }
073            }
074            
075            public void open(InputStream is, String charsetName) throws MaltChainedException {
076                    try {
077                            open(new InputStreamReader(is, charsetName));
078                    } catch (UnsupportedEncodingException e) {
079                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
080                    }
081            }
082            
083            public void open(InputStreamReader isr) throws MaltChainedException {
084                    try {
085                            XMLInputFactory factory = XMLInputFactory.newInstance();
086                            setReader(factory.createXMLStreamReader(new BufferedReader(isr)));
087                    } catch (XMLStreamException e) {
088                            throw new DataFormatException("XML input file could be opened. ", e);
089                    } 
090                    setSentenceCount(0);
091            }
092            
093            public void readProlog() throws MaltChainedException {
094                    
095            }
096            
097            public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
098                    if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) {
099                            return false;
100                    }
101                    syntaxGraph.clear();
102                    final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
103                    PhraseStructureNode parent = null;
104                    PhraseStructureNode child = null;
105    //              if (header == null) {
106    //                      header = new TigerXMLHeader(syntaxGraph.getSymbolTables());
107    //              }
108    
109                    try {
110                            while (true) {
111                                    int event = reader.next();
112                                    if (event == XMLStreamConstants.START_ELEMENT) {
113                                            if (reader.getLocalName().length() == 0) {
114                                                    continue;
115                                            }
116                                            if (reader.getLocalName().charAt(0) == 'e') {
117                                                    // e -> edge, edgelabel
118                                                    if (reader.getLocalName().length() == 4) { //edge
119                                                            int childid = -1;
120                                                            int indexSep = reader.getAttributeValue(null, "idref").indexOf('_');
121                                                            
122                                                            try {
123                                                                    if (indexSep != -1) {
124                                                                            childid = Integer.parseInt(reader.getAttributeValue(null, "idref").substring(indexSep+1));
125                                                                    } else {
126                                                                            childid = Integer.parseInt(reader.getAttributeValue(null, "idref"));
127                                                                    }
128                                                                    if (childid == -1) {
129                                                                            throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+reader.getAttributeValue(null, "idref")+"' of the edge element. ");
130                                                                    }
131                                                            } catch (NumberFormatException e) {
132                                                                    throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+reader.getAttributeValue(null, "idref")+"' of the edge element. ");
133                                                            }
134    
135                                                            if (childid < START_ID_OF_NONTERMINALS) {
136                                                                    child = phraseStructure.getTokenNode(childid);
137                                                            } else {
138    
139                                                                    child = phraseStructure.getNonTerminalNode(childid-START_ID_OF_NONTERMINALS+1);
140                                                            }
141    
142                                                            Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
143                                                            SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
144                                                            for (String name : inputTables.keySet()) {
145                                                                    e.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
146                                                            }
147                                                    } else if (reader.getLocalName().equals("edgelabel")) { // edgelabel
148    //                                                      domain = Domain.EL;
149                                                    }
150                                            } else if (reader.getLocalName().charAt(0) == 'n') {
151                                                    // n -> nt, nonterminals, name
152                                                    if (reader.getLocalName().length() == 2) { // nt
153                                                            final String id = reader.getAttributeValue(null, "id");
154                                                            if (graphRootID.length() == id.length() && graphRootID.toString().equals(id)) {
155                                                                    parent = phraseStructure.getPhraseStructureRoot();
156                                                            } else {
157                                                                    int index = id.indexOf('_');
158                                                                    if (index != -1) {
159                                                                            parent = phraseStructure.addNonTerminalNode(Integer.parseInt(id.substring(index+1))-START_ID_OF_NONTERMINALS+1);
160                                                                    }
161                                                            }
162                                                            SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureNodeLabelSymbolTables();
163                                                            for (String name : inputTables.keySet()) {
164                                                                    parent.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
165                                                            }
166                                                    } else if (reader.getLocalName().equals("name")) { // name
167    //                                                      elementContent.setLength(0);
168    //                                                      collectChar = true;
169                                                    }
170                                            } else if (reader.getLocalName().charAt(0) == 't') {
171                                                    // t -> t, terminals
172                                                    if (reader.getLocalName().length() == 1) { // t
173                                                            SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getInputSymbolTables();
174                                                            child = syntaxGraph.addTokenNode();
175                                                            for (String name : inputTables.keySet()) {
176                                                                    child.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
177                                                            }
178                                                    }
179                                            } else if (reader.getLocalName().charAt(0) == 's') {
180                                                    // s -> subcorpus, secedge, s, secedgelabel
181                                                    if (reader.getLocalName().length() == 1) { // s
182                                                            String id = reader.getAttributeValue(null, "id");
183                                                            boolean indexable = false;
184                                                            int index = -1;
185                                                            if (id != null && id.length() > 0) {
186                                                                    for (int i = 0, n = id.length(); i < n; i++) {
187                                                                            if (Character.isDigit(id.charAt(i))) {
188                                                                                    if (index == -1) { 
189                                                                                            index = i;
190                                                                                    }
191                                                                                    indexable = true;
192                                                                            }
193                                                                    }
194                                                            }
195                                                            if (indexable) {
196                                                                    phraseStructure.setSentenceID(Integer.parseInt(id.substring(index)));
197                                                            } else {
198                                                                    phraseStructure.setSentenceID(sentenceCount+1);
199                                                            }
200                                                    }
201                                            } else if (reader.getLocalName().charAt(0) == 'v') {
202                                                    // v -> variable, value
203    //                                              if (reader.getLocalName().equals("value")) {
204    //                                                      valueName.setLength(0);
205    //                                                      valueName.append(reader.getAttributeValue(null, "name"));
206    //                                                      elementContent.setLength(0);
207    //                                                      collectChar = true;
208    //                                              }
209                                            } else {
210    //                                               a -> annotation, author
211    //                                               b -> body
212    //                                               c -> corpus
213    //                                               d -> date, description,
214    //                                               f -> feature, format
215    //                                               g -> graph
216    //                                               h -> head, history
217    //                                               m -> matches, match
218                                                    if (reader.getLocalName().equals("graph")) {
219                                                            graphRootID.setLength(0);
220                                                            graphRootID.append(reader.getAttributeValue(null, "root"));
221                                                    } else  if (reader.getLocalName().equals("corpus")) {
222    //                                                      header.setCorpusID(reader.getAttributeValue(null, "id"));
223    //                                                      header.setCorpusID(reader.getAttributeValue(null, "version"));
224                                                    } else if (reader.getLocalName().equals("feature")) {
225    //                                                      if (header != null) {
226    //                                                              currentFeatureName.setLength(0);
227    //                                                              currentFeatureName.append(reader.getAttributeValue(null, "name"));
228    //                                                              header.addFeature(reader.getAttributeValue(null, "name"), reader.getAttributeValue(null, "domain"));
229    //                                                      }
230    //                                                      domain = Domain.valueOf(reader.getAttributeValue(null, "domain"));
231                                                    } else if (reader.getLocalName().equals("secedgelabel")) {
232    //                                                      domain = Domain.SEL;
233                                                    } else if (reader.getLocalName().equals("author")) {
234    //                                                      elementContent.setLength(0);
235    //                                                      collectChar = true;
236                                                    } else if (reader.getLocalName().equals("date")) {
237    //                                                      elementContent.setLength(0);
238    //                                                      collectChar = true;
239                                                    } else if (reader.getLocalName().equals("description")) {
240    //                                                      elementContent.setLength(0);
241    //                                                      collectChar = true;
242                                                    } else if (reader.getLocalName().equals("format")) {
243    //                                                      elementContent.setLength(0);
244    //                                                      collectChar = true;
245                                                    } else if (reader.getLocalName().equals("history")) {
246    //                                                      elementContent.setLength(0);
247    //                                                      collectChar = true;
248                                                    } 
249                                            }
250                                    } else if (event == XMLStreamConstants.END_ELEMENT) {
251                                            if (reader.getLocalName().length() == 0) {
252                                                    continue;
253                                            }
254                                            if (reader.getLocalName().charAt(0) == 'e') {
255                                                    // e -> edge, edgelabel
256                                            } else if (reader.getLocalName().charAt(0) == 'n') {
257                                                    // n -> nt, nonterminals, name
258                                                    if (reader.getLocalName().equals("nt")) {
259                                                            ntid.setLength(0);
260                                                    }
261                                                    else if (reader.getLocalName().equals("nonterminals")) {
262                                                            if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 &&((NonTerminalNode)phraseStructure.getPhraseStructureRoot()).nChildren() == 0) {
263                                                                    Edge e = phraseStructure.addPhraseStructureEdge(phraseStructure.getPhraseStructureRoot(), phraseStructure.getTokenNode(1));
264                                                                    SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
265                                                                    for (String name : inputTables.keySet()) {
266                                                                            e.addLabel(inputTables.get(name), "--");
267                                                                    }
268                                                            }
269                                                    }
270    //                                              else if (reader.getLocalName().equals("name")) {
271    //                                                      if (header != null) {
272    //                                                              header.setMetaName(elementContent.toString());
273    //                                                      }
274    //                                                      collectChar = false;
275    //                                              }
276                                            } else if (reader.getLocalName().charAt(0) == 't') {
277                                                    // t -> t, terminals
278                                            } else if (reader.getLocalName().charAt(0) == 's') {
279                                                    // s -> subcorpus, secedge, s, secedgelabel
280                                                    if (reader.getLocalName().equals("s")) {
281                                                            if (syntaxGraph.hasTokens()) {
282                                                                    sentenceCount++;
283                                                            }
284                                                            if (syntaxGraph instanceof MappablePhraseStructureGraph) {
285                                                                    ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
286                                                            }
287                                                            return true;
288                                                    }
289                                            } else if (reader.getLocalName().charAt(0) == 'v') {
290                                                    // v -> variable, value
291    //                                              if (reader.getLocalName().equals("value")) {
292    //                                                      if (header != null) {
293    //                                                              if (domain == Domain.T || domain == Domain.NT || domain == Domain.FREC) {
294    //                                                                      header.addFeatureValue(currentFeatureName.toString(), valueName.toString(), elementContent.toString());
295    //                                                              } else if (domain == Domain.EL) {
296    //                                                                      header.addEdgeLabelValue(valueName.toString(), elementContent.toString());
297    //                                                              } else if (domain == Domain.SEL) {
298    //                                                                      header.addSecEdgeLabelValue(valueName.toString(), elementContent.toString());
299    //                                                              }
300    //                                                      }
301    //                                                      collectChar = false;
302    //                                              }
303                                            } else {
304    //                                               a -> annotation, author
305    //                                               b -> body
306    //                                               c -> corpus
307    //                                               d -> date, description,
308    //                                               f -> feature, format
309    //                                               g -> graph
310    //                                               h -> head, history
311    //                                               m -> matches, match
312                                                    if (reader.getLocalName().equals("body")) {
313                                                            //sentence = dataStructures.getSentence();
314                                                            //phraseTree = dataStructures.getInPhraseTree();
315                                                            //sentence.clear();
316                                                            //phraseTree.clear();
317                                                            //dataStructures.setLastProcessObject(true);
318                                                    }  else if (reader.getLocalName().equals("author")) {
319    //                                                      if (header != null) {
320    //                                                              header.setMetaAuthor(elementContent.toString());
321    //                                                      }
322    //                                                      collectChar = false;
323                                                    } else if (reader.getLocalName().equals("date")) {
324    //                                                      if (header != null) {
325    //                                                              header.setMetaInDate(elementContent.toString());
326    //                                                      }
327    //                                                      collectChar = false;
328                                                    } else if (reader.getLocalName().equals("description")) {
329    //                                                      if (header != null) {
330    //                                                              header.setMetaDescription(elementContent.toString());
331    //                                                      }
332    //                                                      collectChar = false;
333                                                    } else if (reader.getLocalName().equals("format")) {
334    //                                                      if (header != null) {
335    //                                                              header.setMetaFormat(elementContent.toString());
336    //                                                      }
337    //                                                      collectChar = false;
338                                                    } else if (reader.getLocalName().equals("history")) {
339    //                                                      if (header != null) {
340    //                                                              header.setMetaHistory(elementContent.toString());
341    //                                                      }
342    //                                                      collectChar = false;
343                                                    } /* else if (reader.getLocalName().equals("annotation")) {
344                                                            if (header != null) {
345                                                                    System.out.println(header.toTigerXML());
346                                                            }
347                                                            collectChar = false;
348                                                    } */
349                                            }                               
350                                    } else if (event == XMLStreamConstants.END_DOCUMENT) {
351                                            if (syntaxGraph.hasTokens()) {
352                                                    sentenceCount++;
353                                            }
354                                            return false;
355                                    } else if (event == XMLStreamConstants.CHARACTERS) {
356    //                                      if (collectChar) {
357    //                                              char[] ch = reader.getTextCharacters();
358    //                                              final int size = reader.getTextStart()+reader.getTextLength();
359    //                                              for (int i = reader.getTextStart(); i < size; i++) {
360    //                                                      elementContent.append(ch[i]);
361    //                                              }
362    //                                      }
363                                    }
364                            }
365                    } catch (XMLStreamException e) {
366                            throw new DataFormatException("", e);
367                    }
368            }
369            
370            public int getSentenceCount() {
371                    return sentenceCount;
372            }
373    
374            public void setSentenceCount(int sentenceCount) {
375                    this.sentenceCount = sentenceCount;
376            }
377            
378            public XMLStreamReader getReader() {
379                    return reader;
380            }
381    
382            public void setReader(XMLStreamReader reader) {
383                    this.reader = reader;
384            }
385            
386            public void readEpilog() throws MaltChainedException {
387                    
388            }
389            
390            public void close() throws MaltChainedException {
391                    try {
392                            if (reader != null) {
393                                    reader.close();
394                                    reader = null;
395                            }
396                    } catch (XMLStreamException e) {
397                            throw new DataFormatException("The XML input file could be closed. ", e);
398                    }
399            }
400    
401            public DataFormatInstance getDataFormatInstance() {
402                    return dataFormatInstance;
403            }
404            
405            public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
406                    this.dataFormatInstance = inputDataFormatInstance;
407            }
408            
409            public String getOptions() {
410                    return optionString;
411            }
412            
413            public void setOptions(String optionString) throws MaltChainedException {
414                    this.optionString = optionString;
415                    String[] argv;
416                    try {
417                            argv = optionString.split("[_\\p{Blank}]");
418                    } catch (PatternSyntaxException e) {
419                            throw new DataFormatException("Could not split the TigerXML reader option '"+optionString+"'. ", e);
420                    }
421                    for (int i=0; i < argv.length-1; i++) {
422                            if(argv[i].charAt(0) != '-') {
423                                    throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
424                            }
425                            if(++i>=argv.length) {
426                                    throw new DataFormatException("The last argument does not have any value. ");
427                            }
428                            switch(argv[i-1].charAt(1)) {
429                            case 's': 
430                                    try {
431                                            START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
432                                    } catch (NumberFormatException e){
433                                            throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
434                                    }
435                                    break;
436                            default:
437                                    throw new DataFormatException("Unknown TigerXMLReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");          
438                            }
439                    }
440            }
441            
442    //      public TigerXMLHeader getHeader() {
443    //              return header;
444    //      }
445    //      
446    //      public void setHeader(TigerXMLHeader header) {
447    //              this.header = header;
448    //      }
449    }