001package org.maltparser.core.syntaxgraph.reader;
002
003import java.io.BufferedReader;
004import java.io.FileInputStream;
005import java.io.FileNotFoundException;
006import java.io.IOException;
007import java.io.InputStream;
008import java.io.InputStreamReader;
009import java.io.UnsupportedEncodingException;
010import java.net.URL;
011import java.util.SortedMap;
012import java.util.regex.PatternSyntaxException;
013
014import javax.xml.stream.XMLInputFactory;
015import javax.xml.stream.XMLStreamConstants;
016import javax.xml.stream.XMLStreamException;
017import javax.xml.stream.XMLStreamReader;
018
019import org.maltparser.core.exception.MaltChainedException;
020import org.maltparser.core.io.dataformat.DataFormatException;
021import org.maltparser.core.io.dataformat.DataFormatInstance;
022import org.maltparser.core.symbol.SymbolTable;
023import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
024import org.maltparser.core.syntaxgraph.PhraseStructure;
025import org.maltparser.core.syntaxgraph.SyntaxGraphException;
026import org.maltparser.core.syntaxgraph.TokenStructure;
027import org.maltparser.core.syntaxgraph.edge.Edge;
028import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
029import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
030
031/**
032*
033*
034* @author Johan Hall
035*/
036public class TigerXMLReader implements SyntaxGraphReader {
037//      private TigerXMLHeader header;
038        private XMLStreamReader reader;
039        private int sentenceCount;
040        private DataFormatInstance dataFormatInstance;
041        private StringBuffer ntid;
042        private final StringBuilder graphRootID;
043//      private StringBuilder elementContent; 
044//      private StringBuilder valueName;
045//      private StringBuilder currentFeatureName;
046//      private Domain domain;
047//      private boolean collectChar = false;
048        private String optionString;
049        private String fileName = null;
050        private URL url = null;
051        private String charsetName;
052        private int nIterations;
053        private int cIterations;
054        private int START_ID_OF_NONTERMINALS = 500;
055        private boolean closeStream = true;
056        
057        public TigerXMLReader() {
058                this.ntid = new StringBuffer();
059//              elementContent = new StringBuilder();
060//              valueName = new StringBuilder();
061//              currentFeatureName = new StringBuilder(); 
062                graphRootID = new StringBuilder(); 
063                nIterations = 1;
064                cIterations = 1;
065        }
066        
067        private void reopen() throws MaltChainedException {
068                close();
069                if (fileName != null) {
070                        open(fileName, charsetName);
071                } else if (url != null) {
072                        open(url, charsetName);
073                } else {
074                        throw new DataFormatException("The input stream cannot be reopen. ");
075                }
076        }
077        
078        public void open(String fileName, String charsetName) throws MaltChainedException {
079                setFileName(fileName);
080                setCharsetName(charsetName);
081                try {
082                        open(new FileInputStream(fileName), charsetName);
083                }catch (FileNotFoundException e) {
084                        throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
085                }
086        }
087        public void open(URL url, String charsetName) throws MaltChainedException {
088                setUrl(url);
089                setCharsetName(charsetName);
090                try {
091                        open(url.openStream(), charsetName);
092                } catch (IOException e) {
093                        throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
094                }
095        }
096        
097        public void open(InputStream is, String charsetName) throws MaltChainedException {
098                try {
099                        if (is == System.in) {
100                                closeStream = false;
101                        }
102                        open(new InputStreamReader(is, charsetName));
103                } catch (UnsupportedEncodingException e) {
104                        throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
105                }
106        }
107        
108        private void open(InputStreamReader isr) throws MaltChainedException {
109                try {
110                        XMLInputFactory factory = XMLInputFactory.newInstance();
111                        setReader(factory.createXMLStreamReader(new BufferedReader(isr)));
112                } catch (XMLStreamException e) {
113                        throw new DataFormatException("XML input file could be opened. ", e);
114                } 
115                setSentenceCount(0);
116        }
117        
118        public void readProlog() throws MaltChainedException {
119                
120        }
121        
122        public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
123                if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) {
124                        return false;
125                }
126                syntaxGraph.clear();
127                final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
128                PhraseStructureNode parent = null;
129                PhraseStructureNode child = null;
130//              if (header == null) {
131//                      header = new TigerXMLHeader(syntaxGraph.getSymbolTables());
132//              }
133                
134                try {
135                        while (true) {
136                                int event = reader.next();
137                                if (event == XMLStreamConstants.START_ELEMENT) {
138                                        if (reader.getLocalName().length() == 0) {
139                                                continue;
140                                        }
141                                        if (reader.getLocalName().charAt(0) == 'e') {
142                                                // e -> edge, edgelabel
143                                                if (reader.getLocalName().length() == 4) { //edge
144                                                        int childid = -1;
145                                                        int indexSep = reader.getAttributeValue(null, "idref").indexOf('_');
146                                                        
147                                                        try {
148                                                                if (indexSep != -1) {
149                                                                        childid = Integer.parseInt(reader.getAttributeValue(null, "idref").substring(indexSep+1));
150                                                                } else {
151                                                                        childid = Integer.parseInt(reader.getAttributeValue(null, "idref"));
152                                                                }
153                                                                if (childid == -1) {
154                                                                        throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+reader.getAttributeValue(null, "idref")+"' of the edge element. ");
155                                                                }
156                                                        } catch (NumberFormatException e) {
157                                                                throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+reader.getAttributeValue(null, "idref")+"' of the edge element. ");
158                                                        }
159
160                                                        if (childid < START_ID_OF_NONTERMINALS) {
161                                                                child = phraseStructure.getTokenNode(childid);
162                                                        } else {
163
164                                                                child = phraseStructure.getNonTerminalNode(childid-START_ID_OF_NONTERMINALS+1);
165                                                        }
166
167                                                        Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
168                                                        SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables(phraseStructure.getSymbolTables());
169                                                        for (String name : inputTables.keySet()) {
170                                                                e.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
171                                                        }
172                                                } else if (reader.getLocalName().equals("edgelabel")) { // edgelabel
173//                                                      domain = Domain.EL;
174                                                }
175                                        } else if (reader.getLocalName().charAt(0) == 'n') {
176                                                // n -> nt, nonterminals, name
177                                                if (reader.getLocalName().length() == 2) { // nt
178                                                        final String id = reader.getAttributeValue(null, "id");
179                                                        if (graphRootID.length() == id.length() && graphRootID.toString().equals(id)) {
180                                                                parent = phraseStructure.getPhraseStructureRoot();
181                                                        } else {
182                                                                int index = id.indexOf('_');
183                                                                if (index != -1) {
184                                                                        parent = phraseStructure.addNonTerminalNode(Integer.parseInt(id.substring(index+1))-START_ID_OF_NONTERMINALS+1);
185                                                                }
186                                                        }
187                                                        SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureNodeLabelSymbolTables(phraseStructure.getSymbolTables());
188                                                        for (String name : inputTables.keySet()) {
189                                                                parent.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
190                                                        }
191                                                } else if (reader.getLocalName().equals("name")) { // name
192//                                                      elementContent.setLength(0);
193//                                                      collectChar = true;
194                                                }
195                                        } else if (reader.getLocalName().charAt(0) == 't') {
196                                                // t -> t, terminals
197                                                if (reader.getLocalName().length() == 1) { // t
198                                                        SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getInputSymbolTables(phraseStructure.getSymbolTables());
199                                                        child = syntaxGraph.addTokenNode();
200                                                        for (String name : inputTables.keySet()) {
201                                                                child.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
202                                                        }
203                                                }
204                                        } else if (reader.getLocalName().charAt(0) == 's') {
205                                                // s -> subcorpus, secedge, s, secedgelabel
206                                                if (reader.getLocalName().length() == 1) { // s
207                                                        String id = reader.getAttributeValue(null, "id");
208                                                        boolean indexable = false;
209                                                        int index = -1;
210                                                        if (id != null && id.length() > 0) {
211                                                                for (int i = 0, n = id.length(); i < n; i++) {
212                                                                        if (Character.isDigit(id.charAt(i))) {
213                                                                                if (index == -1) { 
214                                                                                        index = i;
215                                                                                }
216                                                                                indexable = true;
217                                                                        }
218                                                                }
219                                                        }
220                                                        if (indexable) {
221                                                                phraseStructure.setSentenceID(Integer.parseInt(id.substring(index)));
222                                                        } else {
223                                                                phraseStructure.setSentenceID(sentenceCount+1);
224                                                        }
225                                                }
226                                        } else if (reader.getLocalName().charAt(0) == 'v') {
227                                                // v -> variable, value
228//                                              if (reader.getLocalName().equals("value")) {
229//                                                      valueName.setLength(0);
230//                                                      valueName.append(reader.getAttributeValue(null, "name"));
231//                                                      elementContent.setLength(0);
232//                                                      collectChar = true;
233//                                              }
234                                        } else {
235//                                               a -> annotation, author
236//                                               b -> body
237//                                               c -> corpus
238//                                               d -> date, description,
239//                                               f -> feature, format
240//                                               g -> graph
241//                                               h -> head, history
242//                                               m -> matches, match
243                                                if (reader.getLocalName().equals("graph")) {
244                                                        graphRootID.setLength(0);
245                                                        graphRootID.append(reader.getAttributeValue(null, "root"));
246                                                } else  if (reader.getLocalName().equals("corpus")) {
247//                                                      header.setCorpusID(reader.getAttributeValue(null, "id"));
248//                                                      header.setCorpusID(reader.getAttributeValue(null, "version"));
249                                                } else if (reader.getLocalName().equals("feature")) {
250//                                                      if (header != null) {
251//                                                              currentFeatureName.setLength(0);
252//                                                              currentFeatureName.append(reader.getAttributeValue(null, "name"));
253//                                                              header.addFeature(reader.getAttributeValue(null, "name"), reader.getAttributeValue(null, "domain"));
254//                                                      }
255//                                                      domain = Domain.valueOf(reader.getAttributeValue(null, "domain"));
256                                                } else if (reader.getLocalName().equals("secedgelabel")) {
257//                                                      domain = Domain.SEL;
258                                                } else if (reader.getLocalName().equals("author")) {
259//                                                      elementContent.setLength(0);
260//                                                      collectChar = true;
261                                                } else if (reader.getLocalName().equals("date")) {
262//                                                      elementContent.setLength(0);
263//                                                      collectChar = true;
264                                                } else if (reader.getLocalName().equals("description")) {
265//                                                      elementContent.setLength(0);
266//                                                      collectChar = true;
267                                                } else if (reader.getLocalName().equals("format")) {
268//                                                      elementContent.setLength(0);
269//                                                      collectChar = true;
270                                                } else if (reader.getLocalName().equals("history")) {
271//                                                      elementContent.setLength(0);
272//                                                      collectChar = true;
273                                                } 
274                                        }
275                                } else if (event == XMLStreamConstants.END_ELEMENT) {
276                                        if (reader.getLocalName().length() == 0) {
277                                                continue;
278                                        }
279                                        if (reader.getLocalName().charAt(0) == 'e') {
280                                                // e -> edge, edgelabel
281                                        } else if (reader.getLocalName().charAt(0) == 'n') {
282                                                // n -> nt, nonterminals, name
283                                                if (reader.getLocalName().equals("nt")) {
284                                                        ntid.setLength(0);
285                                                }
286                                                else if (reader.getLocalName().equals("nonterminals")) {
287                                                        if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 &&((NonTerminalNode)phraseStructure.getPhraseStructureRoot()).nChildren() == 0) {
288                                                                Edge e = phraseStructure.addPhraseStructureEdge(phraseStructure.getPhraseStructureRoot(), phraseStructure.getTokenNode(1));
289                                                                SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables(phraseStructure.getSymbolTables());
290                                                                for (String name : inputTables.keySet()) {
291                                                                        e.addLabel(inputTables.get(name), "--");
292                                                                }
293                                                        }
294                                                }
295//                                              else if (reader.getLocalName().equals("name")) {
296//                                                      if (header != null) {
297//                                                              header.setMetaName(elementContent.toString());
298//                                                      }
299//                                                      collectChar = false;
300//                                              }
301                                        } else if (reader.getLocalName().charAt(0) == 't') {
302                                                // t -> t, terminals
303                                        } else if (reader.getLocalName().charAt(0) == 's') {
304                                                // s -> subcorpus, secedge, s, secedgelabel
305                                                if (reader.getLocalName().equals("s")) {
306                                                        if (syntaxGraph.hasTokens()) {
307                                                                sentenceCount++;
308                                                        }
309                                                        if (syntaxGraph instanceof MappablePhraseStructureGraph) {
310                                                                ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
311                                                        }
312                                                        return true;
313                                                }
314                                        } else if (reader.getLocalName().charAt(0) == 'v') {
315                                                // v -> variable, value
316//                                              if (reader.getLocalName().equals("value")) {
317//                                                      if (header != null) {
318//                                                              if (domain == Domain.T || domain == Domain.NT || domain == Domain.FREC) {
319//                                                                      header.addFeatureValue(currentFeatureName.toString(), valueName.toString(), elementContent.toString());
320//                                                              } else if (domain == Domain.EL) {
321//                                                                      header.addEdgeLabelValue(valueName.toString(), elementContent.toString());
322//                                                              } else if (domain == Domain.SEL) {
323//                                                                      header.addSecEdgeLabelValue(valueName.toString(), elementContent.toString());
324//                                                              }
325//                                                      }
326//                                                      collectChar = false;
327//                                              }
328                                        } else {
329//                                               a -> annotation, author
330//                                               b -> body
331//                                               c -> corpus
332//                                               d -> date, description,
333//                                               f -> feature, format
334//                                               g -> graph
335//                                               h -> head, history
336//                                               m -> matches, match
337                                                if (reader.getLocalName().equals("body")) {
338                                                        //sentence = dataStructures.getSentence();
339                                                        //phraseTree = dataStructures.getInPhraseTree();
340                                                        //sentence.clear();
341                                                        //phraseTree.clear();
342                                                        //dataStructures.setLastProcessObject(true);
343                                                }  else if (reader.getLocalName().equals("author")) {
344//                                                      if (header != null) {
345//                                                              header.setMetaAuthor(elementContent.toString());
346//                                                      }
347//                                                      collectChar = false;
348                                                } else if (reader.getLocalName().equals("date")) {
349//                                                      if (header != null) {
350//                                                              header.setMetaInDate(elementContent.toString());
351//                                                      }
352//                                                      collectChar = false;
353                                                } else if (reader.getLocalName().equals("description")) {
354//                                                      if (header != null) {
355//                                                              header.setMetaDescription(elementContent.toString());
356//                                                      }
357//                                                      collectChar = false;
358                                                } else if (reader.getLocalName().equals("format")) {
359//                                                      if (header != null) {
360//                                                              header.setMetaFormat(elementContent.toString());
361//                                                      }
362//                                                      collectChar = false;
363                                                } else if (reader.getLocalName().equals("history")) {
364//                                                      if (header != null) {
365//                                                              header.setMetaHistory(elementContent.toString());
366//                                                      }
367//                                                      collectChar = false;
368                                                } /* else if (reader.getLocalName().equals("annotation")) {
369                                                        if (header != null) {
370                                                                System.out.println(header.toTigerXML());
371                                                        }
372                                                        collectChar = false;
373                                                } */
374                                        }                               
375                                } else if (event == XMLStreamConstants.END_DOCUMENT) {
376                                        if (syntaxGraph.hasTokens()) {
377                                                sentenceCount++;
378                                        }
379                                        if (cIterations < nIterations) {
380                                                cIterations++;
381                                                reopen();
382                                                return true;
383                                        }
384                                        return false;
385                                } else if (event == XMLStreamConstants.CHARACTERS) {
386//                                      if (collectChar) {
387//                                              char[] ch = reader.getTextCharacters();
388//                                              final int size = reader.getTextStart()+reader.getTextLength();
389//                                              for (int i = reader.getTextStart(); i < size; i++) {
390//                                                      elementContent.append(ch[i]);
391//                                              }
392//                                      }
393                                }
394                        }
395                } catch (XMLStreamException e) {
396                        throw new DataFormatException("", e);
397                }
398        }
399        
400        public int getSentenceCount() {
401                return sentenceCount;
402        }
403
404        public void setSentenceCount(int sentenceCount) {
405                this.sentenceCount = sentenceCount;
406        }
407        
408        public XMLStreamReader getReader() {
409                return reader;
410        }
411
412        public void setReader(XMLStreamReader reader) {
413                this.reader = reader;
414        }
415        
416        public void readEpilog() throws MaltChainedException {
417                
418        }
419        
420        public void close() throws MaltChainedException {
421                try {
422                        if (reader != null) {
423                                if (closeStream) {
424                                        reader.close();
425                                }
426                                reader = null;
427                        }
428                } catch (XMLStreamException e) {
429                        throw new DataFormatException("The XML input file could be closed. ", e);
430                }
431        }
432
433        public DataFormatInstance getDataFormatInstance() {
434                return dataFormatInstance;
435        }
436        
437        public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
438                this.dataFormatInstance = inputDataFormatInstance;
439        }
440        
441        public String getOptions() {
442                return optionString;
443        }
444        
445        public void setOptions(String optionString) throws MaltChainedException {
446                this.optionString = optionString;
447                String[] argv;
448                try {
449                        argv = optionString.split("[_\\p{Blank}]");
450                } catch (PatternSyntaxException e) {
451                        throw new DataFormatException("Could not split the TigerXML reader option '"+optionString+"'. ", e);
452                }
453                for (int i=0; i < argv.length-1; i++) {
454                        if(argv[i].charAt(0) != '-') {
455                                throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
456                        }
457                        if(++i>=argv.length) {
458                                throw new DataFormatException("The last argument does not have any value. ");
459                        }
460                        switch(argv[i-1].charAt(1)) {
461                        case 's': 
462                                try {
463                                        START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
464                                } catch (NumberFormatException e){
465                                        throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
466                                }
467                                break;
468                        default:
469                                throw new DataFormatException("Unknown TigerXMLReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");          
470                        }
471                }
472        }
473        
474        public String getFileName() {
475                return fileName;
476        }
477
478        public void setFileName(String fileName) {
479                this.fileName = fileName;
480        }
481
482        public URL getUrl() {
483                return url;
484        }
485
486        public void setUrl(URL url) {
487                this.url = url;
488        }
489
490        public String getCharsetName() {
491                return charsetName;
492        }
493
494        public void setCharsetName(String charsetName) {
495                this.charsetName = charsetName;
496        }
497
498        public int getNIterations() {
499                return nIterations;
500        }
501
502        public void setNIterations(int iterations) {
503                nIterations = iterations;
504        }
505
506        public int getIterationCounter() {
507                return cIterations;
508        }
509//      public TigerXMLHeader getHeader() {
510//              return header;
511//      }
512//      
513//      public void setHeader(TigerXMLHeader header) {
514//              this.header = header;
515//      }
516}