001 package org.maltparser.core.syntaxgraph.reader;
002
003 import java.io.BufferedReader;
004 import java.io.FileInputStream;
005 import java.io.FileNotFoundException;
006 import java.io.IOException;
007 import java.io.InputStream;
008 import java.io.InputStreamReader;
009 import java.io.UnsupportedEncodingException;
010 import java.net.URL;
011 import java.util.SortedMap;
012 import java.util.regex.PatternSyntaxException;
013
014 import javax.xml.stream.XMLInputFactory;
015 import javax.xml.stream.XMLStreamConstants;
016 import javax.xml.stream.XMLStreamException;
017 import javax.xml.stream.XMLStreamReader;
018
019 import org.maltparser.core.exception.MaltChainedException;
020 import org.maltparser.core.io.dataformat.DataFormatException;
021 import org.maltparser.core.io.dataformat.DataFormatInstance;
022 import org.maltparser.core.symbol.SymbolTable;
023 import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
024 import org.maltparser.core.syntaxgraph.PhraseStructure;
025 import org.maltparser.core.syntaxgraph.SyntaxGraphException;
026 import org.maltparser.core.syntaxgraph.TokenStructure;
027 import org.maltparser.core.syntaxgraph.edge.Edge;
028 import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
029 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
030
031 /**
032 *
033 *
034 * @author Johan Hall
035 */
036 public class TigerXMLReader implements SyntaxGraphReader {
037 // private TigerXMLHeader header;
038 private XMLStreamReader reader;
039 private int sentenceCount;
040 private DataFormatInstance dataFormatInstance;
041 private StringBuffer ntid;
042 private final StringBuilder graphRootID;
043 // private StringBuilder elementContent;
044 // private StringBuilder valueName;
045 // private StringBuilder currentFeatureName;
046 // private Domain domain;
047 // private boolean collectChar = false;
048 private String optionString;
049 private String fileName = null;
050 private URL url = null;
051 private String charsetName;
052 private int nIterations;
053 private int cIterations;
054 private int START_ID_OF_NONTERMINALS = 500;
055 private boolean closeStream = true;
056
057 public TigerXMLReader() {
058 this.ntid = new StringBuffer();
059 // elementContent = new StringBuilder();
060 // valueName = new StringBuilder();
061 // currentFeatureName = new StringBuilder();
062 graphRootID = new StringBuilder();
063 nIterations = 1;
064 cIterations = 1;
065 }
066
067 private void reopen() throws MaltChainedException {
068 close();
069 if (fileName != null) {
070 open(fileName, charsetName);
071 } else if (url != null) {
072 open(url, charsetName);
073 } else {
074 throw new DataFormatException("The input stream cannot be reopen. ");
075 }
076 }
077
078 public void open(String fileName, String charsetName) throws MaltChainedException {
079 setFileName(fileName);
080 setCharsetName(charsetName);
081 try {
082 open(new FileInputStream(fileName), charsetName);
083 }catch (FileNotFoundException e) {
084 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
085 }
086 }
087 public void open(URL url, String charsetName) throws MaltChainedException {
088 setUrl(url);
089 setCharsetName(charsetName);
090 try {
091 open(url.openStream(), charsetName);
092 } catch (IOException e) {
093 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
094 }
095 }
096
097 public void open(InputStream is, String charsetName) throws MaltChainedException {
098 try {
099 if (is == System.in) {
100 closeStream = false;
101 }
102 open(new InputStreamReader(is, charsetName));
103 } catch (UnsupportedEncodingException e) {
104 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
105 }
106 }
107
108 private void open(InputStreamReader isr) throws MaltChainedException {
109 try {
110 XMLInputFactory factory = XMLInputFactory.newInstance();
111 setReader(factory.createXMLStreamReader(new BufferedReader(isr)));
112 } catch (XMLStreamException e) {
113 throw new DataFormatException("XML input file could be opened. ", e);
114 }
115 setSentenceCount(0);
116 }
117
118 public void readProlog() throws MaltChainedException {
119
120 }
121
122 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
123 if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) {
124 return false;
125 }
126 syntaxGraph.clear();
127 syntaxGraph.getSymbolTables().cleanUp();
128 final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
129 PhraseStructureNode parent = null;
130 PhraseStructureNode child = null;
131 // if (header == null) {
132 // header = new TigerXMLHeader(syntaxGraph.getSymbolTables());
133 // }
134
135 try {
136 while (true) {
137 int event = reader.next();
138 if (event == XMLStreamConstants.START_ELEMENT) {
139 if (reader.getLocalName().length() == 0) {
140 continue;
141 }
142 if (reader.getLocalName().charAt(0) == 'e') {
143 // e -> edge, edgelabel
144 if (reader.getLocalName().length() == 4) { //edge
145 int childid = -1;
146 int indexSep = reader.getAttributeValue(null, "idref").indexOf('_');
147
148 try {
149 if (indexSep != -1) {
150 childid = Integer.parseInt(reader.getAttributeValue(null, "idref").substring(indexSep+1));
151 } else {
152 childid = Integer.parseInt(reader.getAttributeValue(null, "idref"));
153 }
154 if (childid == -1) {
155 throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+reader.getAttributeValue(null, "idref")+"' of the edge element. ");
156 }
157 } catch (NumberFormatException e) {
158 throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+reader.getAttributeValue(null, "idref")+"' of the edge element. ");
159 }
160
161 if (childid < START_ID_OF_NONTERMINALS) {
162 child = phraseStructure.getTokenNode(childid);
163 } else {
164
165 child = phraseStructure.getNonTerminalNode(childid-START_ID_OF_NONTERMINALS+1);
166 }
167
168 Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
169 SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
170 for (String name : inputTables.keySet()) {
171 e.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
172 }
173 } else if (reader.getLocalName().equals("edgelabel")) { // edgelabel
174 // domain = Domain.EL;
175 }
176 } else if (reader.getLocalName().charAt(0) == 'n') {
177 // n -> nt, nonterminals, name
178 if (reader.getLocalName().length() == 2) { // nt
179 final String id = reader.getAttributeValue(null, "id");
180 if (graphRootID.length() == id.length() && graphRootID.toString().equals(id)) {
181 parent = phraseStructure.getPhraseStructureRoot();
182 } else {
183 int index = id.indexOf('_');
184 if (index != -1) {
185 parent = phraseStructure.addNonTerminalNode(Integer.parseInt(id.substring(index+1))-START_ID_OF_NONTERMINALS+1);
186 }
187 }
188 SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureNodeLabelSymbolTables();
189 for (String name : inputTables.keySet()) {
190 parent.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
191 }
192 } else if (reader.getLocalName().equals("name")) { // name
193 // elementContent.setLength(0);
194 // collectChar = true;
195 }
196 } else if (reader.getLocalName().charAt(0) == 't') {
197 // t -> t, terminals
198 if (reader.getLocalName().length() == 1) { // t
199 SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getInputSymbolTables();
200 child = syntaxGraph.addTokenNode();
201 for (String name : inputTables.keySet()) {
202 child.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
203 }
204 }
205 } else if (reader.getLocalName().charAt(0) == 's') {
206 // s -> subcorpus, secedge, s, secedgelabel
207 if (reader.getLocalName().length() == 1) { // s
208 String id = reader.getAttributeValue(null, "id");
209 boolean indexable = false;
210 int index = -1;
211 if (id != null && id.length() > 0) {
212 for (int i = 0, n = id.length(); i < n; i++) {
213 if (Character.isDigit(id.charAt(i))) {
214 if (index == -1) {
215 index = i;
216 }
217 indexable = true;
218 }
219 }
220 }
221 if (indexable) {
222 phraseStructure.setSentenceID(Integer.parseInt(id.substring(index)));
223 } else {
224 phraseStructure.setSentenceID(sentenceCount+1);
225 }
226 }
227 } else if (reader.getLocalName().charAt(0) == 'v') {
228 // v -> variable, value
229 // if (reader.getLocalName().equals("value")) {
230 // valueName.setLength(0);
231 // valueName.append(reader.getAttributeValue(null, "name"));
232 // elementContent.setLength(0);
233 // collectChar = true;
234 // }
235 } else {
236 // a -> annotation, author
237 // b -> body
238 // c -> corpus
239 // d -> date, description,
240 // f -> feature, format
241 // g -> graph
242 // h -> head, history
243 // m -> matches, match
244 if (reader.getLocalName().equals("graph")) {
245 graphRootID.setLength(0);
246 graphRootID.append(reader.getAttributeValue(null, "root"));
247 } else if (reader.getLocalName().equals("corpus")) {
248 // header.setCorpusID(reader.getAttributeValue(null, "id"));
249 // header.setCorpusID(reader.getAttributeValue(null, "version"));
250 } else if (reader.getLocalName().equals("feature")) {
251 // if (header != null) {
252 // currentFeatureName.setLength(0);
253 // currentFeatureName.append(reader.getAttributeValue(null, "name"));
254 // header.addFeature(reader.getAttributeValue(null, "name"), reader.getAttributeValue(null, "domain"));
255 // }
256 // domain = Domain.valueOf(reader.getAttributeValue(null, "domain"));
257 } else if (reader.getLocalName().equals("secedgelabel")) {
258 // domain = Domain.SEL;
259 } else if (reader.getLocalName().equals("author")) {
260 // elementContent.setLength(0);
261 // collectChar = true;
262 } else if (reader.getLocalName().equals("date")) {
263 // elementContent.setLength(0);
264 // collectChar = true;
265 } else if (reader.getLocalName().equals("description")) {
266 // elementContent.setLength(0);
267 // collectChar = true;
268 } else if (reader.getLocalName().equals("format")) {
269 // elementContent.setLength(0);
270 // collectChar = true;
271 } else if (reader.getLocalName().equals("history")) {
272 // elementContent.setLength(0);
273 // collectChar = true;
274 }
275 }
276 } else if (event == XMLStreamConstants.END_ELEMENT) {
277 if (reader.getLocalName().length() == 0) {
278 continue;
279 }
280 if (reader.getLocalName().charAt(0) == 'e') {
281 // e -> edge, edgelabel
282 } else if (reader.getLocalName().charAt(0) == 'n') {
283 // n -> nt, nonterminals, name
284 if (reader.getLocalName().equals("nt")) {
285 ntid.setLength(0);
286 }
287 else if (reader.getLocalName().equals("nonterminals")) {
288 if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 &&((NonTerminalNode)phraseStructure.getPhraseStructureRoot()).nChildren() == 0) {
289 Edge e = phraseStructure.addPhraseStructureEdge(phraseStructure.getPhraseStructureRoot(), phraseStructure.getTokenNode(1));
290 SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
291 for (String name : inputTables.keySet()) {
292 e.addLabel(inputTables.get(name), "--");
293 }
294 }
295 }
296 // else if (reader.getLocalName().equals("name")) {
297 // if (header != null) {
298 // header.setMetaName(elementContent.toString());
299 // }
300 // collectChar = false;
301 // }
302 } else if (reader.getLocalName().charAt(0) == 't') {
303 // t -> t, terminals
304 } else if (reader.getLocalName().charAt(0) == 's') {
305 // s -> subcorpus, secedge, s, secedgelabel
306 if (reader.getLocalName().equals("s")) {
307 if (syntaxGraph.hasTokens()) {
308 sentenceCount++;
309 }
310 if (syntaxGraph instanceof MappablePhraseStructureGraph) {
311 ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
312 }
313 return true;
314 }
315 } else if (reader.getLocalName().charAt(0) == 'v') {
316 // v -> variable, value
317 // if (reader.getLocalName().equals("value")) {
318 // if (header != null) {
319 // if (domain == Domain.T || domain == Domain.NT || domain == Domain.FREC) {
320 // header.addFeatureValue(currentFeatureName.toString(), valueName.toString(), elementContent.toString());
321 // } else if (domain == Domain.EL) {
322 // header.addEdgeLabelValue(valueName.toString(), elementContent.toString());
323 // } else if (domain == Domain.SEL) {
324 // header.addSecEdgeLabelValue(valueName.toString(), elementContent.toString());
325 // }
326 // }
327 // collectChar = false;
328 // }
329 } else {
330 // a -> annotation, author
331 // b -> body
332 // c -> corpus
333 // d -> date, description,
334 // f -> feature, format
335 // g -> graph
336 // h -> head, history
337 // m -> matches, match
338 if (reader.getLocalName().equals("body")) {
339 //sentence = dataStructures.getSentence();
340 //phraseTree = dataStructures.getInPhraseTree();
341 //sentence.clear();
342 //phraseTree.clear();
343 //dataStructures.setLastProcessObject(true);
344 } else if (reader.getLocalName().equals("author")) {
345 // if (header != null) {
346 // header.setMetaAuthor(elementContent.toString());
347 // }
348 // collectChar = false;
349 } else if (reader.getLocalName().equals("date")) {
350 // if (header != null) {
351 // header.setMetaInDate(elementContent.toString());
352 // }
353 // collectChar = false;
354 } else if (reader.getLocalName().equals("description")) {
355 // if (header != null) {
356 // header.setMetaDescription(elementContent.toString());
357 // }
358 // collectChar = false;
359 } else if (reader.getLocalName().equals("format")) {
360 // if (header != null) {
361 // header.setMetaFormat(elementContent.toString());
362 // }
363 // collectChar = false;
364 } else if (reader.getLocalName().equals("history")) {
365 // if (header != null) {
366 // header.setMetaHistory(elementContent.toString());
367 // }
368 // collectChar = false;
369 } /* else if (reader.getLocalName().equals("annotation")) {
370 if (header != null) {
371 System.out.println(header.toTigerXML());
372 }
373 collectChar = false;
374 } */
375 }
376 } else if (event == XMLStreamConstants.END_DOCUMENT) {
377 if (syntaxGraph.hasTokens()) {
378 sentenceCount++;
379 }
380 if (cIterations < nIterations) {
381 cIterations++;
382 reopen();
383 return true;
384 }
385 return false;
386 } else if (event == XMLStreamConstants.CHARACTERS) {
387 // if (collectChar) {
388 // char[] ch = reader.getTextCharacters();
389 // final int size = reader.getTextStart()+reader.getTextLength();
390 // for (int i = reader.getTextStart(); i < size; i++) {
391 // elementContent.append(ch[i]);
392 // }
393 // }
394 }
395 }
396 } catch (XMLStreamException e) {
397 throw new DataFormatException("", e);
398 }
399 }
400
401 public int getSentenceCount() {
402 return sentenceCount;
403 }
404
405 public void setSentenceCount(int sentenceCount) {
406 this.sentenceCount = sentenceCount;
407 }
408
409 public XMLStreamReader getReader() {
410 return reader;
411 }
412
413 public void setReader(XMLStreamReader reader) {
414 this.reader = reader;
415 }
416
417 public void readEpilog() throws MaltChainedException {
418
419 }
420
421 public void close() throws MaltChainedException {
422 try {
423 if (reader != null) {
424 if (closeStream) {
425 reader.close();
426 }
427 reader = null;
428 }
429 } catch (XMLStreamException e) {
430 throw new DataFormatException("The XML input file could be closed. ", e);
431 }
432 }
433
434 public DataFormatInstance getDataFormatInstance() {
435 return dataFormatInstance;
436 }
437
438 public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
439 this.dataFormatInstance = inputDataFormatInstance;
440 }
441
442 public String getOptions() {
443 return optionString;
444 }
445
446 public void setOptions(String optionString) throws MaltChainedException {
447 this.optionString = optionString;
448 String[] argv;
449 try {
450 argv = optionString.split("[_\\p{Blank}]");
451 } catch (PatternSyntaxException e) {
452 throw new DataFormatException("Could not split the TigerXML reader option '"+optionString+"'. ", e);
453 }
454 for (int i=0; i < argv.length-1; i++) {
455 if(argv[i].charAt(0) != '-') {
456 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
457 }
458 if(++i>=argv.length) {
459 throw new DataFormatException("The last argument does not have any value. ");
460 }
461 switch(argv[i-1].charAt(1)) {
462 case 's':
463 try {
464 START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
465 } catch (NumberFormatException e){
466 throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
467 }
468 break;
469 default:
470 throw new DataFormatException("Unknown TigerXMLReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");
471 }
472 }
473 }
474
475 public String getFileName() {
476 return fileName;
477 }
478
479 public void setFileName(String fileName) {
480 this.fileName = fileName;
481 }
482
483 public URL getUrl() {
484 return url;
485 }
486
487 public void setUrl(URL url) {
488 this.url = url;
489 }
490
491 public String getCharsetName() {
492 return charsetName;
493 }
494
495 public void setCharsetName(String charsetName) {
496 this.charsetName = charsetName;
497 }
498
499 public int getNIterations() {
500 return nIterations;
501 }
502
503 public void setNIterations(int iterations) {
504 nIterations = iterations;
505 }
506
507 public int getIterationCounter() {
508 return cIterations;
509 }
510 // public TigerXMLHeader getHeader() {
511 // return header;
512 // }
513 //
514 // public void setHeader(TigerXMLHeader header) {
515 // this.header = header;
516 // }
517 }