001package org.maltparser.core.syntaxgraph.reader; 002 003import java.io.BufferedReader; 004import java.io.FileInputStream; 005import java.io.FileNotFoundException; 006import java.io.IOException; 007import java.io.InputStream; 008import java.io.InputStreamReader; 009import java.io.UnsupportedEncodingException; 010import java.net.URL; 011import java.util.SortedMap; 012import java.util.regex.PatternSyntaxException; 013 014import javax.xml.stream.XMLInputFactory; 015import javax.xml.stream.XMLStreamConstants; 016import javax.xml.stream.XMLStreamException; 017import javax.xml.stream.XMLStreamReader; 018 019import org.maltparser.core.exception.MaltChainedException; 020import org.maltparser.core.io.dataformat.DataFormatException; 021import org.maltparser.core.io.dataformat.DataFormatInstance; 022import org.maltparser.core.symbol.SymbolTable; 023import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph; 024import org.maltparser.core.syntaxgraph.PhraseStructure; 025import org.maltparser.core.syntaxgraph.SyntaxGraphException; 026import org.maltparser.core.syntaxgraph.TokenStructure; 027import org.maltparser.core.syntaxgraph.edge.Edge; 028import org.maltparser.core.syntaxgraph.node.NonTerminalNode; 029import org.maltparser.core.syntaxgraph.node.PhraseStructureNode; 030 031/** 032* 033* 034* @author Johan Hall 035*/ 036public class TigerXMLReader implements SyntaxGraphReader { 037// private TigerXMLHeader header; 038 private XMLStreamReader reader; 039 private int sentenceCount; 040 private DataFormatInstance dataFormatInstance; 041 private StringBuffer ntid; 042 private final StringBuilder graphRootID; 043// private StringBuilder elementContent; 044// private StringBuilder valueName; 045// private StringBuilder currentFeatureName; 046// private Domain domain; 047// private boolean collectChar = false; 048 private String optionString; 049 private String fileName = null; 050 private URL url = null; 051 private String charsetName; 052 private int nIterations; 053 private int cIterations; 054 private int START_ID_OF_NONTERMINALS = 500; 055 private boolean closeStream = true; 056 057 public TigerXMLReader() { 058 this.ntid = new StringBuffer(); 059// elementContent = new StringBuilder(); 060// valueName = new StringBuilder(); 061// currentFeatureName = new StringBuilder(); 062 graphRootID = new StringBuilder(); 063 nIterations = 1; 064 cIterations = 1; 065 } 066 067 private void reopen() throws MaltChainedException { 068 close(); 069 if (fileName != null) { 070 open(fileName, charsetName); 071 } else if (url != null) { 072 open(url, charsetName); 073 } else { 074 throw new DataFormatException("The input stream cannot be reopen. "); 075 } 076 } 077 078 public void open(String fileName, String charsetName) throws MaltChainedException { 079 setFileName(fileName); 080 setCharsetName(charsetName); 081 try { 082 open(new FileInputStream(fileName), charsetName); 083 }catch (FileNotFoundException e) { 084 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e); 085 } 086 } 087 public void open(URL url, String charsetName) throws MaltChainedException { 088 setUrl(url); 089 setCharsetName(charsetName); 090 try { 091 open(url.openStream(), charsetName); 092 } catch (IOException e) { 093 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e); 094 } 095 } 096 097 public void open(InputStream is, String charsetName) throws MaltChainedException { 098 try { 099 if (is == System.in) { 100 closeStream = false; 101 } 102 open(new InputStreamReader(is, charsetName)); 103 } catch (UnsupportedEncodingException e) { 104 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e); 105 } 106 } 107 108 private void open(InputStreamReader isr) throws MaltChainedException { 109 try { 110 XMLInputFactory factory = XMLInputFactory.newInstance(); 111 setReader(factory.createXMLStreamReader(new BufferedReader(isr))); 112 } catch (XMLStreamException e) { 113 throw new DataFormatException("XML input file could be opened. ", e); 114 } 115 setSentenceCount(0); 116 } 117 118 public void readProlog() throws MaltChainedException { 119 120 } 121 122 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException { 123 if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) { 124 return false; 125 } 126 syntaxGraph.clear(); 127 final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph; 128 PhraseStructureNode parent = null; 129 PhraseStructureNode child = null; 130// if (header == null) { 131// header = new TigerXMLHeader(syntaxGraph.getSymbolTables()); 132// } 133 134 try { 135 while (true) { 136 int event = reader.next(); 137 if (event == XMLStreamConstants.START_ELEMENT) { 138 if (reader.getLocalName().length() == 0) { 139 continue; 140 } 141 if (reader.getLocalName().charAt(0) == 'e') { 142 // e -> edge, edgelabel 143 if (reader.getLocalName().length() == 4) { //edge 144 int childid = -1; 145 int indexSep = reader.getAttributeValue(null, "idref").indexOf('_'); 146 147 try { 148 if (indexSep != -1) { 149 childid = Integer.parseInt(reader.getAttributeValue(null, "idref").substring(indexSep+1)); 150 } else { 151 childid = Integer.parseInt(reader.getAttributeValue(null, "idref")); 152 } 153 if (childid == -1) { 154 throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+reader.getAttributeValue(null, "idref")+"' of the edge element. "); 155 } 156 } catch (NumberFormatException e) { 157 throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+reader.getAttributeValue(null, "idref")+"' of the edge element. "); 158 } 159 160 if (childid < START_ID_OF_NONTERMINALS) { 161 child = phraseStructure.getTokenNode(childid); 162 } else { 163 164 child = phraseStructure.getNonTerminalNode(childid-START_ID_OF_NONTERMINALS+1); 165 } 166 167 Edge e = phraseStructure.addPhraseStructureEdge(parent, child); 168 SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables(phraseStructure.getSymbolTables()); 169 for (String name : inputTables.keySet()) { 170 e.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase())); 171 } 172 } else if (reader.getLocalName().equals("edgelabel")) { // edgelabel 173// domain = Domain.EL; 174 } 175 } else if (reader.getLocalName().charAt(0) == 'n') { 176 // n -> nt, nonterminals, name 177 if (reader.getLocalName().length() == 2) { // nt 178 final String id = reader.getAttributeValue(null, "id"); 179 if (graphRootID.length() == id.length() && graphRootID.toString().equals(id)) { 180 parent = phraseStructure.getPhraseStructureRoot(); 181 } else { 182 int index = id.indexOf('_'); 183 if (index != -1) { 184 parent = phraseStructure.addNonTerminalNode(Integer.parseInt(id.substring(index+1))-START_ID_OF_NONTERMINALS+1); 185 } 186 } 187 SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureNodeLabelSymbolTables(phraseStructure.getSymbolTables()); 188 for (String name : inputTables.keySet()) { 189 parent.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase())); 190 } 191 } else if (reader.getLocalName().equals("name")) { // name 192// elementContent.setLength(0); 193// collectChar = true; 194 } 195 } else if (reader.getLocalName().charAt(0) == 't') { 196 // t -> t, terminals 197 if (reader.getLocalName().length() == 1) { // t 198 SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getInputSymbolTables(phraseStructure.getSymbolTables()); 199 child = syntaxGraph.addTokenNode(); 200 for (String name : inputTables.keySet()) { 201 child.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase())); 202 } 203 } 204 } else if (reader.getLocalName().charAt(0) == 's') { 205 // s -> subcorpus, secedge, s, secedgelabel 206 if (reader.getLocalName().length() == 1) { // s 207 String id = reader.getAttributeValue(null, "id"); 208 boolean indexable = false; 209 int index = -1; 210 if (id != null && id.length() > 0) { 211 for (int i = 0, n = id.length(); i < n; i++) { 212 if (Character.isDigit(id.charAt(i))) { 213 if (index == -1) { 214 index = i; 215 } 216 indexable = true; 217 } 218 } 219 } 220 if (indexable) { 221 phraseStructure.setSentenceID(Integer.parseInt(id.substring(index))); 222 } else { 223 phraseStructure.setSentenceID(sentenceCount+1); 224 } 225 } 226 } else if (reader.getLocalName().charAt(0) == 'v') { 227 // v -> variable, value 228// if (reader.getLocalName().equals("value")) { 229// valueName.setLength(0); 230// valueName.append(reader.getAttributeValue(null, "name")); 231// elementContent.setLength(0); 232// collectChar = true; 233// } 234 } else { 235// a -> annotation, author 236// b -> body 237// c -> corpus 238// d -> date, description, 239// f -> feature, format 240// g -> graph 241// h -> head, history 242// m -> matches, match 243 if (reader.getLocalName().equals("graph")) { 244 graphRootID.setLength(0); 245 graphRootID.append(reader.getAttributeValue(null, "root")); 246 } else if (reader.getLocalName().equals("corpus")) { 247// header.setCorpusID(reader.getAttributeValue(null, "id")); 248// header.setCorpusID(reader.getAttributeValue(null, "version")); 249 } else if (reader.getLocalName().equals("feature")) { 250// if (header != null) { 251// currentFeatureName.setLength(0); 252// currentFeatureName.append(reader.getAttributeValue(null, "name")); 253// header.addFeature(reader.getAttributeValue(null, "name"), reader.getAttributeValue(null, "domain")); 254// } 255// domain = Domain.valueOf(reader.getAttributeValue(null, "domain")); 256 } else if (reader.getLocalName().equals("secedgelabel")) { 257// domain = Domain.SEL; 258 } else if (reader.getLocalName().equals("author")) { 259// elementContent.setLength(0); 260// collectChar = true; 261 } else if (reader.getLocalName().equals("date")) { 262// elementContent.setLength(0); 263// collectChar = true; 264 } else if (reader.getLocalName().equals("description")) { 265// elementContent.setLength(0); 266// collectChar = true; 267 } else if (reader.getLocalName().equals("format")) { 268// elementContent.setLength(0); 269// collectChar = true; 270 } else if (reader.getLocalName().equals("history")) { 271// elementContent.setLength(0); 272// collectChar = true; 273 } 274 } 275 } else if (event == XMLStreamConstants.END_ELEMENT) { 276 if (reader.getLocalName().length() == 0) { 277 continue; 278 } 279 if (reader.getLocalName().charAt(0) == 'e') { 280 // e -> edge, edgelabel 281 } else if (reader.getLocalName().charAt(0) == 'n') { 282 // n -> nt, nonterminals, name 283 if (reader.getLocalName().equals("nt")) { 284 ntid.setLength(0); 285 } 286 else if (reader.getLocalName().equals("nonterminals")) { 287 if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 &&((NonTerminalNode)phraseStructure.getPhraseStructureRoot()).nChildren() == 0) { 288 Edge e = phraseStructure.addPhraseStructureEdge(phraseStructure.getPhraseStructureRoot(), phraseStructure.getTokenNode(1)); 289 SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables(phraseStructure.getSymbolTables()); 290 for (String name : inputTables.keySet()) { 291 e.addLabel(inputTables.get(name), "--"); 292 } 293 } 294 } 295// else if (reader.getLocalName().equals("name")) { 296// if (header != null) { 297// header.setMetaName(elementContent.toString()); 298// } 299// collectChar = false; 300// } 301 } else if (reader.getLocalName().charAt(0) == 't') { 302 // t -> t, terminals 303 } else if (reader.getLocalName().charAt(0) == 's') { 304 // s -> subcorpus, secedge, s, secedgelabel 305 if (reader.getLocalName().equals("s")) { 306 if (syntaxGraph.hasTokens()) { 307 sentenceCount++; 308 } 309 if (syntaxGraph instanceof MappablePhraseStructureGraph) { 310 ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot()); 311 } 312 return true; 313 } 314 } else if (reader.getLocalName().charAt(0) == 'v') { 315 // v -> variable, value 316// if (reader.getLocalName().equals("value")) { 317// if (header != null) { 318// if (domain == Domain.T || domain == Domain.NT || domain == Domain.FREC) { 319// header.addFeatureValue(currentFeatureName.toString(), valueName.toString(), elementContent.toString()); 320// } else if (domain == Domain.EL) { 321// header.addEdgeLabelValue(valueName.toString(), elementContent.toString()); 322// } else if (domain == Domain.SEL) { 323// header.addSecEdgeLabelValue(valueName.toString(), elementContent.toString()); 324// } 325// } 326// collectChar = false; 327// } 328 } else { 329// a -> annotation, author 330// b -> body 331// c -> corpus 332// d -> date, description, 333// f -> feature, format 334// g -> graph 335// h -> head, history 336// m -> matches, match 337 if (reader.getLocalName().equals("body")) { 338 //sentence = dataStructures.getSentence(); 339 //phraseTree = dataStructures.getInPhraseTree(); 340 //sentence.clear(); 341 //phraseTree.clear(); 342 //dataStructures.setLastProcessObject(true); 343 } else if (reader.getLocalName().equals("author")) { 344// if (header != null) { 345// header.setMetaAuthor(elementContent.toString()); 346// } 347// collectChar = false; 348 } else if (reader.getLocalName().equals("date")) { 349// if (header != null) { 350// header.setMetaInDate(elementContent.toString()); 351// } 352// collectChar = false; 353 } else if (reader.getLocalName().equals("description")) { 354// if (header != null) { 355// header.setMetaDescription(elementContent.toString()); 356// } 357// collectChar = false; 358 } else if (reader.getLocalName().equals("format")) { 359// if (header != null) { 360// header.setMetaFormat(elementContent.toString()); 361// } 362// collectChar = false; 363 } else if (reader.getLocalName().equals("history")) { 364// if (header != null) { 365// header.setMetaHistory(elementContent.toString()); 366// } 367// collectChar = false; 368 } /* else if (reader.getLocalName().equals("annotation")) { 369 if (header != null) { 370 System.out.println(header.toTigerXML()); 371 } 372 collectChar = false; 373 } */ 374 } 375 } else if (event == XMLStreamConstants.END_DOCUMENT) { 376 if (syntaxGraph.hasTokens()) { 377 sentenceCount++; 378 } 379 if (cIterations < nIterations) { 380 cIterations++; 381 reopen(); 382 return true; 383 } 384 return false; 385 } else if (event == XMLStreamConstants.CHARACTERS) { 386// if (collectChar) { 387// char[] ch = reader.getTextCharacters(); 388// final int size = reader.getTextStart()+reader.getTextLength(); 389// for (int i = reader.getTextStart(); i < size; i++) { 390// elementContent.append(ch[i]); 391// } 392// } 393 } 394 } 395 } catch (XMLStreamException e) { 396 throw new DataFormatException("", e); 397 } 398 } 399 400 public int getSentenceCount() { 401 return sentenceCount; 402 } 403 404 public void setSentenceCount(int sentenceCount) { 405 this.sentenceCount = sentenceCount; 406 } 407 408 public XMLStreamReader getReader() { 409 return reader; 410 } 411 412 public void setReader(XMLStreamReader reader) { 413 this.reader = reader; 414 } 415 416 public void readEpilog() throws MaltChainedException { 417 418 } 419 420 public void close() throws MaltChainedException { 421 try { 422 if (reader != null) { 423 if (closeStream) { 424 reader.close(); 425 } 426 reader = null; 427 } 428 } catch (XMLStreamException e) { 429 throw new DataFormatException("The XML input file could be closed. ", e); 430 } 431 } 432 433 public DataFormatInstance getDataFormatInstance() { 434 return dataFormatInstance; 435 } 436 437 public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) { 438 this.dataFormatInstance = inputDataFormatInstance; 439 } 440 441 public String getOptions() { 442 return optionString; 443 } 444 445 public void setOptions(String optionString) throws MaltChainedException { 446 this.optionString = optionString; 447 String[] argv; 448 try { 449 argv = optionString.split("[_\\p{Blank}]"); 450 } catch (PatternSyntaxException e) { 451 throw new DataFormatException("Could not split the TigerXML reader option '"+optionString+"'. ", e); 452 } 453 for (int i=0; i < argv.length-1; i++) { 454 if(argv[i].charAt(0) != '-') { 455 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0)); 456 } 457 if(++i>=argv.length) { 458 throw new DataFormatException("The last argument does not have any value. "); 459 } 460 switch(argv[i-1].charAt(1)) { 461 case 's': 462 try { 463 START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]); 464 } catch (NumberFormatException e){ 465 throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. "); 466 } 467 break; 468 default: 469 throw new DataFormatException("Unknown TigerXMLReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. "); 470 } 471 } 472 } 473 474 public String getFileName() { 475 return fileName; 476 } 477 478 public void setFileName(String fileName) { 479 this.fileName = fileName; 480 } 481 482 public URL getUrl() { 483 return url; 484 } 485 486 public void setUrl(URL url) { 487 this.url = url; 488 } 489 490 public String getCharsetName() { 491 return charsetName; 492 } 493 494 public void setCharsetName(String charsetName) { 495 this.charsetName = charsetName; 496 } 497 498 public int getNIterations() { 499 return nIterations; 500 } 501 502 public void setNIterations(int iterations) { 503 nIterations = iterations; 504 } 505 506 public int getIterationCounter() { 507 return cIterations; 508 } 509// public TigerXMLHeader getHeader() { 510// return header; 511// } 512// 513// public void setHeader(TigerXMLHeader header) { 514// this.header = header; 515// } 516}