package org.maltparser.core.syntaxgraph.reader;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Locale;
import java.util.SortedMap;
import java.util.regex.PatternSyntaxException;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.io.dataformat.DataFormatException;
import org.maltparser.core.io.dataformat.DataFormatInstance;
import org.maltparser.core.symbol.SymbolTable;
import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
import org.maltparser.core.syntaxgraph.PhraseStructure;
import org.maltparser.core.syntaxgraph.SyntaxGraphException;
import org.maltparser.core.syntaxgraph.TokenStructure;
import org.maltparser.core.syntaxgraph.edge.Edge;
import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;

/**
 * Reads sentences from a TIGER-XML document with a StAX pull parser and
 * builds them into a {@link PhraseStructure}.
 *
 * <p>Only the elements <code>graph</code>, <code>t</code>, <code>nt</code>,
 * <code>edge</code>, <code>s</code> and the end of <code>nonterminals</code>
 * are interpreted; every other element (corpus header, feature declarations,
 * secondary edges, ...) is skipped.</p>
 *
 * <p>Node ids are expected to be either a plain integer or
 * <code>prefix_integer</code>. Ids below the configured start id of
 * nonterminals (default 500, option <code>-s</code>) denote tokens, all
 * others denote nonterminal nodes.</p>
 *
 * @author Johan Hall
 */
public class TigerXMLReader implements SyntaxGraphReader {
    private XMLStreamReader reader;
    /**
     * Character stream underneath {@link #reader}. It is kept because
     * {@link XMLStreamReader#close()} does not close its input source.
     */
    private BufferedReader input;
    private int sentenceCount;
    private DataFormatInstance dataFormatInstance;
    /** Value of the <code>root</code> attribute of the current <code>graph</code> element. */
    private final StringBuilder graphRootID;
    private String optionString;
    private String fileName = null;
    private URL url = null;
    private String charsetName;
    private int nIterations;
    private int cIterations;
    /** First numeric id that denotes a nonterminal; configurable through option <code>-s</code>. */
    private int startIdOfNonTerminals = 500;
    /** False while reading from <code>System.in</code>, which must never be closed by this reader. */
    private boolean closeStream = true;

    /**
     * Creates a reader that is not yet attached to any input and that reads
     * its input exactly once.
     */
    public TigerXMLReader() {
        graphRootID = new StringBuilder();
        nIterations = 1;
        cIterations = 1;
    }

    /**
     * Closes the current input and opens the remembered file or URL again.
     *
     * @throws MaltChainedException if neither a file name nor a URL is known,
     *         or if closing/opening fails
     */
    private void reopen() throws MaltChainedException {
        close();
        if (fileName != null) {
            open(fileName, charsetName);
        } else if (url != null) {
            open(url, charsetName);
        } else {
            throw new DataFormatException("The input stream cannot be reopened. ");
        }
    }

    /**
     * Opens a TIGER-XML file.
     *
     * @param fileName path of the file to read
     * @param charsetName name of the character encoding of the file
     * @throws MaltChainedException if the file cannot be found or opened
     */
    public void open(String fileName, String charsetName) throws MaltChainedException {
        setFileName(fileName);
        // Forget a previously opened URL so that reopen() uses this file.
        setUrl(null);
        setCharsetName(charsetName);
        final FileInputStream stream;
        try {
            stream = new FileInputStream(fileName);
        } catch (FileNotFoundException e) {
            throw new DataFormatException("The input file '" + fileName + "' cannot be found. ", e);
        }
        open(stream, charsetName);
    }

    /**
     * Opens a TIGER-XML document located at a URL.
     *
     * @param url location of the document
     * @param charsetName name of the character encoding of the document
     * @throws MaltChainedException if the URL cannot be opened
     */
    public void open(URL url, String charsetName) throws MaltChainedException {
        setUrl(url);
        // Forget a previously opened file so that reopen() uses this URL.
        setFileName(null);
        setCharsetName(charsetName);
        final InputStream stream;
        try {
            stream = url.openStream();
        } catch (IOException e) {
            throw new DataFormatException("The URL '" + url.toString() + "' cannot be opened. ", e);
        }
        open(stream, charsetName);
    }

    /**
     * Opens a TIGER-XML document from an input stream. The stream is closed
     * by {@link #close()} unless it is <code>System.in</code>.
     *
     * @param is stream to read from
     * @param charsetName name of the character encoding of the stream
     * @throws MaltChainedException if the encoding is unsupported or the XML
     *         parser cannot be created
     */
    public void open(InputStream is, String charsetName) throws MaltChainedException {
        // Decide ownership per stream; a stale 'false' from an earlier System.in must not stick.
        closeStream = (is != System.in);
        final InputStreamReader isr;
        try {
            isr = new InputStreamReader(is, charsetName);
        } catch (UnsupportedEncodingException e) {
            if (closeStream) {
                try {
                    is.close();
                } catch (IOException ignored) {
                    // The unsupported encoding is the failure that gets reported.
                }
            }
            throw new DataFormatException("The character encoding set '" + charsetName + "' isn't supported. ", e);
        }
        open(isr);
    }

    /**
     * Wraps the character stream in an XML stream reader and resets the
     * sentence counter.
     *
     * @param isr character stream delivering the XML document
     * @throws MaltChainedException if the XML stream reader cannot be created
     */
    private void open(InputStreamReader isr) throws MaltChainedException {
        final BufferedReader buffered = new BufferedReader(isr);
        try {
            final XMLInputFactory factory = XMLInputFactory.newInstance();
            // Corpus files may come from untrusted locations: do not resolve external entities.
            factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
            setReader(factory.createXMLStreamReader(buffered));
        } catch (XMLStreamException e) {
            if (closeStream) {
                try {
                    buffered.close();
                } catch (IOException ignored) {
                    // The XML failure is the one that gets reported.
                }
            }
            throw new DataFormatException("XML input file could not be opened. ", e);
        }
        input = buffered;
        setSentenceCount(0);
    }

    /**
     * Does nothing; the TIGER-XML header is not interpreted.
     *
     * @throws MaltChainedException never thrown by this implementation
     */
    public void readProlog() throws MaltChainedException {

    }

    /**
     * Reads XML events until the end of the next <code>s</code> element and
     * stores the sentence in the given graph, which is cleared first.
     *
     * @param syntaxGraph graph to fill; must be a {@link PhraseStructure}
     * @return <code>true</code> if a sentence end was reached, or if the end
     *         of the document was reached and the input was reopened for
     *         another iteration; <code>false</code> if the graph is
     *         <code>null</code> or not a phrase structure, or if the document
     *         ended and no iteration remains
     * @throws MaltChainedException if the reader is not open, the XML cannot
     *         be parsed or a node id cannot be interpreted
     */
    public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
        // instanceof is false for null, so this also rejects a missing graph.
        if (!(syntaxGraph instanceof PhraseStructure)) {
            return false;
        }
        if (reader == null) {
            throw new DataFormatException("The TigerXML input has not been opened. ");
        }
        syntaxGraph.clear();
        syntaxGraph.getSymbolTables().cleanUp();
        final PhraseStructure phraseStructure = (PhraseStructure) syntaxGraph;
        // Most recently opened <nt>; the edges that follow are attached below it.
        PhraseStructureNode parent = null;

        try {
            while (true) {
                final int event = reader.next();
                if (event == XMLStreamConstants.START_ELEMENT) {
                    final String element = reader.getLocalName();
                    if ("edge".equals(element)) {
                        readEdge(phraseStructure, parent);
                    } else if ("nt".equals(element)) {
                        parent = readNonTerminal(phraseStructure);
                    } else if ("t".equals(element)) {
                        readTerminal(syntaxGraph);
                    } else if ("s".equals(element)) {
                        readSentenceID(phraseStructure);
                    } else if ("graph".equals(element)) {
                        graphRootID.setLength(0);
                        graphRootID.append(reader.getAttributeValue(null, "root"));
                    }
                } else if (event == XMLStreamConstants.END_ELEMENT) {
                    final String element = reader.getLocalName();
                    if ("nonterminals".equals(element)) {
                        attachSingleTokenToRoot(phraseStructure);
                    } else if ("s".equals(element)) {
                        if (syntaxGraph.hasTokens()) {
                            sentenceCount++;
                        }
                        if (syntaxGraph instanceof MappablePhraseStructureGraph) {
                            final MappablePhraseStructureGraph mappable = (MappablePhraseStructureGraph) syntaxGraph;
                            mappable.getMapping().updateDependenyGraph(mappable, phraseStructure.getPhraseStructureRoot());
                        }
                        return true;
                    }
                } else if (event == XMLStreamConstants.END_DOCUMENT) {
                    if (syntaxGraph.hasTokens()) {
                        sentenceCount++;
                    }
                    if (cIterations < nIterations) {
                        cIterations++;
                        reopen();
                        return true;
                    }
                    return false;
                }
            }
        } catch (XMLStreamException e) {
            throw new DataFormatException("The TigerXML input could not be parsed. ", e);
        }
    }

    /**
     * Handles an <code>edge</code> element: connects the node referenced by
     * <code>idref</code> to the current parent and labels the new edge.
     */
    private void readEdge(PhraseStructure phraseStructure, PhraseStructureNode parent) throws MaltChainedException {
        final int childId = parseNodeId(reader.getAttributeValue(null, "idref"), "idref", "edge");
        final PhraseStructureNode child;
        if (childId < startIdOfNonTerminals) {
            child = phraseStructure.getTokenNode(childId);
        } else {
            child = phraseStructure.getNonTerminalNode(childId - startIdOfNonTerminals + 1);
        }

        final Edge edge = phraseStructure.addPhraseStructureEdge(parent, child);
        final SortedMap<String, SymbolTable> tables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
        for (String name : tables.keySet()) {
            edge.addLabel(tables.get(name), reader.getAttributeValue(null, attributeName(name)));
        }
    }

    /**
     * Handles an <code>nt</code> element: resolves it to the phrase structure
     * root or to a new nonterminal node, labels it and returns it as the
     * parent of the edges that follow.
     */
    private PhraseStructureNode readNonTerminal(PhraseStructure phraseStructure) throws MaltChainedException {
        final String id = reader.getAttributeValue(null, "id");
        final PhraseStructureNode node;
        if (graphRootID.toString().equals(id)) {
            node = phraseStructure.getPhraseStructureRoot();
        } else {
            // Same id syntax as edge idrefs, so nodes and the edges pointing at them always agree.
            final int ntId = parseNodeId(id, "id", "nt");
            node = phraseStructure.addNonTerminalNode(ntId - startIdOfNonTerminals + 1);
        }
        final SortedMap<String, SymbolTable> tables = dataFormatInstance.getPhraseStructureNodeLabelSymbolTables();
        for (String name : tables.keySet()) {
            node.addLabel(tables.get(name), reader.getAttributeValue(null, attributeName(name)));
        }
        return node;
    }

    /** Handles a <code>t</code> element: adds a token node and labels it with the input columns. */
    private void readTerminal(TokenStructure syntaxGraph) throws MaltChainedException {
        final SortedMap<String, SymbolTable> tables = dataFormatInstance.getInputSymbolTables();
        final PhraseStructureNode token = syntaxGraph.addTokenNode();
        for (String name : tables.keySet()) {
            token.addLabel(tables.get(name), reader.getAttributeValue(null, attributeName(name)));
        }
    }

    /**
     * Handles the start of an <code>s</code> element: uses the first run of
     * digits in its <code>id</code> attribute as sentence id and falls back to
     * the running sentence count when there is no usable number.
     */
    private void readSentenceID(PhraseStructure phraseStructure) throws MaltChainedException {
        final String id = reader.getAttributeValue(null, "id");
        int sentenceID = sentenceCount + 1;
        if (id != null) {
            final int length = id.length();
            int from = 0;
            while (from < length && !Character.isDigit(id.charAt(from))) {
                from++;
            }
            int to = from;
            while (to < length && Character.isDigit(id.charAt(to))) {
                to++;
            }
            if (to > from) {
                try {
                    sentenceID = Integer.parseInt(id.substring(from, to));
                } catch (NumberFormatException ignored) {
                    // Number too large for an int: keep the running count.
                }
            }
        }
        phraseStructure.setSentenceID(sentenceID);
    }

    /**
     * Handles the end of <code>nonterminals</code>: a sentence consisting of
     * one token and no nonterminals gets that token attached directly to the
     * root with the edge label "--".
     */
    private void attachSingleTokenToRoot(PhraseStructure phraseStructure) throws MaltChainedException {
        if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0
                && ((NonTerminalNode) phraseStructure.getPhraseStructureRoot()).nChildren() == 0) {
            final Edge edge = phraseStructure.addPhraseStructureEdge(phraseStructure.getPhraseStructureRoot(), phraseStructure.getTokenNode(1));
            final SortedMap<String, SymbolTable> tables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
            for (String name : tables.keySet()) {
                edge.addLabel(tables.get(name), "--");
            }
        }
    }

    /**
     * Extracts the numeric part of a node id of the form <code>123</code> or
     * <code>prefix_123</code>.
     *
     * @param id attribute value, may be <code>null</code> if the attribute is missing
     * @param attribute attribute name used in the error message
     * @param element element name used in the error message
     * @return the non-negative numeric id
     * @throws MaltChainedException if the id is missing, not numeric or negative
     */
    private int parseNodeId(String id, String attribute, String element) throws MaltChainedException {
        if (id != null) {
            final int separator = id.indexOf('_');
            try {
                final int value = Integer.parseInt(separator == -1 ? id : id.substring(separator + 1));
                if (value >= 0) {
                    return value;
                }
            } catch (NumberFormatException ignored) {
                // Reported below together with missing and negative ids.
            }
        }
        throw new SyntaxGraphException("The tiger reader couldn't recognize the " + attribute + " attribute '" + id + "' of the " + element + " element. ");
    }

    /**
     * Maps a data format column name to its XML attribute name. Lower-casing
     * is done with a fixed locale so that names containing 'I' are not
     * affected by the default locale.
     */
    private static String attributeName(String columnName) {
        return columnName.toLowerCase(Locale.ROOT);
    }

    /**
     * @return number of sentences containing tokens read since the input was
     *         last opened or the counter was last set
     */
    public int getSentenceCount() {
        return sentenceCount;
    }

    /**
     * @param sentenceCount new value of the sentence counter
     */
    public void setSentenceCount(int sentenceCount) {
        this.sentenceCount = sentenceCount;
    }

    /**
     * @return the underlying XML stream reader, or <code>null</code> if no input is open
     */
    public XMLStreamReader getReader() {
        return reader;
    }

    /**
     * @param reader XML stream reader to pull events from
     */
    public void setReader(XMLStreamReader reader) {
        this.reader = reader;
    }

    /**
     * Does nothing; there is no epilog to consume.
     *
     * @throws MaltChainedException never thrown by this implementation
     */
    public void readEpilog() throws MaltChainedException {

    }

    /**
     * Detaches the reader from its input. Unless the input is
     * <code>System.in</code>, both the XML stream reader and the underlying
     * character stream are closed; closing only the XML stream reader would
     * leave the file or URL stream open.
     *
     * @throws MaltChainedException if closing the input fails
     */
    public void close() throws MaltChainedException {
        final XMLStreamReader xmlReader = reader;
        final BufferedReader source = input;
        reader = null;
        input = null;
        if (!closeStream) {
            return;
        }

        DataFormatException failure = null;
        if (xmlReader != null) {
            try {
                xmlReader.close();
            } catch (XMLStreamException e) {
                failure = new DataFormatException("The XML input file could not be closed. ", e);
            }
        }
        // Always attempt to release the stream, even if the parser failed to close.
        if (source != null) {
            try {
                source.close();
            } catch (IOException e) {
                if (failure == null) {
                    failure = new DataFormatException("The XML input file could not be closed. ", e);
                }
            }
        }
        if (failure != null) {
            throw failure;
        }
    }

    /**
     * @return the data format instance that supplies the symbol tables
     */
    public DataFormatInstance getDataFormatInstance() {
        return dataFormatInstance;
    }

    /**
     * @param inputDataFormatInstance data format instance that supplies the
     *        symbol tables for tokens, nonterminals and edges
     */
    public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
        this.dataFormatInstance = inputDataFormatInstance;
    }

    /**
     * @return the option string last passed to {@link #setOptions(String)}
     */
    public String getOptions() {
        return optionString;
    }

    /**
     * Parses reader options given as flag/value pairs separated by blanks or
     * underscores. The only supported flag is <code>-s</code>, the first
     * numeric id that denotes a nonterminal. A <code>null</code> or empty
     * option string leaves the settings unchanged.
     *
     * @param optionString for example <code>"-s 500"</code>
     * @throws MaltChainedException if a flag does not start with '-', has no
     *         value, is unknown, or if the value of <code>-s</code> is not an
     *         integer
     */
    public void setOptions(String optionString) throws MaltChainedException {
        this.optionString = optionString;
        if (optionString == null) {
            return;
        }
        String[] argv;
        try {
            argv = optionString.split("[_\\p{Blank}]");
        } catch (PatternSyntaxException e) {
            throw new DataFormatException("Could not split the TigerXML reader option '" + optionString + "'. ", e);
        }

        int i = nextNonEmpty(argv, 0);
        while (i < argv.length) {
            final String flag = argv[i];
            if (flag.charAt(0) != '-') {
                throw new DataFormatException("The argument flag should start with the following character '-', not with " + flag.charAt(0));
            }
            i = nextNonEmpty(argv, i + 1);
            if (i >= argv.length) {
                throw new DataFormatException("The last argument does not have any value. ");
            }
            final String value = argv[i];
            i = nextNonEmpty(argv, i + 1);

            // A bare "-" has no flag letter; report it as an unknown parameter.
            final char option = flag.length() > 1 ? flag.charAt(1) : '\0';
            switch (option) {
            case 's':
                try {
                    startIdOfNonTerminals = Integer.parseInt(value);
                } catch (NumberFormatException e) {
                    throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
                }
                break;
            default:
                throw new DataFormatException("Unknown TigerXMLReader parameter: '" + flag + "' with value '" + value + "'. ");
            }
        }
    }

    /**
     * Returns the index of the first non-empty token at or after
     * <code>from</code>, or <code>tokens.length</code> if there is none.
     * Empty tokens arise from leading or repeated separators.
     */
    private static int nextNonEmpty(String[] tokens, int from) {
        int index = from;
        while (index < tokens.length && tokens[index].length() == 0) {
            index++;
        }
        return index;
    }

    /**
     * @return name of the file used by the last file-based open, or <code>null</code>
     */
    public String getFileName() {
        return fileName;
    }

    /**
     * @param fileName name of the file to use when the input is reopened
     */
    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    /**
     * @return URL used by the last URL-based open, or <code>null</code>
     */
    public URL getUrl() {
        return url;
    }

    /**
     * @param url URL to use when the input is reopened
     */
    public void setUrl(URL url) {
        this.url = url;
    }

    /**
     * @return name of the character encoding used by the last file or URL open
     */
    public String getCharsetName() {
        return charsetName;
    }

    /**
     * @param charsetName name of the character encoding to use when the input is reopened
     */
    public void setCharsetName(String charsetName) {
        this.charsetName = charsetName;
    }

    /**
     * @return how many times the input is read in total
     */
    public int getNIterations() {
        return nIterations;
    }

    /**
     * @param iterations how many times the input should be read in total
     */
    public void setNIterations(int iterations) {
        nIterations = iterations;
    }

    /**
     * @return number of the iteration currently being read, starting at 1
     */
    public int getIterationCounter() {
        return cIterations;
    }
}