package org.maltparser.core.syntaxgraph.reader;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Locale;
import java.util.SortedMap;
import java.util.regex.PatternSyntaxException;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.io.dataformat.DataFormatException;
import org.maltparser.core.io.dataformat.DataFormatInstance;
import org.maltparser.core.symbol.SymbolTable;
import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
import org.maltparser.core.syntaxgraph.PhraseStructure;
import org.maltparser.core.syntaxgraph.SyntaxGraphException;
import org.maltparser.core.syntaxgraph.TokenStructure;
import org.maltparser.core.syntaxgraph.edge.Edge;
import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;

/**
 * Reads phrase structure graphs from a TigerXML document with a StAX
 * {@link XMLStreamReader}. Each call to {@link #readSentence(TokenStructure)}
 * consumes events up to and including the end of the next {@code s} element.
 *
 * <p>Node references ({@code id} of {@code nt}, {@code idref} of {@code edge})
 * are read as the integer that follows the first {@code '_'}, or as the whole
 * value when there is no {@code '_'}. A number below the non-terminal start id
 * (500 unless changed with the {@code -s} option) refers to a token; any other
 * number {@code n} refers to the non-terminal with index
 * {@code n - startId + 1}.
 *
 * @author Johan Hall
 */
public class TigerXMLReader implements SyntaxGraphReader {
    private XMLStreamReader reader;
    /**
     * Character source handed to the StAX reader by {@link #open(InputStreamReader)}.
     * It is tracked separately because {@link XMLStreamReader#close()} does not
     * close the underlying input source.
     */
    private BufferedReader input;
    private int sentenceCount;
    private DataFormatInstance dataFormatInstance;
    /** Value of the {@code root} attribute of the most recent {@code graph} element. */
    private final StringBuilder graphRootID;
    private String optionString;
    private String fileName = null;
    private URL url = null;
    private String charsetName;
    private int nIterations;
    private int cIterations;
    /** Smallest numeric id that denotes a non-terminal; set by the {@code -s} option. */
    private int startIdOfNonTerminals = 500;

    /** Creates a reader that makes a single pass over its input. */
    public TigerXMLReader() {
        graphRootID = new StringBuilder();
        nIterations = 1;
        cIterations = 1;
    }

    /**
     * Closes the current input and opens it again from the remembered file
     * name or URL.
     *
     * @throws MaltChainedException if neither a file name nor a URL has been
     *         recorded, or if closing/opening fails
     */
    private void reopen() throws MaltChainedException {
        close();
        if (fileName != null) {
            open(fileName, charsetName);
        } else if (url != null) {
            open(url, charsetName);
        } else {
            throw new DataFormatException("The input stream cannot be reopened. ");
        }
    }

    /**
     * Opens the named file and remembers its name and charset for
     * {@link #reopen()}.
     *
     * @param fileName path of the TigerXML file
     * @param charsetName name of the character encoding of the file
     * @throws MaltChainedException if the file cannot be found or opened
     */
    public void open(String fileName, String charsetName) throws MaltChainedException {
        setFileName(fileName);
        setCharsetName(charsetName);
        final FileInputStream stream;
        try {
            stream = new FileInputStream(fileName);
        } catch (FileNotFoundException e) {
            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
        }
        openOwned(stream, charsetName);
    }

    /**
     * Opens the given URL and remembers it and the charset for
     * {@link #reopen()}.
     *
     * @param url location of the TigerXML document
     * @param charsetName name of the character encoding of the document
     * @throws MaltChainedException if the URL cannot be opened
     */
    public void open(URL url, String charsetName) throws MaltChainedException {
        setUrl(url);
        setCharsetName(charsetName);
        final InputStream stream;
        try {
            stream = url.openStream();
        } catch (IOException e) {
            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
        }
        openOwned(stream, charsetName);
    }

    /**
     * Opens a stream that this reader created itself, closing it again if the
     * remaining open steps fail so that the stream is not leaked.
     */
    private void openOwned(InputStream stream, String charsetName) throws MaltChainedException {
        boolean opened = false;
        try {
            open(stream, charsetName);
            opened = true;
        } finally {
            if (!opened) {
                try {
                    stream.close();
                } catch (IOException ignored) {
                    // The failure that prevented opening is already propagating.
                }
            }
        }
    }

    /**
     * Wraps the stream in a character reader for the given encoding and opens it.
     *
     * @param is byte stream containing the TigerXML document
     * @param charsetName name of the character encoding of the stream
     * @throws MaltChainedException if the encoding is not supported or the
     *         XML reader cannot be created
     */
    public void open(InputStream is, String charsetName) throws MaltChainedException {
        try {
            open(new InputStreamReader(is, charsetName));
        } catch (UnsupportedEncodingException e) {
            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
        }
    }

    /**
     * Creates the StAX reader on top of a buffered view of {@code isr} and
     * resets the sentence counter. The buffered reader is closed by
     * {@link #close()}.
     *
     * @param isr character source containing the TigerXML document
     * @throws MaltChainedException if the XML reader cannot be created
     */
    public void open(InputStreamReader isr) throws MaltChainedException {
        final BufferedReader buffered = new BufferedReader(isr);
        try {
            // NOTE(review): the factory is used with its default configuration.
            // If documents from untrusted sources are read, consider disabling
            // XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES - confirm that no
            // corpus in use relies on external entities first.
            final XMLInputFactory factory = XMLInputFactory.newInstance();
            setReader(factory.createXMLStreamReader(buffered));
        } catch (XMLStreamException e) {
            throw new DataFormatException("The XML input file could not be opened. ", e);
        }
        input = buffered;
        setSentenceCount(0);
    }

    /** Does nothing; this reader has no prolog handling. */
    public void readProlog() throws MaltChainedException {

    }

    /**
     * Reads the next sentence into {@code syntaxGraph}, which is cleared first.
     *
     * @param syntaxGraph the graph to fill; must be a {@link PhraseStructure}
     * @return {@code true} when an {@code s} element was read completely, or
     *         when the end of the document was reached and the input was
     *         reopened for another iteration (the graph is then left as it was
     *         at the end of the document); {@code false} when
     *         {@code syntaxGraph} is {@code null} or not a
     *         {@link PhraseStructure}, or when the input is exhausted
     * @throws MaltChainedException if the reader has not been opened, the XML
     *         cannot be parsed, or a node reference cannot be interpreted
     */
    public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
        if (!(syntaxGraph instanceof PhraseStructure)) {
            return false;
        }
        if (reader == null) {
            throw new DataFormatException("The TigerXML reader has not been opened. ");
        }
        syntaxGraph.clear();
        final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
        // Non-terminal whose child edges are currently being read.
        PhraseStructureNode parent = null;

        try {
            // hasNext() guards against calling next() after the document has ended.
            while (reader.hasNext()) {
                final int event = reader.next();
                if (event == XMLStreamConstants.START_ELEMENT) {
                    final String element = reader.getLocalName();
                    if (element.equals("edge")) {
                        addEdge(phraseStructure, parent);
                    } else if (element.equals("nt")) {
                        parent = startNonTerminal(phraseStructure);
                    } else if (element.equals("t")) {
                        addTerminal(syntaxGraph);
                    } else if (element.equals("s")) {
                        phraseStructure.setSentenceID(parseSentenceId(reader.getAttributeValue(null, "id")));
                    } else if (element.equals("graph")) {
                        graphRootID.setLength(0);
                        graphRootID.append(reader.getAttributeValue(null, "root"));
                    }
                } else if (event == XMLStreamConstants.END_ELEMENT) {
                    final String element = reader.getLocalName();
                    if (element.equals("nonterminals")) {
                        attachSingleTokenToRoot(phraseStructure);
                    } else if (element.equals("s")) {
                        if (syntaxGraph.hasTokens()) {
                            sentenceCount++;
                        }
                        if (syntaxGraph instanceof MappablePhraseStructureGraph) {
                            final MappablePhraseStructureGraph mappable = (MappablePhraseStructureGraph)syntaxGraph;
                            mappable.getMapping().updateDependenyGraph(mappable, phraseStructure.getPhraseStructureRoot());
                        }
                        return true;
                    }
                } else if (event == XMLStreamConstants.END_DOCUMENT) {
                    if (syntaxGraph.hasTokens()) {
                        sentenceCount++;
                    }
                    if (cIterations < nIterations) {
                        cIterations++;
                        reopen();
                        return true;
                    }
                    return false;
                }
            }
            return false;
        } catch (XMLStreamException e) {
            throw new DataFormatException("The TigerXML input could not be parsed. ", e);
        }
    }

    /**
     * Handles the start of an {@code edge} element: links the node named by
     * {@code idref} under {@code parent} and labels the new edge from the
     * element's attributes.
     */
    private void addEdge(PhraseStructure phraseStructure, PhraseStructureNode parent) throws MaltChainedException {
        final int childId = parseNodeId(reader.getAttributeValue(null, "idref"), "idref", "edge");
        final PhraseStructureNode child;
        if (childId < startIdOfNonTerminals) {
            child = phraseStructure.getTokenNode(childId);
        } else {
            child = phraseStructure.getNonTerminalNode(childId - startIdOfNonTerminals + 1);
        }

        final Edge edge = phraseStructure.addPhraseStructureEdge(parent, child);
        final SortedMap<String, SymbolTable> tables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
        for (String name : tables.keySet()) {
            edge.addLabel(tables.get(name), attributeFor(name));
        }
    }

    /**
     * Handles the start of an {@code nt} element: selects the graph root when
     * the id equals the current graph's {@code root} attribute, otherwise
     * creates the non-terminal, and labels the node from the element's
     * attributes.
     *
     * @return the node that subsequent {@code edge} elements attach to
     */
    private PhraseStructureNode startNonTerminal(PhraseStructure phraseStructure) throws MaltChainedException {
        final String id = reader.getAttributeValue(null, "id");
        final PhraseStructureNode node;
        if (id != null && graphRootID.toString().equals(id)) {
            node = phraseStructure.getPhraseStructureRoot();
        } else {
            // Ids are parsed exactly like edge idrefs so that both sides of a
            // reference are interpreted the same way.
            node = phraseStructure.addNonTerminalNode(parseNodeId(id, "id", "nt") - startIdOfNonTerminals + 1);
        }
        final SortedMap<String, SymbolTable> tables = dataFormatInstance.getPhraseStructureNodeLabelSymbolTables();
        for (String name : tables.keySet()) {
            node.addLabel(tables.get(name), attributeFor(name));
        }
        return node;
    }

    /**
     * Handles the start of a {@code t} element: appends a token and labels it
     * from the element's attributes.
     */
    private void addTerminal(TokenStructure syntaxGraph) throws MaltChainedException {
        final SortedMap<String, SymbolTable> tables = dataFormatInstance.getInputSymbolTables();
        final PhraseStructureNode token = syntaxGraph.addTokenNode();
        for (String name : tables.keySet()) {
            token.addLabel(tables.get(name), attributeFor(name));
        }
    }

    /**
     * Handles the end of {@code nonterminals}: when the sentence consists of a
     * single token, has no non-terminals and the root has no children, the
     * token is attached directly to the root with every edge label set to
     * {@code "--"}.
     */
    private void attachSingleTokenToRoot(PhraseStructure phraseStructure) throws MaltChainedException {
        if (phraseStructure.nTokenNode() != 1 || phraseStructure.nNonTerminals() != 0) {
            return;
        }
        if (((NonTerminalNode)phraseStructure.getPhraseStructureRoot()).nChildren() != 0) {
            return;
        }
        final Edge edge = phraseStructure.addPhraseStructureEdge(phraseStructure.getPhraseStructureRoot(), phraseStructure.getTokenNode(1));
        final SortedMap<String, SymbolTable> tables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
        for (String name : tables.keySet()) {
            edge.addLabel(tables.get(name), "--");
        }
    }

    /**
     * Returns the value of the current element's attribute whose name is the
     * lower-cased form of {@code name}. Locale.ROOT keeps the mapping
     * independent of the default locale.
     */
    private String attributeFor(String name) {
        return reader.getAttributeValue(null, name.toLowerCase(Locale.ROOT));
    }

    /**
     * Extracts the numeric part of a node reference: the integer after the
     * first {@code '_'}, or the whole value when there is no {@code '_'}.
     *
     * @param value the attribute value, possibly {@code null}
     * @param attribute attribute name, used in the error message
     * @param element element name, used in the error message
     * @throws MaltChainedException if the value is missing, not an integer, or -1
     */
    private static int parseNodeId(String value, String attribute, String element) throws MaltChainedException {
        if (value != null) {
            final int separator = value.indexOf('_');
            try {
                final int id = Integer.parseInt(separator == -1 ? value : value.substring(separator + 1));
                if (id != -1) {
                    return id;
                }
            } catch (NumberFormatException e) {
                // Reported below together with the missing-value and -1 cases.
            }
        }
        throw new SyntaxGraphException("The tiger reader couldn't recognize the "+attribute+" attribute '"+value+"' of the "+element+" element. ");
    }

    /**
     * Derives the sentence id from the {@code id} attribute of an {@code s}
     * element by parsing everything from the first digit onwards. Falls back
     * to {@code sentenceCount + 1} when the attribute is missing, contains no
     * digit, or the text after the first digit is not an integer.
     */
    private int parseSentenceId(String id) {
        if (id != null) {
            for (int i = 0, n = id.length(); i < n; i++) {
                if (Character.isDigit(id.charAt(i))) {
                    try {
                        return Integer.parseInt(id.substring(i));
                    } catch (NumberFormatException e) {
                        break;
                    }
                }
            }
        }
        return sentenceCount + 1;
    }

    public int getSentenceCount() {
        return sentenceCount;
    }

    public void setSentenceCount(int sentenceCount) {
        this.sentenceCount = sentenceCount;
    }

    public XMLStreamReader getReader() {
        return reader;
    }

    public void setReader(XMLStreamReader reader) {
        this.reader = reader;
    }

    /** Does nothing; this reader has no epilog handling. */
    public void readEpilog() throws MaltChainedException {

    }

    /**
     * Closes the StAX reader and the character source it was created on.
     * Both are always released; if closing either one fails the first failure
     * is reported after both have been attempted.
     *
     * @throws MaltChainedException if the XML reader or its input cannot be closed
     */
    public void close() throws MaltChainedException {
        final XMLStreamReader xml = reader;
        final BufferedReader source = input;
        reader = null;
        input = null;

        XMLStreamException xmlFailure = null;
        if (xml != null) {
            try {
                xml.close();
            } catch (XMLStreamException e) {
                xmlFailure = e;
            }
        }
        IOException ioFailure = null;
        if (source != null) {
            try {
                // XMLStreamReader.close() leaves the underlying source open.
                source.close();
            } catch (IOException e) {
                ioFailure = e;
            }
        }
        if (xmlFailure != null) {
            throw new DataFormatException("The XML input file could not be closed. ", xmlFailure);
        }
        if (ioFailure != null) {
            throw new DataFormatException("The XML input file could not be closed. ", ioFailure);
        }
    }

    public DataFormatInstance getDataFormatInstance() {
        return dataFormatInstance;
    }

    public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
        this.dataFormatInstance = inputDataFormatInstance;
    }

    public String getOptions() {
        return optionString;
    }

    /**
     * Stores and parses the reader options: a sequence of {@code -flag value}
     * pairs separated by {@code '_'} or blank characters. The only supported
     * flag is {@code -s}, the smallest numeric id that denotes a non-terminal.
     * A {@code null} or empty option string leaves the settings unchanged, and
     * empty tokens produced by consecutive separators are skipped.
     *
     * @param optionString the option string, may be {@code null}
     * @throws MaltChainedException if a flag does not start with {@code '-'},
     *         a flag has no value, the flag is unknown, or the value of
     *         {@code -s} is not an integer
     */
    public void setOptions(String optionString) throws MaltChainedException {
        this.optionString = optionString;
        if (optionString == null) {
            return;
        }
        String[] argv;
        try {
            argv = optionString.split("[_\\p{Blank}]");
        } catch (PatternSyntaxException e) {
            throw new DataFormatException("Could not split the TigerXML reader option '"+optionString+"'. ", e);
        }
        int i = 0;
        while (i < argv.length) {
            if (argv[i].length() == 0) {
                i++;
                continue;
            }
            final String flag = argv[i++];
            if (flag.charAt(0) != '-') {
                throw new DataFormatException("The argument flag should start with the following character '-', not with "+flag.charAt(0));
            }
            while (i < argv.length && argv[i].length() == 0) {
                i++;
            }
            if (i >= argv.length) {
                throw new DataFormatException("The last argument does not have any value. ");
            }
            final String value = argv[i++];
            // A bare "-" has no flag letter and is reported as unknown below.
            final char option = flag.length() > 1 ? flag.charAt(1) : '\0';
            switch (option) {
            case 's':
                try {
                    startIdOfNonTerminals = Integer.parseInt(value);
                } catch (NumberFormatException e) {
                    throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
                }
                break;
            default:
                throw new DataFormatException("Unknown TigerXMLReader parameter: '"+flag+"' with value '"+value+"'. ");
            }
        }
    }

    public String getFileName() {
        return fileName;
    }

    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    public URL getUrl() {
        return url;
    }

    public void setUrl(URL url) {
        this.url = url;
    }

    public String getCharsetName() {
        return charsetName;
    }

    public void setCharsetName(String charsetName) {
        this.charsetName = charsetName;
    }

    public int getNIterations() {
        return nIterations;
    }

    public void setNIterations(int iterations) {
        nIterations = iterations;
    }

    public int getIterationCounter() {
        return cIterations;
    }
}