package org.maltparser.core.syntaxgraph.reader;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Locale;
import java.util.Map;
import java.util.SortedMap;
import java.util.regex.PatternSyntaxException;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.io.dataformat.DataFormatException;
import org.maltparser.core.io.dataformat.DataFormatInstance;
import org.maltparser.core.symbol.SymbolTable;
import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
import org.maltparser.core.syntaxgraph.PhraseStructure;
import org.maltparser.core.syntaxgraph.SyntaxGraphException;
import org.maltparser.core.syntaxgraph.TokenStructure;
import org.maltparser.core.syntaxgraph.edge.Edge;
import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;

/**
 * Reads phrase structure sentences from a TigerXML document using a StAX
 * pull parser.
 *
 * <p>Only the elements {@code graph}, {@code t}, {@code nt}, {@code edge},
 * {@code s} and {@code nonterminals} are interpreted; every other element
 * (including the corpus header) is skipped. Node ids may carry a prefix that
 * ends with {@code '_'} (for example {@code s1_500}); the number after the
 * first underscore is the node number. Numbers below the configured start id
 * of non-terminals (default 500, option {@code -s}) refer to terminals.</p>
 *
 * <p>The reader takes over the stream or reader handed to any of the
 * {@code open} methods: {@link #close()} closes it together with the XML
 * parser.</p>
 *
 * @author Johan Hall
 */
public class TigerXMLReader implements SyntaxGraphReader {
    /** Start id of non-terminals used when the {@code -s} option is not given. */
    private static final int DEFAULT_START_ID_OF_NONTERMINALS = 500;

    private XMLStreamReader reader;
    /** Character source behind {@link #reader}; the StAX reader never closes it itself. */
    private BufferedReader input;
    private int sentenceCount;
    private DataFormatInstance dataFormatInstance;
    /** Value of the {@code root} attribute of the most recently seen {@code graph} element. */
    private String graphRootId;
    private String optionString;
    /** Smallest node number that denotes a non-terminal. */
    private int startIdOfNonTerminals = DEFAULT_START_ID_OF_NONTERMINALS;

    public TigerXMLReader() {
    }

    /**
     * Opens the named file for reading.
     *
     * @param fileName path of the TigerXML file
     * @param charsetName name of the character encoding of the file
     * @throws MaltChainedException if the file cannot be found, the encoding is
     *         unsupported or the XML parser cannot be created
     */
    public void open(String fileName, String charsetName) throws MaltChainedException {
        final FileInputStream stream;
        try {
            stream = new FileInputStream(fileName);
        } catch (FileNotFoundException e) {
            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
        }
        openOwnedStream(stream, charsetName);
    }

    /**
     * Opens the resource behind the URL for reading.
     *
     * @param url location of the TigerXML document
     * @param charsetName name of the character encoding of the document
     * @throws MaltChainedException if the URL cannot be opened, the encoding is
     *         unsupported or the XML parser cannot be created
     */
    public void open(URL url, String charsetName) throws MaltChainedException {
        final InputStream stream;
        try {
            stream = url.openStream();
        } catch (IOException e) {
            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
        }
        openOwnedStream(stream, charsetName);
    }

    /**
     * Opens the given stream for reading.
     *
     * @param is byte stream containing the TigerXML document
     * @param charsetName name of the character encoding of the stream
     * @throws MaltChainedException if the encoding is unsupported or the XML
     *         parser cannot be created
     */
    public void open(InputStream is, String charsetName) throws MaltChainedException {
        try {
            open(new InputStreamReader(is, charsetName));
        } catch (UnsupportedEncodingException e) {
            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
        }
    }

    /**
     * Creates the XML parser on top of the given reader and resets the
     * sentence counter to zero.
     *
     * @param isr character stream containing the TigerXML document
     * @throws MaltChainedException if the XML parser cannot be created
     */
    public void open(InputStreamReader isr) throws MaltChainedException {
        final BufferedReader buffered = new BufferedReader(isr);
        try {
            final XMLInputFactory factory = XMLInputFactory.newInstance();
            // Treebank files may come from untrusted sources: never resolve external entities (XXE).
            factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
            setReader(factory.createXMLStreamReader(buffered));
        } catch (XMLStreamException e) {
            throw new DataFormatException("The XML input file could not be opened. ", e);
        }
        input = buffered;
        setSentenceCount(0);
    }

    public void readProlog() throws MaltChainedException {

    }

    /**
     * Reads the next sentence ({@code s} element) into the given graph. The
     * graph is cleared first.
     *
     * @param syntaxGraph the graph to fill; must be a {@link PhraseStructure}
     * @return {@code true} if the end of an {@code s} element was reached,
     *         {@code false} if {@code syntaxGraph} is {@code null} or not a
     *         {@link PhraseStructure}, or if the end of the document was reached
     * @throws MaltChainedException if the reader is not open, the XML cannot be
     *         parsed or a node id cannot be interpreted
     */
    public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
        // instanceof is false for null, so this also covers the null case.
        if (!(syntaxGraph instanceof PhraseStructure)) {
            return false;
        }
        if (reader == null) {
            throw new DataFormatException("The TigerXML reader has not been opened. ");
        }
        syntaxGraph.clear();
        final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
        // The nt element currently being read; its edge children attach to it.
        PhraseStructureNode parent = null;

        try {
            // hasNext() turns false after END_DOCUMENT, so repeated calls keep returning false.
            while (reader.hasNext()) {
                final int event = reader.next();
                if (event == XMLStreamConstants.START_ELEMENT) {
                    final String element = reader.getLocalName();
                    if ("edge".equals(element)) {
                        readEdge(phraseStructure, parent);
                    } else if ("nt".equals(element)) {
                        parent = readNonTerminal(phraseStructure);
                    } else if ("t".equals(element)) {
                        readTerminal(syntaxGraph);
                    } else if ("s".equals(element)) {
                        readSentenceId(phraseStructure);
                    } else if ("graph".equals(element)) {
                        graphRootId = reader.getAttributeValue(null, "root");
                    }
                } else if (event == XMLStreamConstants.END_ELEMENT) {
                    final String element = reader.getLocalName();
                    if ("nonterminals".equals(element)) {
                        attachSingleTokenToRoot(phraseStructure);
                    } else if ("s".equals(element)) {
                        if (syntaxGraph.hasTokens()) {
                            sentenceCount++;
                        }
                        if (syntaxGraph instanceof MappablePhraseStructureGraph) {
                            final MappablePhraseStructureGraph mappable = (MappablePhraseStructureGraph)syntaxGraph;
                            mappable.getMapping().updateDependenyGraph(mappable, phraseStructure.getPhraseStructureRoot());
                        }
                        return true;
                    }
                } else if (event == XMLStreamConstants.END_DOCUMENT) {
                    if (syntaxGraph.hasTokens()) {
                        sentenceCount++;
                    }
                    return false;
                }
            }
            return false;
        } catch (XMLStreamException e) {
            throw new DataFormatException("The TigerXML input could not be read. ", e);
        }
    }

    public int getSentenceCount() {
        return sentenceCount;
    }

    public void setSentenceCount(int sentenceCount) {
        this.sentenceCount = sentenceCount;
    }

    public XMLStreamReader getReader() {
        return reader;
    }

    public void setReader(XMLStreamReader reader) {
        this.reader = reader;
    }

    public void readEpilog() throws MaltChainedException {

    }

    /**
     * Closes the XML parser and the character stream it reads from. Calling
     * this method on a reader that is not open does nothing.
     *
     * @throws MaltChainedException if the parser or the stream cannot be closed
     */
    public void close() throws MaltChainedException {
        final XMLStreamReader xml = reader;
        final BufferedReader source = input;
        reader = null;
        input = null;
        try {
            if (xml != null) {
                xml.close();
            }
        } catch (XMLStreamException e) {
            closeQuietly(source);
            throw new DataFormatException("The XML input file could not be closed. ", e);
        }
        // XMLStreamReader.close() does not close the underlying input source.
        if (source != null) {
            try {
                source.close();
            } catch (IOException e) {
                throw new DataFormatException("The XML input file could not be closed. ", e);
            }
        }
    }

    public DataFormatInstance getDataFormatInstance() {
        return dataFormatInstance;
    }

    public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
        this.dataFormatInstance = inputDataFormatInstance;
    }

    public String getOptions() {
        return optionString;
    }

    /**
     * Parses the reader options. Flags and values are separated by blanks or
     * underscores. The only recognised flag is {@code -s <int>}, the smallest
     * node number that denotes a non-terminal. A {@code null} or empty option
     * string leaves the current settings untouched.
     *
     * @param optionString the option string, for example {@code "-s 1000"}
     * @throws MaltChainedException if a flag does not start with {@code '-'},
     *         has no value, is unknown, or if the value of {@code -s} is not an
     *         integer
     */
    public void setOptions(String optionString) throws MaltChainedException {
        this.optionString = optionString;
        if (optionString == null) {
            return;
        }
        String[] argv;
        try {
            argv = optionString.split("[_\\p{Blank}]");
        } catch (PatternSyntaxException e) {
            throw new DataFormatException("Could not split the TigerXML reader option '"+optionString+"'. ", e);
        }
        int i = 0;
        while (i < argv.length) {
            final String flag = argv[i++];
            if (flag.length() == 0) {
                // Leading or repeated separators produce empty tokens.
                continue;
            }
            if (flag.charAt(0) != '-') {
                throw new DataFormatException("The argument flag should start with the following character '-', not with "+flag.charAt(0));
            }
            while (i < argv.length && argv[i].length() == 0) {
                i++;
            }
            if (i >= argv.length) {
                throw new DataFormatException("The last argument does not have any value. ");
            }
            final String value = argv[i++];
            if (flag.length() < 2) {
                throw new DataFormatException("Unknown TigerXMLReader parameter: '"+flag+"' with value '"+value+"'. ");
            }
            switch (flag.charAt(1)) {
            case 's':
                try {
                    startIdOfNonTerminals = Integer.parseInt(value);
                } catch (NumberFormatException e) {
                    throw new DataFormatException("The TigerXML Reader option -s must be an integer value. ", e);
                }
                break;
            default:
                throw new DataFormatException("Unknown TigerXMLReader parameter: '"+flag+"' with value '"+value+"'. ");
            }
        }
    }

    /** Opens a stream created by this reader and closes it again if opening fails. */
    private void openOwnedStream(InputStream stream, String charsetName) throws MaltChainedException {
        boolean opened = false;
        try {
            open(stream, charsetName);
            opened = true;
        } finally {
            if (!opened) {
                closeQuietly(stream);
            }
        }
    }

    /** Closes the resource on an error path where another exception is already being reported. */
    private static void closeQuietly(Closeable resource) {
        if (resource != null) {
            try {
                resource.close();
            } catch (IOException ignored) {
                // The original failure is the one worth reporting.
            }
        }
    }

    /** Handles the start of an edge element: links the referenced node to the current nt node. */
    private void readEdge(PhraseStructure phraseStructure, PhraseStructureNode parent) throws MaltChainedException {
        final int childId = parseNodeId(reader.getAttributeValue(null, "idref"), "idref", "edge");
        final PhraseStructureNode child;
        if (childId < startIdOfNonTerminals) {
            child = phraseStructure.getTokenNode(childId);
        } else {
            child = phraseStructure.getNonTerminalNode(childId-startIdOfNonTerminals+1);
        }
        final Edge edge = phraseStructure.addPhraseStructureEdge(parent, child);
        labelEdge(edge, dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables());
    }

    /** Handles the start of an nt element and returns the node that its edges attach to. */
    private PhraseStructureNode readNonTerminal(PhraseStructure phraseStructure) throws MaltChainedException {
        final String id = reader.getAttributeValue(null, "id");
        final PhraseStructureNode node;
        if (id != null && id.equals(graphRootId)) {
            node = phraseStructure.getPhraseStructureRoot();
        } else {
            node = phraseStructure.addNonTerminalNode(parseNodeId(id, "id", "nt")-startIdOfNonTerminals+1);
        }
        labelNode(node, dataFormatInstance.getPhraseStructureNodeLabelSymbolTables());
        return node;
    }

    /** Handles the start of a t element: adds a token node labelled from the element's attributes. */
    private void readTerminal(TokenStructure syntaxGraph) throws MaltChainedException {
        final PhraseStructureNode token = syntaxGraph.addTokenNode();
        labelNode(token, dataFormatInstance.getInputSymbolTables());
    }

    /**
     * Handles the start of an s element: the sentence id is the number that
     * starts at the first digit of the id attribute, or the running sentence
     * number when the attribute holds no usable number.
     */
    private void readSentenceId(PhraseStructure phraseStructure) throws MaltChainedException {
        final String id = reader.getAttributeValue(null, "id");
        int sentenceId = sentenceCount+1;
        if (id != null) {
            int firstDigit = 0;
            while (firstDigit < id.length() && !Character.isDigit(id.charAt(firstDigit))) {
                firstDigit++;
            }
            if (firstDigit < id.length()) {
                try {
                    sentenceId = Integer.parseInt(id.substring(firstDigit));
                } catch (NumberFormatException ignored) {
                    // Not a plain number (for example "s1_a"): keep the running sentence number.
                }
            }
        }
        phraseStructure.setSentenceID(sentenceId);
    }

    /**
     * Handles the end of the nonterminals element: a sentence that consists of
     * a single token and has no phrase structure gets that token attached
     * directly to the root with the edge label "--".
     */
    private void attachSingleTokenToRoot(PhraseStructure phraseStructure) throws MaltChainedException {
        if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0
                && ((NonTerminalNode)phraseStructure.getPhraseStructureRoot()).nChildren() == 0) {
            final Edge edge = phraseStructure.addPhraseStructureEdge(phraseStructure.getPhraseStructureRoot(), phraseStructure.getTokenNode(1));
            final SortedMap<String, SymbolTable> tables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
            for (SymbolTable table : tables.values()) {
                edge.addLabel(table, "--");
            }
        }
    }

    /**
     * Extracts the node number from an id such as "s1_500" or "500".
     *
     * @throws SyntaxGraphException if the value is missing, not numeric or negative
     */
    private int parseNodeId(String value, String attribute, String element) throws MaltChainedException {
        if (value != null) {
            final int separator = value.indexOf('_');
            try {
                final int id = Integer.parseInt(separator == -1 ? value : value.substring(separator+1));
                if (id >= 0) {
                    return id;
                }
            } catch (NumberFormatException ignored) {
                // Reported below together with the missing and negative cases.
            }
        }
        throw new SyntaxGraphException("The tiger reader couldn't recognize the "+attribute+" attribute '"+value+"' of the "+element+" element. ");
    }

    /** Adds one label per symbol table to the node, read from the equally named attribute of the current element. */
    private void labelNode(PhraseStructureNode node, SortedMap<String, SymbolTable> tables) throws MaltChainedException {
        for (Map.Entry<String, SymbolTable> entry : tables.entrySet()) {
            node.addLabel(entry.getValue(), reader.getAttributeValue(null, attributeName(entry.getKey())));
        }
    }

    /** Adds one label per symbol table to the edge, read from the equally named attribute of the current element. */
    private void labelEdge(Edge edge, SortedMap<String, SymbolTable> tables) throws MaltChainedException {
        for (Map.Entry<String, SymbolTable> entry : tables.entrySet()) {
            edge.addLabel(entry.getValue(), reader.getAttributeValue(null, attributeName(entry.getKey())));
        }
    }

    /** Maps a symbol table name to its lower-case attribute name, independent of the default locale. */
    private static String attributeName(String tableName) {
        return tableName.toLowerCase(Locale.ENGLISH);
    }
}