package org.maltparser.core.syntaxgraph.reader;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Iterator;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.PatternSyntaxException;

import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.io.dataformat.ColumnDescription;
import org.maltparser.core.io.dataformat.DataFormatException;
import org.maltparser.core.io.dataformat.DataFormatInstance;
import org.maltparser.core.symbol.SymbolTableHandler;
import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
import org.maltparser.core.syntaxgraph.PhraseStructure;
import org.maltparser.core.syntaxgraph.TokenStructure;
import org.maltparser.core.syntaxgraph.edge.Edge;
import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;

/**
 * Reads phrase structure graphs, one sentence per call, from a line-oriented
 * input in which
 * <ul>
 * <li>a line starting with <code>#BOS</code> opens a sentence and a line
 *     starting with <code>#EOS</code> closes it,</li>
 * <li>inside a sentence, a line starting with <code>#</code> followed by a
 *     digit describes a non-terminal node and every other line describes a
 *     terminal (token) node,</li>
 * <li>the fields of a node line are separated by tab characters and are
 *     interpreted through the columns of the configured
 *     {@link DataFormatInstance}.</li>
 * </ul>
 * The input can be read several times in a row (see
 * {@link #setNIterations(int)}) when it was opened from a file name or a URL.
 *
 * @author Johan Hall
 */
public class NegraReader implements SyntaxGraphReader {
    private enum NegraTables {
        ORIGIN, EDITOR, WORDTAG, MORPHTAG, NODETAG, EDGETAG, SECEDGETAG, SENTENCE, UNDEF
    }

    private BufferedReader reader;
    private DataFormatInstance dataFormatInstance;
    private int sentenceCount;
    private String optionString;
    private int formatVersion;
    private NegraTables currentHeaderTable;
    private int currentTerminalSize;
    private int currentNonTerminalSize;
    /** Non-terminal nodes (and the root, under key 0) of the current sentence, keyed by their index in the input. */
    private SortedMap<Integer,PhraseStructureNode> nonterminals;
    /** Symbol of the most recently read edge label field. */
    private StringBuilder edgelabelSymbol;
    /** Column name of the most recently read phrase structure edge label field. */
    private StringBuilder edgelabelTableName;
    /** Input index of the first non-terminal; configurable with the <code>-s</code> option. */
    private int startIdOfNonterminals = 500;
    private String fileName = null;
    private URL url = null;
    private String charsetName;
    private int nIterations;
    private int cIterations;
    /** <code>false</code> when the underlying stream is <code>System.in</code> and must be left open. */
    private boolean closeStream = true;

    /**
     * Creates a reader that makes a single pass over its input.
     */
    public NegraReader() {
        currentHeaderTable = NegraTables.UNDEF;
        edgelabelSymbol = new StringBuilder();
        edgelabelTableName = new StringBuilder();
        nonterminals = new TreeMap<Integer,PhraseStructureNode>();
        nIterations = 1;
        cIterations = 1;
    }

    /**
     * Closes the current input and opens it again from the remembered file
     * name or URL.
     *
     * @throws MaltChainedException if neither a file name nor a URL is known,
     *         or if closing/opening fails
     */
    private void reopen() throws MaltChainedException {
        close();
        if (fileName != null) {
            open(fileName, charsetName);
        } else if (url != null) {
            open(url, charsetName);
        } else {
            throw new DataFormatException("The input stream cannot be reopen. ");
        }
    }

    /**
     * Opens the named file for reading and remembers the name and charset so
     * that the input can be reopened.
     *
     * @param fileName path of the input file
     * @param charsetName name of the character encoding of the file
     * @throws MaltChainedException if the file cannot be found or the
     *         encoding is not supported
     */
    public void open(String fileName, String charsetName) throws MaltChainedException {
        setFileName(fileName);
        setCharsetName(charsetName);
        final FileInputStream in;
        try {
            in = new FileInputStream(fileName);
        } catch (FileNotFoundException e) {
            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
        }
        openOrRelease(in, charsetName);
    }

    /**
     * Opens the URL for reading and remembers the URL and charset so that the
     * input can be reopened.
     *
     * @param url location of the input
     * @param charsetName name of the character encoding of the input
     * @throws MaltChainedException if the URL cannot be opened or the
     *         encoding is not supported
     */
    public void open(URL url, String charsetName) throws MaltChainedException {
        setUrl(url);
        setCharsetName(charsetName);
        final InputStream in;
        try {
            in = url.openStream();
        } catch (IOException e) {
            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
        }
        openOrRelease(in, charsetName);
    }

    /**
     * Opens the given stream for reading. The stream is closed by
     * {@link #close()} unless it is <code>System.in</code>.
     *
     * @param is the stream to read from
     * @param charsetName name of the character encoding of the stream
     * @throws MaltChainedException if the encoding is not supported
     */
    public void open(InputStream is, String charsetName) throws MaltChainedException {
        try {
            // Recomputed on every open: a reader that once wrapped System.in
            // must still close a file or URL stream that is opened afterwards.
            closeStream = (is != System.in);
            open(new InputStreamReader(is, charsetName));
        } catch (UnsupportedEncodingException e) {
            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
        }
    }

    /** Installs a buffered reader on top of <code>isr</code> and resets the sentence counter. */
    private void open(InputStreamReader isr) throws MaltChainedException {
        setReader(new BufferedReader(isr));
        setSentenceCount(0);
    }

    /**
     * Opens a stream this class created itself and closes it again when the
     * open fails, so that the stream does not leak.
     */
    private void openOrRelease(InputStream in, String charsetName) throws MaltChainedException {
        boolean opened = false;
        try {
            open(in, charsetName);
            opened = true;
        } finally {
            if (!opened) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // The failure that prevented the open is the one reported to the caller.
                }
            }
        }
    }

    /** Does nothing; this reader does not process a prolog. */
    public void readProlog() throws MaltChainedException {

    }

    /**
     * Reads the next sentence into <code>syntaxGraph</code>, which is cleared
     * first.
     * <p>
     * Reading stops at the first line starting with <code>#EOS</code> or at
     * the end of the input. At the end of the input the reader reopens the
     * input and returns <code>true</code> as long as the configured number of
     * iterations has not been reached; the graph may then be empty.
     *
     * @param syntaxGraph the graph to fill; must be a {@link PhraseStructure}
     * @return <code>true</code> if a sentence was closed by <code>#EOS</code>
     *         or the input was reopened for another iteration;
     *         <code>false</code> if <code>syntaxGraph</code> is
     *         <code>null</code> or not a {@link PhraseStructure}, or if the
     *         end of the last iteration was reached
     * @throws MaltChainedException if the reader is not open, the input
     *         cannot be read, or a numeric field is malformed
     */
    public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
        if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) {
            return false;
        }
        if (reader == null) {
            throw new DataFormatException("The Negra reader has no open input to read from. ");
        }
        syntaxGraph.clear();
        final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
        final SymbolTableHandler symbolTables = phraseStructure.getSymbolTables();
        currentHeaderTable = NegraTables.UNDEF;
        nonterminals.clear();
        try {
            while (true) {
                final String line = reader.readLine();
                if (line == null) {
                    return finishAtEndOfInput(syntaxGraph, phraseStructure);
                } else if (line.startsWith("#EOS")) {
                    currentTerminalSize = 0;
                    currentNonTerminalSize = 0;
                    currentHeaderTable = NegraTables.UNDEF;
                    updateMapping(syntaxGraph, phraseStructure);
                    return true;
                } else if (line.startsWith("#BOS")) {
                    currentHeaderTable = NegraTables.SENTENCE;
                    readSentenceId(line, phraseStructure);
                    sentenceCount++;
                } else if (currentHeaderTable == NegraTables.SENTENCE) {
                    final boolean nonTerminalLine = line.length() >= 2 && line.charAt(0) == '#'
                            && Character.isDigit(line.charAt(1));
                    readNodeLine(line, nonTerminalLine, syntaxGraph, phraseStructure, symbolTables);
                } else if (line.startsWith("#EOT")) {
                    currentHeaderTable = NegraTables.UNDEF;
                }
                // Every other line outside a sentence (for example lines starting
                // with "%%", "#FORMAT" or "#BOT") is ignored.
            }
        } catch (IOException e) {
            throw new DataFormatException("Error when reading from the input file. ", e);
        }
    }

    /**
     * Handles the end of the input: finalizes a sentence that was not closed
     * by <code>#EOS</code> and starts the next iteration if one is left.
     *
     * @return <code>true</code> if the input was reopened for another iteration
     */
    private boolean finishAtEndOfInput(TokenStructure syntaxGraph, PhraseStructure phraseStructure) throws MaltChainedException {
        if (syntaxGraph.hasTokens()) {
            // Tokens are only created after a "#BOS" line, which has already
            // counted this sentence, so sentenceCount is not incremented again.
            updateMapping(syntaxGraph, phraseStructure);
        }
        // An unterminated sentence must not shift the token numbering of the
        // first sentence of the next iteration.
        currentTerminalSize = 0;
        currentNonTerminalSize = 0;
        if (cIterations < nIterations) {
            cIterations++;
            reopen();
            return true;
        }
        return false;
    }

    /** Updates the dependency graph of a mappable graph from its phrase structure; no-op for other graphs. */
    private void updateMapping(TokenStructure syntaxGraph, PhraseStructure phraseStructure) throws MaltChainedException {
        if (syntaxGraph instanceof MappablePhraseStructureGraph) {
            final MappablePhraseStructureGraph mappable = (MappablePhraseStructureGraph)syntaxGraph;
            mappable.getMapping().updateDependenyGraph(mappable, phraseStructure.getPhraseStructureRoot());
        }
    }

    /**
     * Sets the sentence ID from a <code>#BOS</code> line. Scanning starts at
     * offset 5; the ID runs from the first digit to the first space, or to
     * the end of the line when no space follows. No ID is set when a space
     * is met before any digit.
     */
    private void readSentenceId(String line, PhraseStructure phraseStructure) throws MaltChainedException {
        int s = -1;
        int e = -1;
        for (int i = 5, n = line.length(); i < n; i++) {
            final char c = line.charAt(i);
            if (s == -1 && Character.isDigit(c)) {
                s = i;
            }
            if (c == ' ') {
                e = i;
                break;
            }
        }
        if (s != -1 && e == -1) {
            e = line.length(); // the ID is the last thing on the line
        }
        if (s != -1 && s < e) {
            phraseStructure.setSentenceID(parseIndex(line.substring(s, e), line));
        }
    }

    /**
     * Reads one terminal or non-terminal line of the current sentence.
     * <p>
     * The line is cut at tab characters (empty fields are skipped) and the
     * n-th field is interpreted with the n-th column of the data format; when
     * the line has more fields than the format has columns, the last column
     * is reused for the surplus fields.
     *
     * @param line the input line
     * @param nonTerminalLine <code>true</code> if the line describes a
     *        non-terminal node, <code>false</code> for a token
     */
    private void readNodeLine(String line, boolean nonTerminalLine, TokenStructure syntaxGraph,
            PhraseStructure phraseStructure, SymbolTableHandler symbolTables) throws MaltChainedException {
        if (dataFormatInstance == null) {
            throw new DataFormatException("The data format instance of the Negra reader has not been set. ");
        }
        final Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
        ColumnDescription column = null;
        // The node described by this line. For a non-terminal line it is
        // resolved from the field at column position 0.
        PhraseStructureNode child = null;
        if (nonTerminalLine) {
            currentNonTerminalSize++;
        } else {
            currentTerminalSize++;
            child = syntaxGraph.addTokenNode(currentTerminalSize);
        }
        // Secondary edges come as (label, index) pairs of fields.
        boolean secondaryLabelRead = false;
        int start = 0;
        for (int i = 0, n = line.length(); i < n; i++) {
            final boolean atTab = line.charAt(i) == '\t';
            if (atTab && start == i) {
                start++; // empty field
                continue;
            }
            if (!atTab && i != n - 1) {
                continue; // still inside a field
            }
            if (columns.hasNext()) {
                column = columns.next();
            }
            if (column == null) {
                throw new DataFormatException("The data format of the Negra reader does not define any columns. ");
            }
            // A field ends just before a tab, or at the end of the line.
            final String field = atTab ? line.substring(start, i) : line.substring(start);
            if (nonTerminalLine && column.getPosition() == 0) {
                // Drop the leading '#' of the non-terminal index.
                child = lookupNonTerminalChild(phraseStructure, parseIndex(field.substring(1), line));
            } else if (nonTerminalLine && column.getPosition() == 2 && child != null) {
                syntaxGraph.addLabel(child, "CAT", field);
            } else if (!nonTerminalLine && column.getCategory() == ColumnDescription.INPUT && child != null) {
                syntaxGraph.addLabel(child, column.getName(), field);
            } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL
                    && (nonTerminalLine || child != null)) {
                // Remembered until the parent index field of the same line is read.
                edgelabelSymbol.setLength(0);
                edgelabelSymbol.append(field);
                edgelabelTableName.setLength(0);
                edgelabelTableName.append(column.getName());
            } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
                attachToParent(syntaxGraph, phraseStructure, child, parseIndex(field, line));
            } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
                if (!secondaryLabelRead) {
                    edgelabelSymbol.setLength(0);
                    edgelabelSymbol.append(field);
                    secondaryLabelRead = true;
                } else {
                    final PhraseStructureNode source = resolveSecondaryEdgeSource(phraseStructure, parseIndex(field, line));
                    final Edge e = phraseStructure.addSecondaryEdge(source, child);
                    e.addLabel(symbolTables.getSymbolTable(column.getName()), edgelabelSymbol.toString());
                    secondaryLabelRead = false;
                }
            }
            start = i + 1;
        }
    }

    /**
     * Returns the node registered for a non-terminal line with the given
     * index, creating and registering it when the index is not 0 and not yet
     * known.
     *
     * @return the node, or <code>null</code> for an unregistered index 0
     */
    private PhraseStructureNode lookupNonTerminalChild(PhraseStructure phraseStructure, int index) throws MaltChainedException {
        PhraseStructureNode node = nonterminals.get(index);
        if (node == null && index != 0) {
            node = phraseStructure.addNonTerminalNode(index-startIdOfNonterminals+1);
            nonterminals.put(index, node);
        }
        return node;
    }

    /**
     * Adds a phrase structure edge from the node with index
     * <code>parentIndex</code> to <code>child</code>, labeled with the most
     * recently read edge label. Index 0 denotes the phrase structure root;
     * an unknown non-zero index creates a new non-terminal node.
     */
    private void attachToParent(TokenStructure syntaxGraph, PhraseStructure phraseStructure,
            PhraseStructureNode child, int parentIndex) throws MaltChainedException {
        PhraseStructureNode parent = nonterminals.get(parentIndex);
        if (parent == null) {
            if (parentIndex == 0) {
                parent = phraseStructure.getPhraseStructureRoot();
            } else {
                parent = phraseStructure.addNonTerminalNode(parentIndex-startIdOfNonterminals+1);
            }
            nonterminals.put(parentIndex, parent);
        }
        final Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
        syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
    }

    /**
     * Resolves the node a secondary edge starts from: the root for index 0,
     * a token node for an index below the first non-terminal index, and
     * otherwise the (possibly newly created) non-terminal with that index.
     */
    private PhraseStructureNode resolveSecondaryEdgeSource(PhraseStructure phraseStructure, int index) throws MaltChainedException {
        PhraseStructureNode source;
        if (index == 0) {
            source = phraseStructure.getPhraseStructureRoot();
        } else if (index < startIdOfNonterminals) {
            source = phraseStructure.getTokenNode(index);
        } else {
            source = nonterminals.get(index);
            if (source == null) {
                source = phraseStructure.addNonTerminalNode(index-startIdOfNonterminals+1);
                nonterminals.put(index, source);
            }
        }
        return source;
    }

    /**
     * Parses a numeric field, reporting malformed input as a
     * {@link DataFormatException} instead of an unchecked
     * {@link NumberFormatException}.
     *
     * @param text the field to parse
     * @param line the complete input line, used in the error message
     */
    private static int parseIndex(String text, String line) throws MaltChainedException {
        try {
            return Integer.parseInt(text);
        } catch (NumberFormatException e) {
            throw new DataFormatException("The value '"+text+"' in the input line '"+line+"' is not an integer. ", e);
        }
    }

    /** Does nothing; this reader does not process an epilog. */
    public void readEpilog() throws MaltChainedException {

    }

    /** @return the reader the input is read from, or <code>null</code> if none is open */
    public BufferedReader getReader() {
        return reader;
    }

    /** @param reader the reader to read the input from */
    public void setReader(BufferedReader reader) {
        this.reader = reader;
    }

    /** @return the number of sentences started since the input was (re)opened */
    public int getSentenceCount() {
        return sentenceCount;
    }

    /** @param sentenceCount the new value of the sentence counter */
    public void setSentenceCount(int sentenceCount) {
        this.sentenceCount = sentenceCount;
    }

    /** @return the format version; it is only changed through {@link #setFormatVersion(int)} */
    public int getFormatVersion() {
        return formatVersion;
    }

    /** @param formatVersion the format version to store */
    public void setFormatVersion(int formatVersion) {
        this.formatVersion = formatVersion;
    }

    /** @return the data format whose columns are used to interpret the fields of a line */
    public DataFormatInstance getDataFormatInstance() {
        return dataFormatInstance;
    }

    /** @param inputDataFormatInstance the data format whose columns are used to interpret the fields of a line */
    public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
        this.dataFormatInstance = inputDataFormatInstance;
    }

    /** @return the option string as last passed to {@link #setOptions(String)} */
    public String getOptions() {
        return optionString;
    }

    /**
     * Parses the reader options. The string is a sequence of
     * <code>-flag value</code> pairs separated by a blank or an underscore.
     * The only supported flag is <code>-s</code>, the integer index of the
     * first non-terminal (500 unless set). A <code>null</code> or blank
     * string leaves the settings untouched.
     *
     * @param optionString the options to parse
     * @throws MaltChainedException if a flag does not start with
     *         <code>-</code>, is unknown, has no value, or has a value that
     *         is not an integer
     */
    public void setOptions(String optionString) throws MaltChainedException {
        this.optionString = optionString;
        if (optionString == null) {
            return;
        }
        final String trimmed = optionString.trim();
        if (trimmed.length() == 0) {
            return;
        }
        final String[] argv;
        try {
            argv = trimmed.split("[_\\p{Blank}]");
        } catch (PatternSyntaxException e) {
            throw new DataFormatException("Could not split the Negra reader option '"+optionString+"'. ", e);
        }
        // Arguments come in (flag, value) pairs.
        for (int i = 0; i < argv.length; i += 2) {
            final String flag = argv[i];
            if (flag.length() == 0) {
                throw new DataFormatException("The argument flag should start with the following character '-', not with an empty argument");
            }
            if (flag.charAt(0) != '-') {
                throw new DataFormatException("The argument flag should start with the following character '-', not with "+flag.charAt(0));
            }
            if (i + 1 >= argv.length) {
                throw new DataFormatException("The last argument does not have any value. ");
            }
            final String value = argv[i + 1];
            // A bare "-" has no option character and is reported as unknown.
            final char option = (flag.length() > 1) ? flag.charAt(1) : '\0';
            switch (option) {
            case 's':
                try {
                    startIdOfNonterminals = Integer.parseInt(value);
                } catch (NumberFormatException e) {
                    throw new DataFormatException("The Negra reader option -s must be an integer value. ", e);
                }
                break;
            default:
                throw new DataFormatException("Unknown NegraReader parameter: '"+flag+"' with value '"+value+"'. ");
            }
        }
    }

    /** @return the name of the input file, or <code>null</code> if the input was not opened by file name */
    public String getFileName() {
        return fileName;
    }

    /** @param fileName the name of the input file, used when the input is reopened */
    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    /** @return the URL of the input, or <code>null</code> if the input was not opened by URL */
    public URL getUrl() {
        return url;
    }

    /** @param url the URL of the input, used when the input is reopened and no file name is set */
    public void setUrl(URL url) {
        this.url = url;
    }

    /** @return the name of the character encoding used when the input is reopened */
    public String getCharsetName() {
        return charsetName;
    }

    /** @param charsetName the name of the character encoding used when the input is reopened */
    public void setCharsetName(String charsetName) {
        this.charsetName = charsetName;
    }

    /** @return the number of passes to make over the input */
    public int getNIterations() {
        return nIterations;
    }

    /** @param iterations the number of passes to make over the input */
    public void setNIterations(int iterations) {
        nIterations = iterations;
    }

    /** @return the number of the current pass over the input, starting at 1 */
    public int getIterationCounter() {
        return cIterations;
    }

    /**
     * Releases the reader. The underlying stream is closed unless it is
     * <code>System.in</code>. The reader is released even when closing fails.
     *
     * @throws MaltChainedException if closing the input fails
     */
    public void close() throws MaltChainedException {
        final BufferedReader current = reader;
        reader = null;
        if (current != null && closeStream) {
            try {
                current.close();
            } catch (IOException e) {
                throw new DataFormatException("Error when closing the input file.", e);
            }
        }
    }
}