package org.maltparser.core.syntaxgraph.reader;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Iterator;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.PatternSyntaxException;

import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.io.dataformat.ColumnDescription;
import org.maltparser.core.io.dataformat.DataFormatException;
import org.maltparser.core.io.dataformat.DataFormatInstance;
import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
import org.maltparser.core.syntaxgraph.PhraseStructure;
import org.maltparser.core.syntaxgraph.TokenStructure;
import org.maltparser.core.syntaxgraph.edge.Edge;
import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;

/**
 * Reads phrase structure sentences from a line-oriented, tab-separated input.
 *
 * <p>A sentence starts at a line beginning with <code>#BOS</code> and ends at a line
 * beginning with <code>#EOS</code>. Inside a sentence, a line that starts with
 * <code>#</code> followed by a digit describes a non-terminal node; every other line
 * describes a terminal (token). The fields of each line are matched, in order, against
 * the columns of the configured {@link DataFormatInstance}.</p>
 *
 * <p>Lines outside a sentence (comments, <code>#FORMAT</code>, <code>#BOT</code> tables)
 * are skipped without being interpreted.</p>
 *
 * @author Johan Hall
 */
public class NegraReader implements SyntaxGraphReader {
    private enum NegraTables {
        ORIGIN, EDITOR, WORDTAG, MORPHTAG, NODETAG, EDGETAG, SECEDGETAG, SENTENCE, UNDEF
    }

    private BufferedReader reader;
    private DataFormatInstance dataFormatInstance;
    private int sentenceCount;
    private String optionString;
    private int formatVersion;
    /** SENTENCE while between #BOS and #EOS, UNDEF otherwise. */
    private NegraTables currentHeaderTable;
    private int currentTerminalSize;
    private int currentNonTerminalSize;
    /** Non-terminal nodes of the current sentence, keyed by the id used in the input. */
    private SortedMap<Integer,PhraseStructureNode> nonterminals;
    /** Most recently read edge label; consumed when the edge itself is created. */
    private StringBuilder edgelabelSymbol;
    /** Column name belonging to {@link #edgelabelSymbol}. */
    private StringBuilder edgelabelTableName;
    /** First id that denotes a non-terminal in the input; configurable with option -s. */
    private int START_ID_OF_NONTERMINALS = 500;
    private String fileName = null;
    private URL url = null;
    private String charsetName;
    private int nIterations;
    private int cIterations;
    /** False when the underlying stream is System.in, which must not be closed. */
    private boolean closeStream = true;

    public NegraReader() {
        currentHeaderTable = NegraTables.UNDEF;
        edgelabelSymbol = new StringBuilder();
        edgelabelTableName = new StringBuilder();
        nonterminals = new TreeMap<Integer,PhraseStructureNode>();
        nIterations = 1;
        cIterations = 1;
    }

    /**
     * Closes the current input and opens it again from the remembered file name or URL.
     *
     * @throws MaltChainedException if neither a file name nor a URL is known
     */
    private void reopen() throws MaltChainedException {
        close();
        if (fileName != null) {
            open(fileName, charsetName);
        } else if (url != null) {
            open(url, charsetName);
        } else {
            throw new DataFormatException("The input stream cannot be reopen. ");
        }
    }

    /**
     * Opens the given file for reading.
     *
     * @param fileName    path of the input file
     * @param charsetName name of the character encoding of the file
     * @throws MaltChainedException if the file cannot be found or the encoding is unsupported
     */
    public void open(String fileName, String charsetName) throws MaltChainedException {
        setFileName(fileName);
        setCharsetName(charsetName);
        final InputStream is;
        try {
            is = new FileInputStream(fileName);
        } catch (FileNotFoundException e) {
            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
        }
        openOwnedStream(is, charsetName);
    }

    /**
     * Opens the given URL for reading.
     *
     * @param url         location of the input
     * @param charsetName name of the character encoding of the input
     * @throws MaltChainedException if the URL cannot be opened or the encoding is unsupported
     */
    public void open(URL url, String charsetName) throws MaltChainedException {
        setUrl(url);
        setCharsetName(charsetName);
        final InputStream is;
        try {
            is = url.openStream();
        } catch (IOException e) {
            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
        }
        openOwnedStream(is, charsetName);
    }

    /**
     * Starts reading from the given stream. The stream is closed by {@link #close()}
     * unless it is <code>System.in</code>.
     *
     * @param is          stream to read from
     * @param charsetName name of the character encoding of the stream
     * @throws MaltChainedException if the encoding is unsupported
     */
    public void open(InputStream is, String charsetName) throws MaltChainedException {
        final InputStreamReader isr;
        try {
            isr = new InputStreamReader(is, charsetName);
        } catch (UnsupportedEncodingException e) {
            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
        }
        // Decide per stream, and only once the reader exists, so an earlier System.in
        // session does not prevent later file streams from being closed.
        closeStream = (is != System.in);
        open(isr);
    }

    /**
     * Opens a stream that this reader created itself, closing it again if opening fails
     * so that the underlying file or connection does not leak.
     */
    private void openOwnedStream(InputStream is, String charsetName) throws MaltChainedException {
        boolean opened = false;
        try {
            open(is, charsetName);
            opened = true;
        } finally {
            if (!opened) {
                try {
                    is.close();
                } catch (IOException ignored) {
                    // The failure that prevented opening is the one worth reporting.
                }
            }
        }
    }

    private void open(InputStreamReader isr) throws MaltChainedException {
        setReader(new BufferedReader(isr));
        setSentenceCount(0);
    }

    /** Does nothing; this reader has no prolog to consume. */
    public void readProlog() throws MaltChainedException {

    }

    /**
     * Reads the next sentence into the given graph.
     *
     * <p>The graph is cleared first. Reading stops at the next <code>#EOS</code> line or
     * at the end of the input. At the end of the input the reader reopens its source and
     * returns <code>true</code> as long as the iteration counter is below the configured
     * number of iterations.</p>
     *
     * @param syntaxGraph graph to fill; must be a {@link PhraseStructure}
     * @return <code>true</code> if an <code>#EOS</code> line was reached or the input was
     *         reopened for another iteration; <code>false</code> if the graph is
     *         <code>null</code> or not a phrase structure, or the input is exhausted
     * @throws MaltChainedException if reading fails or a numeric field cannot be parsed
     */
    public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
        if (!(syntaxGraph instanceof PhraseStructure)) { // also false for null
            return false;
        }
        final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
        // Node described by the most recently read line; see readNonTerminalLine.
        PhraseStructureNode child = null;
        String line = null;
        currentHeaderTable = NegraTables.UNDEF;
        syntaxGraph.clear();
        syntaxGraph.getSymbolTables().cleanUp();
        nonterminals.clear();
        try {
            while (true) {
                line = reader.readLine();
                if (line == null) {
                    return finishAtEndOfInput(syntaxGraph);
                }
                if (line.startsWith("#EOS")) {
                    currentTerminalSize = 0;
                    currentNonTerminalSize = 0;
                    currentHeaderTable = NegraTables.UNDEF;
                    updateDependencyGraph(syntaxGraph);
                    return true;
                }
                if (line.startsWith("#BOS")) {
                    currentHeaderTable = NegraTables.SENTENCE;
                    readSentenceId(phraseStructure, line);
                    sentenceCount++;
                } else if (currentHeaderTable == NegraTables.SENTENCE) {
                    if (isNonTerminalLine(line)) {
                        child = readNonTerminalLine(syntaxGraph, phraseStructure, line, child);
                    } else {
                        child = readTerminalLine(syntaxGraph, phraseStructure, line);
                    }
                } else if (line.startsWith("#EOT")) {
                    currentHeaderTable = NegraTables.UNDEF;
                }
                // Anything else outside a sentence (%% comments, #FORMAT, #BOT and the
                // rows of header tables) is skipped without being interpreted.
            }
        } catch (IOException e) {
            throw new DataFormatException("Error when reading from the input file. ", e);
        } catch (NumberFormatException e) {
            throw new DataFormatException("Could not parse a numeric field in sentence "+sentenceCount+" of the input, line: '"+line+"'. ", e);
        }
    }

    /**
     * Handles the end of the input: finalizes a sentence that was not terminated by
     * #EOS and starts the next iteration over the input if one is left.
     */
    private boolean finishAtEndOfInput(TokenStructure syntaxGraph) throws MaltChainedException {
        if (syntaxGraph.hasTokens()) {
            sentenceCount++;
            updateDependencyGraph(syntaxGraph);
        }
        if (cIterations < nIterations) {
            cIterations++;
            reopen();
            return true;
        }
        return false;
    }

    /** Lets a mappable graph update its dependency graph from the phrase structure root. */
    private void updateDependencyGraph(TokenStructure syntaxGraph) throws MaltChainedException {
        if (syntaxGraph instanceof MappablePhraseStructureGraph) {
            final MappablePhraseStructureGraph mappable = (MappablePhraseStructureGraph)syntaxGraph;
            mappable.getMapping().updateDependenyGraph(mappable, ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
        }
    }

    /**
     * Extracts the sentence id from a #BOS line: the text from the first digit at or
     * after index 5 up to the first space. The id is left untouched when the line has
     * no such digit or no space after index 5.
     */
    private void readSentenceId(PhraseStructure phraseStructure, String line) throws MaltChainedException {
        int s = -1;
        int e = -1;
        for (int i = 5, n = line.length(); i < n; i++) {
            final char c = line.charAt(i);
            if (s == -1 && Character.isDigit(c)) {
                s = i;
            }
            if (c == ' ') {
                e = i;
                break;
            }
        }
        if (s != -1 && e != -1) {
            phraseStructure.setSentenceID(Integer.parseInt(line.substring(s, e)));
        }
    }

    /** A non-terminal line starts with '#' immediately followed by a digit. */
    private static boolean isNonTerminalLine(String line) {
        return line.length() >= 2 && line.charAt(0) == '#' && Character.isDigit(line.charAt(1));
    }

    /**
     * Reads one non-terminal line of the current sentence.
     *
     * @param previous node produced by the previously read line; it stays the current
     *                 node only if no field of this line maps to the column at position 0
     * @return the node this line describes (may be <code>null</code> for id 0)
     */
    private PhraseStructureNode readNonTerminalLine(TokenStructure syntaxGraph, PhraseStructure phraseStructure,
            String line, PhraseStructureNode previous) throws MaltChainedException {
        final Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
        ColumnDescription column = null;
        PhraseStructureNode child = previous;
        currentNonTerminalSize++;
        boolean expectSecEdgeLabel = true;
        int start = 0;
        for (int i = 0, n = line.length(); i < n; i++) {
            final boolean atTab = line.charAt(i) == '\t';
            if (atTab && start == i) {
                start++; // empty field: skip the tab
                continue;
            }
            if (!atTab && i != n - 1) {
                continue; // still inside a field
            }
            // Surplus fields reuse the last column of the data format.
            if (columns.hasNext()) {
                column = columns.next();
            }
            final String field = (i == n - 1) ? line.substring(start) : line.substring(start, i);
            if (column.getPosition() == 0) {
                // The node id is written as "#<id>"; drop the leading '#'.
                final int index = Integer.parseInt(field.substring(1));
                child = nonterminals.get(index);
                if (child == null) {
                    if (index != 0) {
                        child = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
                    }
                    nonterminals.put(index, child);
                }
            } else if (column.getPosition() == 2 && child != null) {
                syntaxGraph.addLabel(child, "CAT", field);
            } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) {
                rememberEdgeLabel(column, field);
            } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
                attachToParent(syntaxGraph, phraseStructure, child, Integer.parseInt(field));
            } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
                // Secondary edges come as alternating (label, node id) fields.
                if (expectSecEdgeLabel) {
                    edgelabelSymbol.setLength(0);
                    edgelabelSymbol.append(field);
                } else {
                    addSecondaryEdge(phraseStructure, column, child, Integer.parseInt(field));
                }
                expectSecEdgeLabel = !expectSecEdgeLabel;
            }
            start = i + 1;
        }
        return child;
    }

    /**
     * Reads one terminal line of the current sentence and adds it as the next token.
     *
     * @return the token node created for this line
     */
    private PhraseStructureNode readTerminalLine(TokenStructure syntaxGraph, PhraseStructure phraseStructure,
            String line) throws MaltChainedException {
        final Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
        ColumnDescription column = null;
        currentTerminalSize++;
        final PhraseStructureNode child = syntaxGraph.addTokenNode(currentTerminalSize);
        boolean expectSecEdgeLabel = true;
        int start = 0;
        for (int i = 0, n = line.length(); i < n; i++) {
            final boolean atTab = line.charAt(i) == '\t';
            if (atTab && start == i) {
                start++; // empty field: skip the tab
                continue;
            }
            if (!atTab && i != n - 1) {
                continue; // still inside a field
            }
            // Surplus fields reuse the last column of the data format.
            if (columns.hasNext()) {
                column = columns.next();
            }
            final String field = (i == n - 1) ? line.substring(start) : line.substring(start, i);
            if (column.getCategory() == ColumnDescription.INPUT && child != null) {
                syntaxGraph.addLabel(child, column.getName(), field);
            } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL && child != null) {
                rememberEdgeLabel(column, field);
            } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
                attachToParent(syntaxGraph, phraseStructure, child, Integer.parseInt(field));
            } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
                // Secondary edges come as alternating (label, node id) fields.
                if (expectSecEdgeLabel) {
                    edgelabelSymbol.setLength(0);
                    edgelabelSymbol.append(field);
                } else {
                    addSecondaryEdge(phraseStructure, column, child, Integer.parseInt(field));
                }
                expectSecEdgeLabel = !expectSecEdgeLabel;
            }
            start = i + 1;
        }
        return child;
    }

    /** Stores an edge label and its column name until the edge itself is created. */
    private void rememberEdgeLabel(ColumnDescription column, String label) {
        edgelabelSymbol.setLength(0);
        edgelabelSymbol.append(label);
        edgelabelTableName.setLength(0);
        edgelabelTableName.append(column.getName());
    }

    /**
     * Connects <code>child</code> to the parent with the given input id, creating the
     * parent on first mention, and labels the new edge with the remembered edge label.
     * Id 0 denotes the phrase structure root.
     */
    private void attachToParent(TokenStructure syntaxGraph, PhraseStructure phraseStructure,
            PhraseStructureNode child, int parentIndex) throws MaltChainedException {
        PhraseStructureNode parent = nonterminals.get(parentIndex);
        if (parent == null) {
            if (parentIndex == 0) {
                parent = phraseStructure.getPhraseStructureRoot();
            } else {
                parent = phraseStructure.addNonTerminalNode(parentIndex-START_ID_OF_NONTERMINALS+1);
            }
            nonterminals.put(parentIndex, parent);
        }
        final Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
        syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
    }

    /**
     * Adds a secondary edge from the node with the given input id to <code>child</code>
     * and labels it with the remembered edge label. Id 0 denotes the root, ids below the
     * non-terminal start id denote tokens, all other ids denote non-terminals.
     */
    private void addSecondaryEdge(PhraseStructure phraseStructure, ColumnDescription column,
            PhraseStructureNode child, int sourceIndex) throws MaltChainedException {
        PhraseStructureNode parent;
        if (sourceIndex == 0) {
            parent = phraseStructure.getPhraseStructureRoot();
        } else if (sourceIndex < START_ID_OF_NONTERMINALS) {
            parent = phraseStructure.getTokenNode(sourceIndex);
        } else {
            parent = nonterminals.get(sourceIndex);
            if (parent == null) {
                parent = phraseStructure.addNonTerminalNode(sourceIndex-START_ID_OF_NONTERMINALS+1);
                nonterminals.put(sourceIndex, parent);
            }
        }
        final Edge e = phraseStructure.addSecondaryEdge(parent, child);
        e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString());
    }

    /** Does nothing; this reader has no epilog to consume. */
    public void readEpilog() throws MaltChainedException {

    }

    public BufferedReader getReader() {
        return reader;
    }

    public void setReader(BufferedReader reader) {
        this.reader = reader;
    }

    public int getSentenceCount() {
        return sentenceCount;
    }

    public void setSentenceCount(int sentenceCount) {
        this.sentenceCount = sentenceCount;
    }

    public int getFormatVersion() {
        return formatVersion;
    }

    public void setFormatVersion(int formatVersion) {
        this.formatVersion = formatVersion;
    }

    public DataFormatInstance getDataFormatInstance() {
        return dataFormatInstance;
    }

    public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
        this.dataFormatInstance = inputDataFormatInstance;
    }

    public String getOptions() {
        return optionString;
    }

    /**
     * Stores and applies the reader options. Options are given as flag/value pairs
     * separated by blanks or underscores; the only supported flag is <code>-s</code>,
     * the first id that denotes a non-terminal node. Empty tokens produced by leading
     * or repeated separators are ignored, and a <code>null</code> option string means
     * "no options".
     *
     * @param optionString the option string, may be <code>null</code> or empty
     * @throws MaltChainedException if a flag does not start with '-', a flag has no
     *         value, a flag is unknown, or the value of -s is not an integer
     */
    public void setOptions(String optionString) throws MaltChainedException {
        this.optionString = optionString;
        if (optionString == null) {
            return;
        }

        String[] argv;
        try {
            argv = optionString.split("[_\\p{Blank}]");
        } catch (PatternSyntaxException e) {
            throw new DataFormatException("Could not split the NegraReader option '"+optionString+"'. ", e);
        }
        int i = nextNonEmpty(argv, 0);
        while (i < argv.length) {
            final String flag = argv[i];
            if (flag.charAt(0) != '-') {
                throw new DataFormatException("The argument flag should start with the following character '-', not with "+flag.charAt(0));
            }
            i = nextNonEmpty(argv, i + 1);
            if (i >= argv.length) {
                throw new DataFormatException("The last argument does not have any value. ");
            }
            final String value = argv[i];
            if (flag.length() < 2) { // a bare "-" names no option
                throw new DataFormatException("Unknown NegraReader parameter: '"+flag+"' with value '"+value+"'. ");
            }
            switch (flag.charAt(1)) {
            case 's':
                try {
                    START_ID_OF_NONTERMINALS = Integer.parseInt(value);
                } catch (NumberFormatException e) {
                    throw new MaltChainedException("The NegraReader option -s must be an integer value, not '"+value+"'. ");
                }
                break;
            default:
                throw new DataFormatException("Unknown NegraReader parameter: '"+flag+"' with value '"+value+"'. ");
            }
            i = nextNonEmpty(argv, i + 1);
        }
    }

    /** Returns the index of the first non-empty token at or after <code>from</code>, or <code>argv.length</code>. */
    private static int nextNonEmpty(String[] argv, int from) {
        int i = from;
        while (i < argv.length && argv[i].length() == 0) {
            i++;
        }
        return i;
    }

    public String getFileName() {
        return fileName;
    }

    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    public URL getUrl() {
        return url;
    }

    public void setUrl(URL url) {
        this.url = url;
    }

    public String getCharsetName() {
        return charsetName;
    }

    public void setCharsetName(String charsetName) {
        this.charsetName = charsetName;
    }

    public int getNIterations() {
        return nIterations;
    }

    public void setNIterations(int iterations) {
        nIterations = iterations;
    }

    public int getIterationCounter() {
        return cIterations;
    }

    /**
     * Releases the current reader. The underlying stream is closed unless it is
     * <code>System.in</code>.
     *
     * @throws MaltChainedException if closing the input fails
     */
    public void close() throws MaltChainedException {
        if (reader == null) {
            return;
        }
        try {
            if (closeStream) {
                reader.close();
            }
            reader = null;
        } catch (IOException e) {
            throw new DataFormatException("Error when closing the input file.", e);
        }
    }
}