001 package org.maltparser.core.syntaxgraph.reader; 002 003 import java.io.BufferedReader; 004 import java.io.FileInputStream; 005 import java.io.FileNotFoundException; 006 import java.io.IOException; 007 import java.io.InputStream; 008 import java.io.InputStreamReader; 009 import java.io.UnsupportedEncodingException; 010 import java.net.URL; 011 import java.util.Iterator; 012 import java.util.SortedMap; 013 import java.util.TreeMap; 014 import java.util.regex.PatternSyntaxException; 015 016 import org.maltparser.core.exception.MaltChainedException; 017 import org.maltparser.core.io.dataformat.ColumnDescription; 018 import org.maltparser.core.io.dataformat.DataFormatException; 019 import org.maltparser.core.io.dataformat.DataFormatInstance; 020 import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph; 021 import org.maltparser.core.syntaxgraph.PhraseStructure; 022 import org.maltparser.core.syntaxgraph.TokenStructure; 023 import org.maltparser.core.syntaxgraph.edge.Edge; 024 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode; 025 026 /** 027 * 028 * 029 * @author Johan Hall 030 */ 031 public class NegraReader implements SyntaxGraphReader { 032 private enum NegraTables { 033 ORIGIN, EDITOR, WORDTAG, MORPHTAG, NODETAG, EDGETAG, SECEDGETAG, SENTENCE, UNDEF 034 }; 035 private BufferedReader reader; 036 private DataFormatInstance dataFormatInstance; 037 private int sentenceCount; 038 private String optionString; 039 private int formatVersion; 040 private NegraTables currentHeaderTable; 041 private int currentTerminalSize; 042 private int currentNonTerminalSize; 043 private SortedMap<Integer,PhraseStructureNode> nonterminals; 044 private StringBuilder edgelabelSymbol; 045 private StringBuilder edgelabelTableName; 046 private int START_ID_OF_NONTERMINALS = 500; 047 private String fileName = null; 048 private URL url = null; 049 private String charsetName; 050 private int nIterations; 051 private int cIterations; 052 053 public NegraReader() { 054 currentHeaderTable = NegraTables.UNDEF; 055 edgelabelSymbol = new StringBuilder(); 056 edgelabelTableName = new StringBuilder(); 057 nonterminals = new TreeMap<Integer,PhraseStructureNode>(); 058 nIterations = 1; 059 cIterations = 1; 060 } 061 062 private void reopen() throws MaltChainedException { 063 close(); 064 if (fileName != null) { 065 open(fileName, charsetName); 066 } else if (url != null) { 067 open(url, charsetName); 068 } else { 069 throw new DataFormatException("The input stream cannot be reopen. "); 070 } 071 } 072 073 public void open(String fileName, String charsetName) throws MaltChainedException { 074 setFileName(fileName); 075 setCharsetName(charsetName); 076 try { 077 open(new FileInputStream(fileName), charsetName); 078 } catch (FileNotFoundException e) { 079 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e); 080 } 081 } 082 public void open(URL url, String charsetName) throws MaltChainedException { 083 setUrl(url); 084 setCharsetName(charsetName); 085 try { 086 open(url.openStream(), charsetName); 087 } catch (IOException e) { 088 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e); 089 } 090 } 091 092 public void open(InputStream is, String charsetName) throws MaltChainedException { 093 try { 094 open(new InputStreamReader(is, charsetName)); 095 } catch (UnsupportedEncodingException e) { 096 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e); 097 } 098 } 099 100 public void open(InputStreamReader isr) throws MaltChainedException { 101 setReader(new BufferedReader(isr)); 102 setSentenceCount(0); 103 } 104 105 public void readProlog() throws MaltChainedException { 106 107 } 108 109 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException { 110 if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) { 111 return false; 112 } 113 syntaxGraph.clear(); 114 final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph; 115 PhraseStructureNode parent = null; 116 PhraseStructureNode child = null; 117 currentHeaderTable = NegraTables.UNDEF; 118 String line = null; 119 syntaxGraph.clear(); 120 nonterminals.clear(); 121 try { 122 while (true) { 123 line = reader.readLine(); 124 if (line == null) { 125 if (syntaxGraph.hasTokens()) { 126 sentenceCount++; 127 if (syntaxGraph instanceof MappablePhraseStructureGraph) { 128 ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot()); 129 } 130 } 131 if (cIterations < nIterations) { 132 cIterations++; 133 reopen(); 134 return true; 135 } 136 return false; 137 } else if (line.startsWith("#EOS")) { 138 currentTerminalSize = 0; 139 currentNonTerminalSize = 0; 140 currentHeaderTable = NegraTables.UNDEF; 141 if (syntaxGraph instanceof MappablePhraseStructureGraph) { 142 ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot()); 143 } 144 return true; 145 } else if (line.startsWith("#BOS")) { 146 currentHeaderTable = NegraTables.SENTENCE; 147 int s = -1, e = -1; 148 for (int i = 5, n = line.length(); i < n; i++) { 149 if (Character.isDigit(line.charAt(i)) && s == -1) { 150 s = i; 151 } 152 if (line.charAt(i) == ' ') { 153 e = i; 154 break; 155 } 156 } 157 if (s != e && s != -1 && e != -1) { 158 phraseStructure.setSentenceID(Integer.parseInt(line.substring(s,e))); 159 } 160 sentenceCount++; 161 } else if (currentHeaderTable == NegraTables.SENTENCE) { 162 if (line.length() >= 2 && line.charAt(0) == '#' && Character.isDigit(line.charAt(1))) { // Non-terminal 163 Iterator<ColumnDescription> columns = dataFormatInstance.iterator(); 164 ColumnDescription column = null; 165 currentNonTerminalSize++; 166 char[] lineChars = line.toCharArray(); 167 int start = 0; 168 int secedgecounter = 0; 169 for (int i = 0, n = lineChars.length; i < n; i++) { 170 if (lineChars[i] == '\t' && start == i) { 171 start++; 172 } else if (lineChars[i] == '\t' || i == n - 1) { 173 if (columns.hasNext()) { 174 column = columns.next(); 175 } 176 if (column.getPosition() == 0) { 177 int index = Integer.parseInt((i == n - 1)?line.substring(start+1):line.substring(start+1, i)); 178 child = nonterminals.get(index); 179 if (child == null) { 180 if (index != 0) { 181 child = ((PhraseStructure)syntaxGraph).addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1); 182 } 183 nonterminals.put(index,child); 184 } 185 } else if (column.getPosition() == 2 && child != null) { 186 syntaxGraph.addLabel(child, "CAT", (i == n - 1)?line.substring(start):line.substring(start, i)); 187 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) { 188 edgelabelSymbol.setLength(0); 189 edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i)); 190 edgelabelTableName.setLength(0); 191 edgelabelTableName.append(column.getName()); 192 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) { 193 int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i)); 194 parent = nonterminals.get(index); 195 if (parent == null) { 196 if (index == 0) { 197 parent = phraseStructure.getPhraseStructureRoot(); 198 } else { 199 parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1); 200 } 201 nonterminals.put(index,parent); 202 } 203 Edge e = phraseStructure.addPhraseStructureEdge(parent, child); 204 syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString()); 205 } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) { 206 if (secedgecounter % 2 == 0) { 207 edgelabelSymbol.setLength(0); 208 edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i)); 209 secedgecounter++; 210 } else { 211 int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i)); 212 if (index == 0) { 213 parent = phraseStructure.getPhraseStructureRoot(); 214 } else if (index < START_ID_OF_NONTERMINALS) { 215 parent = phraseStructure.getTokenNode(index); 216 } else { 217 parent = nonterminals.get(index); 218 if (parent == null) { 219 parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1); 220 nonterminals.put(index,parent); 221 } 222 } 223 Edge e = phraseStructure.addSecondaryEdge(parent, child); 224 e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString()); 225 secedgecounter++; 226 } 227 } 228 start = i + 1; 229 } 230 } 231 } else { // Terminal 232 Iterator<ColumnDescription> columns = dataFormatInstance.iterator(); 233 ColumnDescription column = null; 234 235 currentTerminalSize++; 236 child = syntaxGraph.addTokenNode(currentTerminalSize); 237 char[] lineChars = line.toCharArray(); 238 int start = 0; 239 int secedgecounter = 0; 240 for (int i = 0, n = lineChars.length; i < n; i++) { 241 if (lineChars[i] == '\t' && start == i) { 242 start++; 243 } else if (lineChars[i] == '\t' || i == n - 1) { 244 if (columns.hasNext()) { 245 column = columns.next(); 246 } 247 if (column.getCategory() == ColumnDescription.INPUT && child != null) { 248 syntaxGraph.addLabel(child, column.getName(), (i == n - 1)?line.substring(start):line.substring(start, i)); 249 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL && child != null) { // && column.getName().equals("EDGELABEL")) { 250 edgelabelSymbol.setLength(0); 251 edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i)); 252 edgelabelTableName.setLength(0); 253 edgelabelTableName.append(column.getName()); 254 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) { 255 int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i)); 256 parent = nonterminals.get(index); 257 if (parent == null) { 258 if (index == 0) { 259 parent = phraseStructure.getPhraseStructureRoot(); 260 } else { 261 parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1); 262 } 263 nonterminals.put(index,parent); 264 } 265 266 Edge e = phraseStructure.addPhraseStructureEdge(parent, child); 267 syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString()); 268 } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) { 269 if (secedgecounter % 2 == 0) { 270 edgelabelSymbol.setLength(0); 271 edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i)); 272 secedgecounter++; 273 } else { 274 int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i)); 275 if (index == 0) { 276 parent = phraseStructure.getPhraseStructureRoot(); 277 } else if (index < START_ID_OF_NONTERMINALS) { 278 parent = phraseStructure.getTokenNode(index); 279 } else { 280 parent = nonterminals.get(index); 281 if (parent == null) { 282 parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1); 283 nonterminals.put(index,parent); 284 } 285 } 286 Edge e = phraseStructure.addSecondaryEdge(parent, child); 287 e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString()); 288 secedgecounter++; 289 } 290 } 291 start = i + 1; 292 } 293 } 294 } 295 } else if (line.startsWith("%%")) { // comment skip 296 297 } else if (line.startsWith("#FORMAT")) { 298 // int index = line.indexOf(' '); 299 // if (index > -1) { 300 // try { 301 // formatVersion = Integer.parseInt(line.substring(index+1)); 302 // } catch (NumberFormatException e) { 303 // 304 // } 305 // } 306 } else if (line.startsWith("#BOT")) { 307 // int index = line.indexOf(' '); 308 // if (index > -1) { 309 // if (line.substring(index+1).equals("ORIGIN")) { 310 // currentHeaderTable = NegraTables.ORIGIN; 311 // } else if (line.substring(index+1).equals("EDITOR")) { 312 // currentHeaderTable = NegraTables.EDITOR; 313 // } else if (line.substring(index+1).equals("WORDTAG")) { 314 // currentHeaderTable = NegraTables.WORDTAG; 315 // } else if (line.substring(index+1).equals("MORPHTAG")) { 316 // currentHeaderTable = NegraTables.MORPHTAG; 317 // } else if (line.substring(index+1).equals("NODETAG")) { 318 // currentHeaderTable = NegraTables.NODETAG; 319 // } else if (line.substring(index+1).equals("EDGETAG")) { 320 // currentHeaderTable = NegraTables.EDGETAG; 321 // } else if (line.substring(index+1).equals("SECEDGETAG")) { 322 // currentHeaderTable = NegraTables.SECEDGETAG; 323 // } else { 324 // currentHeaderTable = NegraTables.UNDEF; 325 // } 326 // } 327 } else if (line.startsWith("#EOT")) { 328 currentHeaderTable = NegraTables.UNDEF; 329 } 330 } 331 } catch (IOException e) { 332 throw new DataFormatException("Error when reading from the input file. ", e); 333 } 334 } 335 336 public void readEpilog() throws MaltChainedException { 337 338 } 339 340 public BufferedReader getReader() { 341 return reader; 342 } 343 344 public void setReader(BufferedReader reader) { 345 this.reader = reader; 346 } 347 348 public int getSentenceCount() { 349 return sentenceCount; 350 } 351 352 public void setSentenceCount(int sentenceCount) { 353 this.sentenceCount = sentenceCount; 354 } 355 356 public int getFormatVersion() { 357 return formatVersion; 358 } 359 360 public void setFormatVersion(int formatVersion) { 361 this.formatVersion = formatVersion; 362 } 363 364 public DataFormatInstance getDataFormatInstance() { 365 return dataFormatInstance; 366 } 367 368 public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) { 369 this.dataFormatInstance = inputDataFormatInstance; 370 } 371 372 public String getOptions() { 373 return optionString; 374 } 375 376 public void setOptions(String optionString) throws MaltChainedException { 377 this.optionString = optionString; 378 379 String[] argv; 380 try { 381 argv = optionString.split("[_\\p{Blank}]"); 382 } catch (PatternSyntaxException e) { 383 throw new DataFormatException("Could not split the penn writer option '"+optionString+"'. ", e); 384 } 385 for (int i=0; i < argv.length-1; i++) { 386 if(argv[i].charAt(0) != '-') { 387 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0)); 388 } 389 if(++i>=argv.length) { 390 throw new DataFormatException("The last argument does not have any value. "); 391 } 392 switch(argv[i-1].charAt(1)) { 393 case 's': 394 try { 395 START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]); 396 } catch (NumberFormatException e){ 397 throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. "); 398 } 399 break; 400 default: 401 throw new DataFormatException("Unknown NegraReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. "); 402 } 403 } 404 } 405 406 public String getFileName() { 407 return fileName; 408 } 409 410 public void setFileName(String fileName) { 411 this.fileName = fileName; 412 } 413 414 public URL getUrl() { 415 return url; 416 } 417 418 public void setUrl(URL url) { 419 this.url = url; 420 } 421 422 public String getCharsetName() { 423 return charsetName; 424 } 425 426 public void setCharsetName(String charsetName) { 427 this.charsetName = charsetName; 428 } 429 430 public int getNIterations() { 431 return nIterations; 432 } 433 434 public void setNIterations(int iterations) { 435 nIterations = iterations; 436 } 437 438 public int getIterationCounter() { 439 return cIterations; 440 } 441 442 public void close() throws MaltChainedException { 443 try { 444 if (reader != null) { 445 reader.close(); 446 reader = null; 447 } 448 } catch (IOException e) { 449 throw new DataFormatException("Error when closing the input file.", e); 450 } 451 } 452 }