001 package org.maltparser.core.syntaxgraph.reader; 002 003 import java.io.BufferedReader; 004 import java.io.FileInputStream; 005 import java.io.FileNotFoundException; 006 import java.io.IOException; 007 import java.io.InputStream; 008 import java.io.InputStreamReader; 009 import java.io.UnsupportedEncodingException; 010 import java.net.URL; 011 import java.util.Iterator; 012 import java.util.SortedMap; 013 import java.util.TreeMap; 014 import java.util.regex.PatternSyntaxException; 015 016 import org.maltparser.core.exception.MaltChainedException; 017 import org.maltparser.core.io.dataformat.ColumnDescription; 018 import org.maltparser.core.io.dataformat.DataFormatException; 019 import org.maltparser.core.io.dataformat.DataFormatInstance; 020 import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph; 021 import org.maltparser.core.syntaxgraph.PhraseStructure; 022 import org.maltparser.core.syntaxgraph.TokenStructure; 023 import org.maltparser.core.syntaxgraph.edge.Edge; 024 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode; 025 026 /** 027 * 028 * 029 * @author Johan Hall 030 */ 031 public class NegraReader implements SyntaxGraphReader { 032 private enum NegraTables { 033 ORIGIN, EDITOR, WORDTAG, MORPHTAG, NODETAG, EDGETAG, SECEDGETAG, SENTENCE, UNDEF 034 }; 035 private BufferedReader reader; 036 private DataFormatInstance dataFormatInstance; 037 private int sentenceCount; 038 private String optionString; 039 private int formatVersion; 040 private NegraTables currentHeaderTable; 041 private int currentTerminalSize; 042 private int currentNonTerminalSize; 043 private SortedMap<Integer,PhraseStructureNode> nonterminals; 044 private StringBuilder edgelabelSymbol; 045 private StringBuilder edgelabelTableName; 046 private int START_ID_OF_NONTERMINALS = 500; 047 048 public NegraReader() { 049 currentHeaderTable = NegraTables.UNDEF; 050 edgelabelSymbol = new StringBuilder(); 051 edgelabelTableName = new StringBuilder(); 052 nonterminals = new TreeMap<Integer,PhraseStructureNode>(); 053 } 054 055 public void open(String fileName, String charsetName) throws MaltChainedException { 056 try { 057 open(new FileInputStream(fileName), charsetName); 058 } catch (FileNotFoundException e) { 059 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e); 060 } 061 } 062 public void open(URL url, String charsetName) throws MaltChainedException { 063 try { 064 open(url.openStream(), charsetName); 065 } catch (IOException e) { 066 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e); 067 } 068 } 069 070 public void open(InputStream is, String charsetName) throws MaltChainedException { 071 try { 072 open(new InputStreamReader(is, charsetName)); 073 } catch (UnsupportedEncodingException e) { 074 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e); 075 } 076 } 077 078 public void open(InputStreamReader isr) throws MaltChainedException { 079 setReader(new BufferedReader(isr)); 080 setSentenceCount(0); 081 } 082 083 public void readProlog() throws MaltChainedException { 084 085 } 086 087 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException { 088 if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) { 089 return false; 090 } 091 syntaxGraph.clear(); 092 final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph; 093 PhraseStructureNode parent = null; 094 PhraseStructureNode child = null; 095 currentHeaderTable = NegraTables.UNDEF; 096 String line = null; 097 syntaxGraph.clear(); 098 nonterminals.clear(); 099 try { 100 while (true) { 101 line = reader.readLine(); 102 if (line == null) { 103 if (syntaxGraph.hasTokens()) { 104 sentenceCount++; 105 if (syntaxGraph instanceof MappablePhraseStructureGraph) { 106 ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot()); 107 } 108 } 109 return false; 110 } else if (line.startsWith("#EOS")) { 111 currentTerminalSize = 0; 112 currentNonTerminalSize = 0; 113 currentHeaderTable = NegraTables.UNDEF; 114 if (syntaxGraph instanceof MappablePhraseStructureGraph) { 115 ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot()); 116 } 117 return true; 118 } else if (line.startsWith("#BOS")) { 119 currentHeaderTable = NegraTables.SENTENCE; 120 int s = -1, e = -1; 121 for (int i = 5, n = line.length(); i < n; i++) { 122 if (Character.isDigit(line.charAt(i)) && s == -1) { 123 s = i; 124 } 125 if (line.charAt(i) == ' ') { 126 e = i; 127 break; 128 } 129 } 130 if (s != e && s != -1 && e != -1) { 131 phraseStructure.setSentenceID(Integer.parseInt(line.substring(s,e))); 132 } 133 sentenceCount++; 134 } else if (currentHeaderTable == NegraTables.SENTENCE) { 135 if (line.length() >= 2 && line.charAt(0) == '#' && Character.isDigit(line.charAt(1))) { // Non-terminal 136 Iterator<ColumnDescription> columns = dataFormatInstance.iterator(); 137 ColumnDescription column = null; 138 currentNonTerminalSize++; 139 char[] lineChars = line.toCharArray(); 140 int start = 0; 141 int secedgecounter = 0; 142 for (int i = 0, n = lineChars.length; i < n; i++) { 143 if (lineChars[i] == '\t' && start == i) { 144 start++; 145 } else if (lineChars[i] == '\t' || i == n - 1) { 146 if (columns.hasNext()) { 147 column = columns.next(); 148 } 149 if (column.getPosition() == 0) { 150 int index = Integer.parseInt((i == n - 1)?line.substring(start+1):line.substring(start+1, i)); 151 child = nonterminals.get(index); 152 if (child == null) { 153 if (index != 0) { 154 child = ((PhraseStructure)syntaxGraph).addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1); 155 } 156 nonterminals.put(index,child); 157 } 158 } else if (column.getPosition() == 2 && child != null) { 159 syntaxGraph.addLabel(child, "CAT", (i == n - 1)?line.substring(start):line.substring(start, i)); 160 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) { 161 edgelabelSymbol.setLength(0); 162 edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i)); 163 edgelabelTableName.setLength(0); 164 edgelabelTableName.append(column.getName()); 165 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) { 166 int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i)); 167 parent = nonterminals.get(index); 168 if (parent == null) { 169 if (index == 0) { 170 parent = phraseStructure.getPhraseStructureRoot(); 171 } else { 172 parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1); 173 } 174 nonterminals.put(index,parent); 175 } 176 Edge e = phraseStructure.addPhraseStructureEdge(parent, child); 177 syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString()); 178 } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) { 179 if (secedgecounter % 2 == 0) { 180 edgelabelSymbol.setLength(0); 181 edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i)); 182 secedgecounter++; 183 } else { 184 int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i)); 185 if (index == 0) { 186 parent = phraseStructure.getPhraseStructureRoot(); 187 } else if (index < START_ID_OF_NONTERMINALS) { 188 parent = phraseStructure.getTokenNode(index); 189 } else { 190 parent = nonterminals.get(index); 191 if (parent == null) { 192 parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1); 193 nonterminals.put(index,parent); 194 } 195 } 196 Edge e = phraseStructure.addSecondaryEdge(parent, child); 197 e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString()); 198 secedgecounter++; 199 } 200 } 201 start = i + 1; 202 } 203 } 204 } else { // Terminal 205 Iterator<ColumnDescription> columns = dataFormatInstance.iterator(); 206 ColumnDescription column = null; 207 208 currentTerminalSize++; 209 child = syntaxGraph.addTokenNode(currentTerminalSize); 210 char[] lineChars = line.toCharArray(); 211 int start = 0; 212 int secedgecounter = 0; 213 for (int i = 0, n = lineChars.length; i < n; i++) { 214 if (lineChars[i] == '\t' && start == i) { 215 start++; 216 } else if (lineChars[i] == '\t' || i == n - 1) { 217 if (columns.hasNext()) { 218 column = columns.next(); 219 } 220 if (column.getCategory() == ColumnDescription.INPUT && child != null) { 221 syntaxGraph.addLabel(child, column.getName(), (i == n - 1)?line.substring(start):line.substring(start, i)); 222 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL && child != null) { // && column.getName().equals("EDGELABEL")) { 223 edgelabelSymbol.setLength(0); 224 edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i)); 225 edgelabelTableName.setLength(0); 226 edgelabelTableName.append(column.getName()); 227 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) { 228 int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i)); 229 parent = nonterminals.get(index); 230 if (parent == null) { 231 if (index == 0) { 232 parent = phraseStructure.getPhraseStructureRoot(); 233 } else { 234 parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1); 235 } 236 nonterminals.put(index,parent); 237 } 238 239 Edge e = phraseStructure.addPhraseStructureEdge(parent, child); 240 syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString()); 241 } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) { 242 if (secedgecounter % 2 == 0) { 243 edgelabelSymbol.setLength(0); 244 edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i)); 245 secedgecounter++; 246 } else { 247 int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i)); 248 if (index == 0) { 249 parent = phraseStructure.getPhraseStructureRoot(); 250 } else if (index < START_ID_OF_NONTERMINALS) { 251 parent = phraseStructure.getTokenNode(index); 252 } else { 253 parent = nonterminals.get(index); 254 if (parent == null) { 255 parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1); 256 nonterminals.put(index,parent); 257 } 258 } 259 Edge e = phraseStructure.addSecondaryEdge(parent, child); 260 e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString()); 261 secedgecounter++; 262 } 263 } 264 start = i + 1; 265 } 266 } 267 } 268 } else if (line.startsWith("%%")) { // comment skip 269 270 } else if (line.startsWith("#FORMAT")) { 271 // int index = line.indexOf(' '); 272 // if (index > -1) { 273 // try { 274 // formatVersion = Integer.parseInt(line.substring(index+1)); 275 // } catch (NumberFormatException e) { 276 // 277 // } 278 // } 279 } else if (line.startsWith("#BOT")) { 280 // int index = line.indexOf(' '); 281 // if (index > -1) { 282 // if (line.substring(index+1).equals("ORIGIN")) { 283 // currentHeaderTable = NegraTables.ORIGIN; 284 // } else if (line.substring(index+1).equals("EDITOR")) { 285 // currentHeaderTable = NegraTables.EDITOR; 286 // } else if (line.substring(index+1).equals("WORDTAG")) { 287 // currentHeaderTable = NegraTables.WORDTAG; 288 // } else if (line.substring(index+1).equals("MORPHTAG")) { 289 // currentHeaderTable = NegraTables.MORPHTAG; 290 // } else if (line.substring(index+1).equals("NODETAG")) { 291 // currentHeaderTable = NegraTables.NODETAG; 292 // } else if (line.substring(index+1).equals("EDGETAG")) { 293 // currentHeaderTable = NegraTables.EDGETAG; 294 // } else if (line.substring(index+1).equals("SECEDGETAG")) { 295 // currentHeaderTable = NegraTables.SECEDGETAG; 296 // } else { 297 // currentHeaderTable = NegraTables.UNDEF; 298 // } 299 // } 300 } else if (line.startsWith("#EOT")) { 301 currentHeaderTable = NegraTables.UNDEF; 302 } 303 } 304 } catch (IOException e) { 305 throw new DataFormatException("Error when reading from the input file. ", e); 306 } 307 } 308 309 public void readEpilog() throws MaltChainedException { 310 311 } 312 313 public BufferedReader getReader() { 314 return reader; 315 } 316 317 public void setReader(BufferedReader reader) { 318 this.reader = reader; 319 } 320 321 public int getSentenceCount() { 322 return sentenceCount; 323 } 324 325 public void setSentenceCount(int sentenceCount) { 326 this.sentenceCount = sentenceCount; 327 } 328 329 public int getFormatVersion() { 330 return formatVersion; 331 } 332 333 public void setFormatVersion(int formatVersion) { 334 this.formatVersion = formatVersion; 335 } 336 337 public DataFormatInstance getDataFormatInstance() { 338 return dataFormatInstance; 339 } 340 341 public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) { 342 this.dataFormatInstance = inputDataFormatInstance; 343 } 344 345 public String getOptions() { 346 return optionString; 347 } 348 349 public void setOptions(String optionString) throws MaltChainedException { 350 this.optionString = optionString; 351 352 String[] argv; 353 try { 354 argv = optionString.split("[_\\p{Blank}]"); 355 } catch (PatternSyntaxException e) { 356 throw new DataFormatException("Could not split the penn writer option '"+optionString+"'. ", e); 357 } 358 for (int i=0; i < argv.length-1; i++) { 359 if(argv[i].charAt(0) != '-') { 360 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0)); 361 } 362 if(++i>=argv.length) { 363 throw new DataFormatException("The last argument does not have any value. "); 364 } 365 switch(argv[i-1].charAt(1)) { 366 case 's': 367 try { 368 START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]); 369 } catch (NumberFormatException e){ 370 throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. "); 371 } 372 break; 373 default: 374 throw new DataFormatException("Unknown NegraReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. "); 375 } 376 } 377 } 378 379 public void close() throws MaltChainedException { 380 try { 381 if (reader != null) { 382 reader.close(); 383 reader = null; 384 } 385 } catch (IOException e) { 386 throw new DataFormatException("Error when closing the input file.", e); 387 } 388 } 389 }