001 package org.maltparser.core.syntaxgraph.reader; 002 003 import java.io.BufferedReader; 004 import java.io.FileInputStream; 005 import java.io.FileNotFoundException; 006 import java.io.IOException; 007 import java.io.InputStream; 008 import java.io.InputStreamReader; 009 import java.io.UnsupportedEncodingException; 010 import java.net.URL; 011 import java.util.Iterator; 012 import java.util.SortedMap; 013 014 import org.maltparser.core.exception.MaltChainedException; 015 import org.maltparser.core.io.dataformat.ColumnDescription; 016 import org.maltparser.core.io.dataformat.DataFormatException; 017 import org.maltparser.core.io.dataformat.DataFormatInstance; 018 import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph; 019 import org.maltparser.core.syntaxgraph.PhraseStructure; 020 import org.maltparser.core.syntaxgraph.TokenStructure; 021 import org.maltparser.core.syntaxgraph.edge.Edge; 022 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode; 023 import org.maltparser.core.syntaxgraph.node.TokenNode; 024 /** 025 * 026 * 027 * @author Johan Hall 028 */ 029 public class BracketReader implements SyntaxGraphReader { 030 private BufferedReader reader; 031 private DataFormatInstance dataFormatInstance; 032 private int sentenceCount; 033 private StringBuilder input; 034 private int terminalCounter; 035 private int nonTerminalCounter; 036 private String optionString; 037 private SortedMap<String,ColumnDescription> inputColumns; 038 private SortedMap<String,ColumnDescription> edgeLabelColumns; 039 private SortedMap<String,ColumnDescription> phraseLabelColumns; 040 041 private String fileName = null; 042 private URL url = null; 043 private String charsetName; 044 private int nIterations; 045 private int cIterations; 046 private boolean closeStream = true; 047 048 private char STARTING_BRACKET = '('; 049 private char CLOSING_BRACKET = ')'; 050 private char INPUT_SEPARATOR = ' '; 051 private char EDGELABEL_SEPARATOR = '-'; 052 private char SENTENCE_SEPARATOR = '\n'; 053 private char BLANK = ' '; 054 private char CARRIAGE_RETURN = '\r'; 055 private char TAB = '\t'; 056 057 public BracketReader() { 058 input = new StringBuilder(); 059 nIterations = 1; 060 cIterations = 1; 061 } 062 063 private void reopen() throws MaltChainedException { 064 close(); 065 if (fileName != null) { 066 open(fileName, charsetName); 067 } else if (url != null) { 068 open(url, charsetName); 069 } else { 070 throw new DataFormatException("The input stream cannot be reopen. "); 071 } 072 } 073 074 public void open(String fileName, String charsetName) throws MaltChainedException { 075 setFileName(fileName); 076 setCharsetName(charsetName); 077 try { 078 open(new FileInputStream(fileName), charsetName); 079 }catch (FileNotFoundException e) { 080 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e); 081 } 082 } 083 public void open(URL url, String charsetName) throws MaltChainedException { 084 setUrl(url); 085 setCharsetName(charsetName); 086 try { 087 open(url.openStream(), charsetName); 088 } catch (IOException e) { 089 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e); 090 } 091 } 092 093 public void open(InputStream is, String charsetName) throws MaltChainedException { 094 try { 095 if (is == System.in) { 096 closeStream = false; 097 } 098 open(new InputStreamReader(is, charsetName)); 099 } catch (UnsupportedEncodingException e) { 100 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e); 101 } 102 } 103 104 private void open(InputStreamReader isr) throws MaltChainedException { 105 setReader(new BufferedReader(isr)); 106 setSentenceCount(0); 107 } 108 109 public void readProlog() throws MaltChainedException { 110 111 } 112 113 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException { 114 if (syntaxGraph == null || dataFormatInstance == null) { 115 return false; 116 } 117 syntaxGraph.clear(); 118 syntaxGraph.getSymbolTables().cleanUp(); 119 int brackets = 0; 120 try { 121 int l = reader.read(); 122 char c; 123 input.setLength(0); 124 125 while (true) { 126 if (l == -1) { 127 input.setLength(0); 128 return false; 129 } 130 131 c = (char)l; 132 l = reader.read(); 133 134 if (c == SENTENCE_SEPARATOR || c == CARRIAGE_RETURN || c == TAB || c == -1) { 135 136 } else if (c == STARTING_BRACKET) { 137 input.append(c); 138 brackets++; 139 } else if (c == CLOSING_BRACKET) { 140 input.append(c); 141 brackets--; 142 } else if (c == INPUT_SEPARATOR) { 143 if (l != STARTING_BRACKET && l != CLOSING_BRACKET && l != INPUT_SEPARATOR && l != SENTENCE_SEPARATOR && l != CARRIAGE_RETURN && l != TAB && l != -1) { 144 input.append(c); 145 } 146 // Start BracketProgLangReader 147 } else if (c == '\\') { 148 c = (char) l; 149 l = reader.read(); 150 if (c != ' ' && c != '(' && c != ')' && c != '\\' && c != 'n' && c != 'r' && c != 't' && c != '\"' && c != '\'') { 151 // System.out.println("Error"); 152 System.exit(1); 153 } else { 154 input.append("\\" + c); 155 } 156 // End BracketProgLangReader 157 } else if (brackets != 0){ 158 input.append(c); 159 } 160 if (brackets == 0 && input.length() != 0) { 161 sentenceCount++; 162 terminalCounter = 1; 163 nonTerminalCounter = 1; 164 if (syntaxGraph instanceof PhraseStructure) { 165 bracketing((PhraseStructure)syntaxGraph, 0, input.length(), null); 166 if (syntaxGraph instanceof MappablePhraseStructureGraph) { 167 ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot()); 168 } 169 } 170 return true; 171 } 172 173 if (c == -1) { 174 if (brackets != 0) { 175 close(); 176 throw new MaltChainedException("Error when reading from the input file. "); 177 } 178 if (cIterations < nIterations) { 179 cIterations++; 180 reopen(); 181 return true; 182 } 183 return false; 184 } 185 } 186 } catch (IOException e) { 187 close(); 188 throw new MaltChainedException("Error when reading from the input file. ", e); 189 } 190 191 } 192 193 private void bracketing(PhraseStructure phraseStructure, int start, int end, PhraseStructureNode parent) throws MaltChainedException { 194 int bracketsdepth = 0; 195 int startpos = start-1; 196 for (int i = start, n = end; i < n; i++) { 197 if (input.charAt(i) == STARTING_BRACKET 198 // Start BracketProgLangReader 199 && (i == 0 || input.charAt(i - 1) != '\\') 200 // end BracketProgLangReader 201 202 ) { 203 if (bracketsdepth == 0) { 204 startpos = i; 205 } 206 bracketsdepth++; 207 } else if (input.charAt(i) == CLOSING_BRACKET 208 // Start BracketProgLangReader 209 && (i == 0 || input.charAt(i - 1) != '\\') 210 // end BracketProgLangReader 211 ) { 212 bracketsdepth--; 213 if (bracketsdepth == 0) { 214 extract(phraseStructure, startpos+1, i, parent); 215 } 216 } 217 } 218 } 219 220 private void extract(PhraseStructure phraseStructure, int begin, int end, PhraseStructureNode parent) throws MaltChainedException { 221 int index = -1; 222 for (int i = begin; i < end; i++) { 223 if (input.charAt(i) == STARTING_BRACKET 224 // Start BracketProgLangReader 225 && (i == begin || input.charAt(i - 1) != '\\') 226 // end BracketProgLangReader 227 ) { 228 index = i; 229 break; 230 } 231 } 232 if (index == -1) { 233 TokenNode t = phraseStructure.addTokenNode(terminalCounter); 234 if (t == null) { 235 close(); 236 throw new MaltChainedException("Bracket Reader error: could not create a terminal node. "); 237 } 238 239 terminalCounter++; 240 Edge e = null; 241 242 if (parent != null) { 243 e = phraseStructure.addPhraseStructureEdge(parent, (PhraseStructureNode)t); 244 } else { 245 close(); 246 throw new MaltChainedException("Bracket Reader error: could not find the parent node. "); 247 } 248 249 int start = begin; 250 251 Iterator<String> inputColumnsIterator = inputColumns.keySet().iterator(); 252 Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator(); 253 boolean noneNode = false; 254 boolean edgeLabels = false; 255 for (int i = begin; i < end; i++) { 256 if (input.charAt(i) == EDGELABEL_SEPARATOR || (input.charAt(i) == INPUT_SEPARATOR 257 // Start BracketProgLangReader 258 && (i == begin || input.charAt(i - 1) != '\\') 259 // end BracketProgLangReader 260 ) || i == end - 1) { 261 if (i == begin && input.charAt(i) == EDGELABEL_SEPARATOR) { 262 noneNode = true; 263 } else if (start == begin) { 264 if ((noneNode && input.charAt(i) != EDGELABEL_SEPARATOR) || !noneNode) { 265 if (inputColumnsIterator.hasNext()) { 266 t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(), 267 268 // Start BracketProgLangReader 269 decodeString( 270 // end BracketProgLangReader 271 (i == end - 1)?input.substring(start,end):input.substring(start, i) 272 // Start BracketProgLangReader 273 ) 274 // end BracketProgLangReader 275 ); 276 } 277 start = i + 1; 278 if (input.charAt(i) == EDGELABEL_SEPARATOR) { 279 edgeLabels = true; 280 } 281 } 282 } else if (edgeLabels && e != null) { 283 if (edgeLabelsColumnsIterator.hasNext()) { 284 e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i)); 285 } 286 start = i + 1; 287 if (input.charAt(i) == INPUT_SEPARATOR 288 // Start BracketProgLangReader 289 && (i == begin || input.charAt(i - 1) != '\\') 290 // end BracketProgLangReader 291 ) { 292 edgeLabels = false; 293 } 294 } else if (input.charAt(i) == EDGELABEL_SEPARATOR && i != end - 1 && (input.charAt(i+1) != INPUT_SEPARATOR 295 // Start BracketProgLangReader 296 && (i == begin || input.charAt(i - 1) != '\\') 297 // end BracketProgLangReader 298 ) 299 ) { 300 } else { 301 if (inputColumnsIterator.hasNext()) { 302 t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i)); 303 } 304 start = i + 1; 305 } 306 } 307 } 308 } else { 309 PhraseStructureNode nt; 310 Edge e = null; 311 if (parent == null) { 312 nt = phraseStructure.getPhraseStructureRoot(); 313 } else { 314 nt = phraseStructure.addNonTerminalNode(nonTerminalCounter); 315 if (nt == null) { 316 close(); 317 throw new MaltChainedException("Bracket Reader error: could not create a nonterminal node. "); 318 } 319 nonTerminalCounter++; 320 321 e = phraseStructure.addPhraseStructureEdge(parent, nt); 322 } 323 Iterator<String> phraseLabelColumnsIterator = phraseLabelColumns.keySet().iterator(); 324 Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator(); 325 int newbegin = begin; 326 int start = begin; 327 328 for (int i = begin; i < index; i++) { 329 if (input.charAt(i) == EDGELABEL_SEPARATOR || i == index - 1) { 330 if (start == newbegin) { 331 if (phraseLabelColumnsIterator.hasNext()) { 332 nt.addLabel(phraseLabelColumns.get(phraseLabelColumnsIterator.next()).getSymbolTable(), (i == index - 1)?input.substring(start,index):input.substring(start, i)); 333 } 334 start = i + 1; 335 } else if (e != null) { 336 if (edgeLabelsColumnsIterator.hasNext()) { 337 e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == index - 1)?input.substring(start,index):input.substring(start, i)); 338 } 339 start = i + 1; 340 } 341 } else if (input.charAt(i) == BLANK) { 342 start++; 343 newbegin++; 344 } 345 } 346 347 bracketing(phraseStructure, index, end, nt); 348 } 349 } 350 351 private String decodeString(String string) { 352 return string.replace("\\(", "(").replace("\\)", ")").replace("\\ ", " "); 353 } 354 355 public void readEpilog() throws MaltChainedException { 356 357 } 358 359 public BufferedReader getReader() { 360 return reader; 361 } 362 363 public void setReader(BufferedReader reader) { 364 this.reader = reader; 365 } 366 367 public int getSentenceCount() throws MaltChainedException { 368 return sentenceCount; 369 } 370 371 public void setSentenceCount(int sentenceCount) { 372 this.sentenceCount = sentenceCount; 373 } 374 375 public DataFormatInstance getDataFormatInstance() { 376 return dataFormatInstance; 377 } 378 379 public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) { 380 this.dataFormatInstance = inputDataFormatInstance; 381 inputColumns = dataFormatInstance.getInputColumnDescriptions(); 382 edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions(); 383 phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions(); 384 } 385 386 public String getOptions() { 387 return optionString; 388 } 389 390 public void setOptions(String optionString) throws MaltChainedException { 391 this.optionString = optionString; 392 } 393 394 public String getFileName() { 395 return fileName; 396 } 397 398 public void setFileName(String fileName) { 399 this.fileName = fileName; 400 } 401 402 public URL getUrl() { 403 return url; 404 } 405 406 public void setUrl(URL url) { 407 this.url = url; 408 } 409 410 public String getCharsetName() { 411 return charsetName; 412 } 413 414 public void setCharsetName(String charsetName) { 415 this.charsetName = charsetName; 416 } 417 418 public int getNIterations() { 419 return nIterations; 420 } 421 422 public void setNIterations(int iterations) { 423 nIterations = iterations; 424 } 425 426 public int getIterationCounter() { 427 return cIterations; 428 } 429 430 public void close() throws MaltChainedException { 431 try { 432 if (reader != null) { 433 if (closeStream) { 434 reader.close(); 435 } 436 reader = null; 437 } 438 } catch (IOException e) { 439 throw new DataFormatException("Error when closing the input file.", e); 440 } 441 } 442 }