package org.maltparser.core.syntaxgraph.reader;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Iterator;
import java.util.SortedMap;

import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.io.dataformat.ColumnDescription;
import org.maltparser.core.io.dataformat.DataFormatException;
import org.maltparser.core.io.dataformat.DataFormatInstance;
import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
import org.maltparser.core.syntaxgraph.PhraseStructure;
import org.maltparser.core.syntaxgraph.TokenStructure;
import org.maltparser.core.syntaxgraph.edge.Edge;
import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
import org.maltparser.core.syntaxgraph.node.TokenNode;

/**
 * Reads sentences written in a bracketed notation, e.g. {@code (S (NP the dog) (VP barks))},
 * and builds the corresponding nodes, edges and labels in a {@link PhraseStructure}.
 *
 * <p>The reader collects one balanced bracket expression at a time into an internal buffer
 * (dropping line breaks, tabs and redundant blanks) and then walks that buffer recursively.
 * A backslash may be used to escape a blank, a bracket, a backslash, a quote or one of the
 * letters {@code n}, {@code r}, {@code t}.
 *
 * @author Johan Hall
 */
public class BracketReader implements SyntaxGraphReader {
	/** Marker for "no character has been read ahead"; distinct from the end-of-stream value -1. */
	private static final int NO_LOOKAHEAD = -2;

	private static final char STARTING_BRACKET = '(';
	private static final char CLOSING_BRACKET = ')';
	private static final char INPUT_SEPARATOR = ' ';
	private static final char EDGELABEL_SEPARATOR = '-';
	private static final char SENTENCE_SEPARATOR = '\n';
	private static final char BLANK = ' ';
	private static final char CARRIAGE_RETURN = '\r';
	private static final char TAB = '\t';

	private BufferedReader reader;
	private DataFormatInstance dataFormatInstance;
	private int sentenceCount;
	/** Normalized text of the sentence that is currently being read/parsed. */
	private StringBuilder input;
	private int terminalCounter;
	private int nonTerminalCounter;
	private String optionString;
	private SortedMap<String,ColumnDescription> inputColumns;
	private SortedMap<String,ColumnDescription> edgeLabelColumns;
	private SortedMap<String,ColumnDescription> phraseLabelColumns;

	private String fileName = null;
	private URL url = null;
	private String charsetName;
	private int nIterations;
	private int cIterations;

	/**
	 * Character that was read ahead while finishing the previous sentence and has not been
	 * processed yet, or {@link #NO_LOOKAHEAD}. Keeping it prevents the first character that
	 * follows a sentence from being silently dropped.
	 */
	private int lookahead = NO_LOOKAHEAD;

	/**
	 * Creates a reader that is not yet attached to any input and that iterates once over it.
	 */
	public BracketReader() {
		input = new StringBuilder();
		nIterations = 1;
		cIterations = 1;
	}

	/**
	 * Closes the current input and opens it again from the remembered file name or URL.
	 *
	 * @throws MaltChainedException if neither a file name nor a URL is known, or if opening fails
	 */
	private void reopen() throws MaltChainedException {
		close();
		if (fileName != null) {
			open(fileName, charsetName);
		} else if (url != null) {
			open(url, charsetName);
		} else {
			throw new DataFormatException("The input stream cannot be reopen. ");
		}
	}

	/**
	 * Opens the named file for reading and remembers the file name and charset.
	 *
	 * @param fileName path of the input file
	 * @param charsetName name of the character encoding of the file
	 * @throws MaltChainedException if the file cannot be found or the encoding is not supported
	 */
	public void open(String fileName, String charsetName) throws MaltChainedException {
		setFileName(fileName);
		setCharsetName(charsetName);
		final InputStream stream;
		try {
			stream = new FileInputStream(fileName);
		} catch (FileNotFoundException e) {
			throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
		}
		openOwnedStream(stream, charsetName);
	}

	/**
	 * Opens the given URL for reading and remembers the URL and charset.
	 *
	 * @param url location of the input
	 * @param charsetName name of the character encoding of the input
	 * @throws MaltChainedException if the URL cannot be opened or the encoding is not supported
	 */
	public void open(URL url, String charsetName) throws MaltChainedException {
		setUrl(url);
		setCharsetName(charsetName);
		final InputStream stream;
		try {
			stream = url.openStream();
		} catch (IOException e) {
			throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
		}
		openOwnedStream(stream, charsetName);
	}

	/**
	 * Opens a stream that this reader created itself and closes it again if the reader could
	 * not be set up (for example because the charset is unsupported), so it does not leak.
	 */
	private void openOwnedStream(InputStream stream, String charsetName) throws MaltChainedException {
		boolean opened = false;
		try {
			open(stream, charsetName);
			opened = true;
		} finally {
			if (!opened) {
				try {
					stream.close();
				} catch (IOException ignored) {
					// The failure that prevented opening is the one worth reporting.
				}
			}
		}
	}

	/**
	 * Wraps the given stream in a reader that uses the named charset.
	 *
	 * @param is stream to read from
	 * @param charsetName name of the character encoding of the stream
	 * @throws MaltChainedException if the encoding is not supported
	 */
	public void open(InputStream is, String charsetName) throws MaltChainedException {
		try {
			open(new InputStreamReader(is, charsetName));
		} catch (UnsupportedEncodingException e) {
			throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
		}
	}

	/**
	 * Starts reading from the given reader and resets the sentence counter to zero.
	 *
	 * @param isr reader to take the input from
	 * @throws MaltChainedException declared for interface compatibility
	 */
	public void open(InputStreamReader isr) throws MaltChainedException {
		setReader(new BufferedReader(isr));
		setSentenceCount(0);
	}

	/** Does nothing in this reader. */
	public void readProlog() throws MaltChainedException {

	}

	/**
	 * Reads the next bracketed sentence from the input into {@code syntaxGraph}.
	 *
	 * <p>The graph is cleared first. Characters are consumed until the brackets balance; the
	 * collected text is then parsed if the graph is a {@link PhraseStructure}, and for a
	 * {@link MappablePhraseStructureGraph} the dependency graph is updated through its mapping.
	 *
	 * <p>At the end of the input the method starts the next iteration (reopening the input and
	 * returning {@code true} with an empty graph) while fewer than {@code nIterations} passes
	 * have been made, and returns {@code false} otherwise.
	 *
	 * @param syntaxGraph graph that receives the sentence
	 * @return {@code true} if a sentence was read or a new iteration was started; {@code false}
	 *         if {@code syntaxGraph} or the data format instance is {@code null}, or the input
	 *         is exhausted
	 * @throws MaltChainedException if the input ends inside an unbalanced bracket expression,
	 *         contains an unsupported escape sequence, or cannot be read
	 */
	public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
		if (syntaxGraph == null || dataFormatInstance == null) {
			return false;
		}
		syntaxGraph.clear();
		int brackets = 0;
		try {
			// Resume with the character that was read ahead by the previous call, if any.
			int l = (lookahead != NO_LOOKAHEAD) ? lookahead : reader.read();
			lookahead = NO_LOOKAHEAD;
			char c;
			input.setLength(0);

			while (true) {
				if (l == -1) {
					// End of input. (A char can never equal -1, so this has to be tested on the int.)
					input.setLength(0);
					if (brackets != 0) {
						close();
						throw new MaltChainedException("Error when reading from the input file. ");
					}
					if (cIterations < nIterations) {
						cIterations++;
						reopen();
						return true;
					}
					return false;
				}

				c = (char)l;
				l = reader.read();

				if (c == SENTENCE_SEPARATOR || c == CARRIAGE_RETURN || c == TAB) {
					// Line breaks and tabs never become part of the sentence text.
				} else if (c == STARTING_BRACKET) {
					input.append(c);
					brackets++;
				} else if (c == CLOSING_BRACKET) {
					input.append(c);
					brackets--;
				} else if (c == INPUT_SEPARATOR) {
					// Keep a blank only when it is directly followed by ordinary text.
					if (l != STARTING_BRACKET && l != CLOSING_BRACKET && l != INPUT_SEPARATOR && l != SENTENCE_SEPARATOR && l != CARRIAGE_RETURN && l != TAB && l != -1) {
						input.append(c);
					}
				// Start BracketProgLangReader
				} else if (c == '\\') {
					if (l == -1) {
						close();
						throw new DataFormatException("Bracket Reader error: the input ends in the middle of an escape sequence. ");
					}
					c = (char) l;
					l = reader.read();
					if (!isEscapable(c)) {
						close();
						throw new DataFormatException("Bracket Reader error: unsupported escape sequence '\\" + c + "'. ");
					}
					// The backslash is kept in the buffer so later passes can tell escaped characters apart.
					input.append('\\').append(c);
				// End BracketProgLangReader
				} else if (brackets != 0){
					input.append(c);
				}
				if (brackets == 0 && input.length() != 0) {
					sentenceCount++;
					terminalCounter = 1;
					nonTerminalCounter = 1;
					// Hand the already consumed look-ahead character over to the next call.
					lookahead = l;
					if (syntaxGraph instanceof PhraseStructure) {
						bracketing((PhraseStructure)syntaxGraph, 0, input.length(), null);
						if (syntaxGraph instanceof MappablePhraseStructureGraph) {
							((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
						}
					}
					return true;
				}
			}
		} catch (IOException e) {
			close();
			throw new MaltChainedException("Error when reading from the input file. ", e);
		}
	}

	/**
	 * Tells whether {@code c} may follow a backslash in the input.
	 */
	private static boolean isEscapable(char c) {
		return c == ' ' || c == '(' || c == ')' || c == '\\' || c == 'n' || c == 'r' || c == 't' || c == '\"' || c == '\'';
	}

	/**
	 * Finds every top-level bracket pair in {@code input[start, end)} and passes the text
	 * between each pair to {@link #extract}. Brackets preceded by a backslash are ignored.
	 *
	 * @param phraseStructure graph under construction
	 * @param start first buffer position to scan (inclusive)
	 * @param end last buffer position to scan (exclusive)
	 * @param parent node that the extracted constituents are attached to; {@code null} at top level
	 * @throws MaltChainedException if a constituent cannot be created
	 */
	private void bracketing(PhraseStructure phraseStructure, int start, int end, PhraseStructureNode parent) throws MaltChainedException {
		int bracketsdepth = 0;
		int startpos = start-1;
		for (int i = start, n = end; i < n; i++) {
			if (input.charAt(i) == STARTING_BRACKET
					// Start BracketProgLangReader
					&& (i == 0 || input.charAt(i - 1) != '\\')
					// end BracketProgLangReader
			) {
				if (bracketsdepth == 0) {
					startpos = i;
				}
				bracketsdepth++;
			} else if (input.charAt(i) == CLOSING_BRACKET
					// Start BracketProgLangReader
					&& (i == 0 || input.charAt(i - 1) != '\\')
					// end BracketProgLangReader
			) {
				bracketsdepth--;
				if (bracketsdepth == 0) {
					extract(phraseStructure, startpos+1, i, parent);
				}
			}
		}
	}

	/**
	 * Turns the content of one bracket pair, {@code input[begin, end)}, into a node.
	 *
	 * <p>If the content holds no unescaped opening bracket a terminal node is created and
	 * labelled from the content; otherwise a nonterminal (or, when {@code parent} is
	 * {@code null}, the phrase structure root) is labelled from the text before the first
	 * nested bracket and the nested brackets are processed recursively.
	 *
	 * @param phraseStructure graph under construction
	 * @param begin first buffer position of the content (inclusive)
	 * @param end last buffer position of the content (exclusive)
	 * @param parent node to attach the new node to; {@code null} for the outermost constituent
	 * @throws MaltChainedException if a node cannot be created or a terminal has no parent
	 */
	private void extract(PhraseStructure phraseStructure, int begin, int end, PhraseStructureNode parent) throws MaltChainedException {
		// Position of the first nested (unescaped) opening bracket, or -1 for a terminal.
		int index = -1;
		for (int i = begin; i < end; i++) {
			if (input.charAt(i) == STARTING_BRACKET
					// Start BracketProgLangReader
					&& (i == begin || input.charAt(i - 1) != '\\')
					// end BracketProgLangReader
			) {
				index = i;
				break;
			}
		}
		if (index == -1) {
			TokenNode t = phraseStructure.addTokenNode(terminalCounter);
			if (t == null) {
				close();
				throw new MaltChainedException("Bracket Reader error: could not create a terminal node. ");
			}

			terminalCounter++;
			Edge e = null;

			if (parent != null) {
				e = phraseStructure.addPhraseStructureEdge(parent, (PhraseStructureNode)t);
			} else {
				close();
				throw new MaltChainedException("Bracket Reader error: could not find the parent node. ");
			}

			int start = begin;

			Iterator<String> inputColumnsIterator = inputColumns.keySet().iterator();
			Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator();
			boolean noneNode = false;
			boolean edgeLabels = false;
			for (int i = begin; i < end; i++) {
				// A field ends at an edge-label separator, an unescaped blank, or the last character.
				if (input.charAt(i) == EDGELABEL_SEPARATOR || (input.charAt(i) == INPUT_SEPARATOR
						// Start BracketProgLangReader
						&& (i == begin || input.charAt(i - 1) != '\\')
						// end BracketProgLangReader
				) || i == end - 1) {
					if (i == begin && input.charAt(i) == EDGELABEL_SEPARATOR) {
						noneNode = true;
					} else if (start == begin) {
						if ((noneNode && input.charAt(i) != EDGELABEL_SEPARATOR) || !noneNode) {
							if (inputColumnsIterator.hasNext()) {
								t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(),
										// Start BracketProgLangReader
										decodeString(
										// end BracketProgLangReader
												(i == end - 1)?input.substring(start,end):input.substring(start, i)
										// Start BracketProgLangReader
										)
										// end BracketProgLangReader
								);
							}
							start = i + 1;
							if (input.charAt(i) == EDGELABEL_SEPARATOR) {
								edgeLabels = true;
							}
						}
					} else if (edgeLabels && e != null) {
						if (edgeLabelsColumnsIterator.hasNext()) {
							e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i));
						}
						start = i + 1;
						if (input.charAt(i) == INPUT_SEPARATOR
								// Start BracketProgLangReader
								&& (i == begin || input.charAt(i - 1) != '\\')
								// end BracketProgLangReader
						) {
							edgeLabels = false;
						}
					} else if (input.charAt(i) == EDGELABEL_SEPARATOR && i != end - 1 && (input.charAt(i+1) != INPUT_SEPARATOR
							// Start BracketProgLangReader
							&& (i == begin || input.charAt(i - 1) != '\\')
							// end BracketProgLangReader
					)
					) {
						// Intentionally empty: this separator is treated as part of the current field.
					} else {
						if (inputColumnsIterator.hasNext()) {
							t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i));
						}
						start = i + 1;
					}
				}
			}
		} else {
			PhraseStructureNode nt;
			Edge e = null;
			if (parent == null) {
				nt = phraseStructure.getPhraseStructureRoot();
			} else {
				nt = phraseStructure.addNonTerminalNode(nonTerminalCounter);
				if (nt == null) {
					close();
					throw new MaltChainedException("Bracket Reader error: could not create a nonterminal node. ");
				}
				nonTerminalCounter++;

				e = phraseStructure.addPhraseStructureEdge(parent, nt);
			}
			Iterator<String> phraseLabelColumnsIterator = phraseLabelColumns.keySet().iterator();
			Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator();
			int newbegin = begin;
			int start = begin;

			// The text before the first nested bracket holds the phrase label, then the edge labels.
			for (int i = begin; i < index; i++) {
				if (input.charAt(i) == EDGELABEL_SEPARATOR || i == index - 1) {
					if (start == newbegin) {
						if (phraseLabelColumnsIterator.hasNext()) {
							nt.addLabel(phraseLabelColumns.get(phraseLabelColumnsIterator.next()).getSymbolTable(), (i == index - 1)?input.substring(start,index):input.substring(start, i));
						}
						start = i + 1;
					} else if (e != null) {
						if (edgeLabelsColumnsIterator.hasNext()) {
							e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == index - 1)?input.substring(start,index):input.substring(start, i));
						}
						start = i + 1;
					}
				} else if (input.charAt(i) == BLANK) {
					start++;
					newbegin++;
				}
			}

			bracketing(phraseStructure, index, end, nt);
		}
	}

	/**
	 * Removes the escaping backslash in front of brackets and blanks.
	 *
	 * @param string text as stored in the sentence buffer
	 * @return {@code string} with {@code \(}, {@code \)} and backslash-blank unescaped
	 */
	private String decodeString(String string) {
		return string.replace("\\(", "(").replace("\\)", ")").replace("\\ ", " ");
	}

	/** Does nothing in this reader. */
	public void readEpilog() throws MaltChainedException {

	}

	/** @return the reader the input is currently taken from, or {@code null} if closed */
	public BufferedReader getReader() {
		return reader;
	}

	/**
	 * Replaces the underlying reader and discards any character read ahead from the old one.
	 *
	 * @param reader reader to take the input from
	 */
	public void setReader(BufferedReader reader) {
		this.reader = reader;
		this.lookahead = NO_LOOKAHEAD;
	}

	/** @return the number of sentences read since the input was opened */
	public int getSentenceCount() throws MaltChainedException {
		return sentenceCount;
	}

	public void setSentenceCount(int sentenceCount) {
		this.sentenceCount = sentenceCount;
	}

	public DataFormatInstance getDataFormatInstance() {
		return dataFormatInstance;
	}

	/**
	 * Sets the data format and caches its input, edge label and phrase label column descriptions.
	 *
	 * @param inputDataFormatInstance data format that describes the columns of the input
	 */
	public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
		this.dataFormatInstance = inputDataFormatInstance;
		inputColumns = dataFormatInstance.getInputColumnDescriptions();
		edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions();
		phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions();
	}

	public String getOptions() {
		return optionString;
	}

	public void setOptions(String optionString) throws MaltChainedException {
		this.optionString = optionString;
	}

	public String getFileName() {
		return fileName;
	}

	public void setFileName(String fileName) {
		this.fileName = fileName;
	}

	public URL getUrl() {
		return url;
	}

	public void setUrl(URL url) {
		this.url = url;
	}

	public String getCharsetName() {
		return charsetName;
	}

	public void setCharsetName(String charsetName) {
		this.charsetName = charsetName;
	}

	/** @return the number of passes that are made over the input */
	public int getNIterations() {
		return nIterations;
	}

	public void setNIterations(int iterations) {
		nIterations = iterations;
	}

	/** @return the number of the pass that is currently in progress, starting at 1 */
	public int getIterationCounter() {
		return cIterations;
	}

	/**
	 * Closes the underlying reader, if any, and forgets it.
	 *
	 * @throws MaltChainedException if closing the reader fails
	 */
	public void close() throws MaltChainedException {
		try {
			if (reader != null) {
				reader.close();
			}
			reader = null;
			lookahead = NO_LOOKAHEAD;
		} catch (IOException e) {
			throw new DataFormatException("Error when closing the input file.", e);
		}
	}
}