001 package org.maltparser.core.syntaxgraph.reader; 002 003 import java.io.BufferedReader; 004 import java.io.FileInputStream; 005 import java.io.FileNotFoundException; 006 import java.io.IOException; 007 import java.io.InputStream; 008 import java.io.InputStreamReader; 009 import java.io.UnsupportedEncodingException; 010 import java.net.URL; 011 import java.util.Iterator; 012 import java.util.SortedMap; 013 014 import org.maltparser.core.exception.MaltChainedException; 015 import org.maltparser.core.io.dataformat.ColumnDescription; 016 import org.maltparser.core.io.dataformat.DataFormatException; 017 import org.maltparser.core.io.dataformat.DataFormatInstance; 018 import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph; 019 import org.maltparser.core.syntaxgraph.PhraseStructure; 020 import org.maltparser.core.syntaxgraph.TokenStructure; 021 import org.maltparser.core.syntaxgraph.edge.Edge; 022 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode; 023 import org.maltparser.core.syntaxgraph.node.TokenNode; 024 /** 025 * 026 * 027 * @author Johan Hall 028 */ 029 public class BracketReader implements SyntaxGraphReader { 030 private BufferedReader reader; 031 private DataFormatInstance dataFormatInstance; 032 private int sentenceCount; 033 private StringBuilder input; 034 private int terminalCounter; 035 private int nonTerminalCounter; 036 private String optionString; 037 private SortedMap<String,ColumnDescription> inputColumns; 038 private SortedMap<String,ColumnDescription> edgeLabelColumns; 039 private SortedMap<String,ColumnDescription> phraseLabelColumns; 040 private char STARTING_BRACKET = '('; 041 private char CLOSING_BRACKET = ')'; 042 private char INPUT_SEPARATOR = ' '; 043 private char EDGELABEL_SEPARATOR = '-'; 044 private char SENTENCE_SEPARATOR = '\n'; 045 private char BLANK = ' '; 046 private char CARRIAGE_RETURN = '\r'; 047 private char TAB = '\t'; 048 049 public BracketReader() { 050 input = new StringBuilder(); 051 } 052 053 public void open(String fileName, String charsetName) throws MaltChainedException { 054 try { 055 open(new FileInputStream(fileName), charsetName); 056 }catch (FileNotFoundException e) { 057 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e); 058 } 059 } 060 public void open(URL url, String charsetName) throws MaltChainedException { 061 try { 062 open(url.openStream(), charsetName); 063 } catch (IOException e) { 064 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e); 065 } 066 } 067 068 public void open(InputStream is, String charsetName) throws MaltChainedException { 069 try { 070 open(new InputStreamReader(is, charsetName)); 071 } catch (UnsupportedEncodingException e) { 072 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e); 073 } 074 } 075 076 public void open(InputStreamReader isr) throws MaltChainedException { 077 setReader(new BufferedReader(isr)); 078 setSentenceCount(0); 079 } 080 081 public void readProlog() throws MaltChainedException { 082 083 } 084 085 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException { 086 if (syntaxGraph == null || dataFormatInstance == null) { 087 return false; 088 } 089 syntaxGraph.clear(); 090 int brackets = 0; 091 try { 092 int l = reader.read(); 093 char c; 094 input.setLength(0); 095 096 while (true) { 097 if (l == -1) { 098 input.setLength(0); 099 return false; 100 } 101 102 c = (char)l; 103 l = reader.read(); 104 105 if (c == SENTENCE_SEPARATOR || c == CARRIAGE_RETURN || c == TAB || c == -1) { 106 107 } else if (c == STARTING_BRACKET) { 108 input.append(c); 109 brackets++; 110 } else if (c == CLOSING_BRACKET) { 111 input.append(c); 112 brackets--; 113 } else if (c == INPUT_SEPARATOR) { 114 if (l != STARTING_BRACKET && l != CLOSING_BRACKET && l != INPUT_SEPARATOR && l != SENTENCE_SEPARATOR && l != CARRIAGE_RETURN && l != TAB && l != -1) { 115 input.append(c); 116 } 117 } else if (brackets != 0){ 118 input.append(c); 119 } 120 if (brackets == 0 && input.length() != 0) { 121 sentenceCount++; 122 terminalCounter = 1; 123 nonTerminalCounter = 1; 124 if (syntaxGraph instanceof PhraseStructure) { 125 bracketing((PhraseStructure)syntaxGraph, 0, input.length(), null); 126 if (syntaxGraph instanceof MappablePhraseStructureGraph) { 127 ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot()); 128 } 129 } 130 return true; 131 } 132 133 if (c == -1) { 134 if (brackets != 0) { 135 close(); 136 throw new MaltChainedException("Error when reading from the input file. "); 137 } 138 return false; 139 } 140 } 141 } catch (IOException e) { 142 close(); 143 throw new MaltChainedException("Error when reading from the input file. ", e); 144 } 145 146 } 147 148 private void bracketing(PhraseStructure phraseStructure, int start, int end, PhraseStructureNode parent) throws MaltChainedException { 149 int bracketsdepth = 0; 150 int startpos = start-1; 151 for (int i = start, n = end; i < n; i++) { 152 if (input.charAt(i) == STARTING_BRACKET) { 153 if (bracketsdepth == 0) { 154 startpos = i; 155 } 156 bracketsdepth++; 157 } else if (input.charAt(i) == CLOSING_BRACKET) { 158 bracketsdepth--; 159 if (bracketsdepth == 0) { 160 extract(phraseStructure, startpos+1, i, parent); 161 } 162 } 163 } 164 } 165 166 private void extract(PhraseStructure phraseStructure, int begin, int end, PhraseStructureNode parent) throws MaltChainedException { 167 int index = -1; 168 for (int i = begin; i < end; i++) { 169 if (input.charAt(i) == STARTING_BRACKET) { 170 index = i; 171 break; 172 } 173 } 174 if (index == -1) { 175 TokenNode t = phraseStructure.addTokenNode(terminalCounter); 176 if (t == null) { 177 close(); 178 throw new MaltChainedException("Bracket Reader error: could not create a terminal node. "); 179 } 180 181 terminalCounter++; 182 Edge e = null; 183 184 if (parent != null) { 185 e = phraseStructure.addPhraseStructureEdge(parent, (PhraseStructureNode)t); 186 } else { 187 close(); 188 throw new MaltChainedException("Bracket Reader error: could not find the parent node. "); 189 } 190 191 int start = begin; 192 193 Iterator<String> inputColumnsIterator = inputColumns.keySet().iterator(); 194 Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator(); 195 boolean noneNode = false; 196 boolean edgeLabels = false; 197 for (int i = begin; i < end; i++) { 198 if (input.charAt(i) == EDGELABEL_SEPARATOR || input.charAt(i) == INPUT_SEPARATOR || i == end - 1) { 199 if (i == begin && input.charAt(i) == EDGELABEL_SEPARATOR) { 200 noneNode = true; 201 } else if (start == begin) { 202 if ((noneNode && input.charAt(i) != EDGELABEL_SEPARATOR) || !noneNode) { 203 if (inputColumnsIterator.hasNext()) { 204 t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i)); 205 } 206 start = i + 1; 207 if (input.charAt(i) == EDGELABEL_SEPARATOR) { 208 edgeLabels = true; 209 } 210 } 211 } else if (edgeLabels && e != null) { 212 if (edgeLabelsColumnsIterator.hasNext()) { 213 e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i)); 214 } 215 start = i + 1; 216 if (input.charAt(i) == INPUT_SEPARATOR) { 217 edgeLabels = false; 218 } 219 } else if (input.charAt(i) == EDGELABEL_SEPARATOR && i != end - 1 && input.charAt(i+1) != INPUT_SEPARATOR) { 220 } else { 221 if (inputColumnsIterator.hasNext()) { 222 t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i)); 223 } 224 start = i + 1; 225 } 226 } 227 } 228 } else { 229 PhraseStructureNode nt; 230 Edge e = null; 231 if (parent == null) { 232 nt = phraseStructure.getPhraseStructureRoot(); 233 } else { 234 nt = phraseStructure.addNonTerminalNode(nonTerminalCounter); 235 if (nt == null) { 236 close(); 237 throw new MaltChainedException("Bracket Reader error: could not create a nonterminal node. "); 238 } 239 nonTerminalCounter++; 240 241 e = phraseStructure.addPhraseStructureEdge(parent, nt); 242 } 243 Iterator<String> phraseLabelColumnsIterator = phraseLabelColumns.keySet().iterator(); 244 Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator(); 245 int newbegin = begin; 246 int start = begin; 247 248 for (int i = begin; i < index; i++) { 249 if (input.charAt(i) == EDGELABEL_SEPARATOR || i == index - 1) { 250 if (start == newbegin) { 251 if (phraseLabelColumnsIterator.hasNext()) { 252 nt.addLabel(phraseLabelColumns.get(phraseLabelColumnsIterator.next()).getSymbolTable(), (i == index - 1)?input.substring(start,index):input.substring(start, i)); 253 } 254 start = i + 1; 255 } else if (e != null) { 256 if (edgeLabelsColumnsIterator.hasNext()) { 257 e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == index - 1)?input.substring(start,index):input.substring(start, i)); 258 } 259 start = i + 1; 260 } 261 } else if (input.charAt(i) == BLANK) { 262 start++; 263 newbegin++; 264 } 265 } 266 267 bracketing(phraseStructure, index, end, nt); 268 } 269 } 270 271 public void readEpilog() throws MaltChainedException { 272 273 } 274 275 public BufferedReader getReader() { 276 return reader; 277 } 278 279 public void setReader(BufferedReader reader) { 280 this.reader = reader; 281 } 282 283 public int getSentenceCount() throws MaltChainedException { 284 return sentenceCount; 285 } 286 287 public void setSentenceCount(int sentenceCount) { 288 this.sentenceCount = sentenceCount; 289 } 290 291 public DataFormatInstance getDataFormatInstance() { 292 return dataFormatInstance; 293 } 294 295 public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) { 296 this.dataFormatInstance = inputDataFormatInstance; 297 inputColumns = dataFormatInstance.getInputColumnDescriptions(); 298 edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions(); 299 phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions(); 300 } 301 302 public String getOptions() { 303 return optionString; 304 } 305 306 public void setOptions(String optionString) throws MaltChainedException { 307 this.optionString = optionString; 308 } 309 310 public void close() throws MaltChainedException { 311 try { 312 if (reader != null) { 313 reader.close(); 314 } 315 reader = null; 316 } catch (IOException e) { 317 throw new DataFormatException("Error when closing the input file.", e); 318 } 319 } 320 }