001 package org.maltparser.core.syntaxgraph.reader;
002
003 import java.io.BufferedReader;
004 import java.io.FileInputStream;
005 import java.io.FileNotFoundException;
006 import java.io.IOException;
007 import java.io.InputStream;
008 import java.io.InputStreamReader;
009 import java.io.UnsupportedEncodingException;
010 import java.net.URL;
011 import java.util.Iterator;
012 import java.util.SortedMap;
013
014 import org.maltparser.core.exception.MaltChainedException;
015 import org.maltparser.core.io.dataformat.ColumnDescription;
016 import org.maltparser.core.io.dataformat.DataFormatException;
017 import org.maltparser.core.io.dataformat.DataFormatInstance;
018 import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
019 import org.maltparser.core.syntaxgraph.PhraseStructure;
020 import org.maltparser.core.syntaxgraph.TokenStructure;
021 import org.maltparser.core.syntaxgraph.edge.Edge;
022 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
023 import org.maltparser.core.syntaxgraph.node.TokenNode;
024 /**
025 *
026 *
027 * @author Johan Hall
028 */
029 public class BracketReader implements SyntaxGraphReader {
030 private BufferedReader reader;
031 private DataFormatInstance dataFormatInstance;
032 private int sentenceCount;
033 private StringBuilder input;
034 private int terminalCounter;
035 private int nonTerminalCounter;
036 private String optionString;
037 private SortedMap<String,ColumnDescription> inputColumns;
038 private SortedMap<String,ColumnDescription> edgeLabelColumns;
039 private SortedMap<String,ColumnDescription> phraseLabelColumns;
040
041 private String fileName = null;
042 private URL url = null;
043 private String charsetName;
044 private int nIterations;
045 private int cIterations;
046 private boolean closeStream = true;
047
048 private char STARTING_BRACKET = '(';
049 private char CLOSING_BRACKET = ')';
050 private char INPUT_SEPARATOR = ' ';
051 private char EDGELABEL_SEPARATOR = '-';
052 private char SENTENCE_SEPARATOR = '\n';
053 private char BLANK = ' ';
054 private char CARRIAGE_RETURN = '\r';
055 private char TAB = '\t';
056
057 public BracketReader() {
058 input = new StringBuilder();
059 nIterations = 1;
060 cIterations = 1;
061 }
062
063 private void reopen() throws MaltChainedException {
064 close();
065 if (fileName != null) {
066 open(fileName, charsetName);
067 } else if (url != null) {
068 open(url, charsetName);
069 } else {
070 throw new DataFormatException("The input stream cannot be reopen. ");
071 }
072 }
073
074 public void open(String fileName, String charsetName) throws MaltChainedException {
075 setFileName(fileName);
076 setCharsetName(charsetName);
077 try {
078 open(new FileInputStream(fileName), charsetName);
079 }catch (FileNotFoundException e) {
080 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
081 }
082 }
083 public void open(URL url, String charsetName) throws MaltChainedException {
084 setUrl(url);
085 setCharsetName(charsetName);
086 try {
087 open(url.openStream(), charsetName);
088 } catch (IOException e) {
089 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
090 }
091 }
092
093 public void open(InputStream is, String charsetName) throws MaltChainedException {
094 try {
095 if (is == System.in) {
096 closeStream = false;
097 }
098 open(new InputStreamReader(is, charsetName));
099 } catch (UnsupportedEncodingException e) {
100 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
101 }
102 }
103
104 private void open(InputStreamReader isr) throws MaltChainedException {
105 setReader(new BufferedReader(isr));
106 setSentenceCount(0);
107 }
108
109 public void readProlog() throws MaltChainedException {
110
111 }
112
113 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
114 if (syntaxGraph == null || dataFormatInstance == null) {
115 return false;
116 }
117 syntaxGraph.clear();
118 syntaxGraph.getSymbolTables().cleanUp();
119 int brackets = 0;
120 try {
121 int l = reader.read();
122 char c;
123 input.setLength(0);
124
125 while (true) {
126 if (l == -1) {
127 input.setLength(0);
128 return false;
129 }
130
131 c = (char)l;
132 l = reader.read();
133
134 if (c == SENTENCE_SEPARATOR || c == CARRIAGE_RETURN || c == TAB || c == -1) {
135
136 } else if (c == STARTING_BRACKET) {
137 input.append(c);
138 brackets++;
139 } else if (c == CLOSING_BRACKET) {
140 input.append(c);
141 brackets--;
142 } else if (c == INPUT_SEPARATOR) {
143 if (l != STARTING_BRACKET && l != CLOSING_BRACKET && l != INPUT_SEPARATOR && l != SENTENCE_SEPARATOR && l != CARRIAGE_RETURN && l != TAB && l != -1) {
144 input.append(c);
145 }
146 // Start BracketProgLangReader
147 } else if (c == '\\') {
148 c = (char) l;
149 l = reader.read();
150 if (c != ' ' && c != '(' && c != ')' && c != '\\' && c != 'n' && c != 'r' && c != 't' && c != '\"' && c != '\'') {
151 // System.out.println("Error");
152 System.exit(1);
153 } else {
154 input.append("\\" + c);
155 }
156 // End BracketProgLangReader
157 } else if (brackets != 0){
158 input.append(c);
159 }
160 if (brackets == 0 && input.length() != 0) {
161 sentenceCount++;
162 terminalCounter = 1;
163 nonTerminalCounter = 1;
164 if (syntaxGraph instanceof PhraseStructure) {
165 bracketing((PhraseStructure)syntaxGraph, 0, input.length(), null);
166 if (syntaxGraph instanceof MappablePhraseStructureGraph) {
167 ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
168 }
169 }
170 return true;
171 }
172
173 if (c == -1) {
174 if (brackets != 0) {
175 close();
176 throw new MaltChainedException("Error when reading from the input file. ");
177 }
178 if (cIterations < nIterations) {
179 cIterations++;
180 reopen();
181 return true;
182 }
183 return false;
184 }
185 }
186 } catch (IOException e) {
187 close();
188 throw new MaltChainedException("Error when reading from the input file. ", e);
189 }
190
191 }
192
193 private void bracketing(PhraseStructure phraseStructure, int start, int end, PhraseStructureNode parent) throws MaltChainedException {
194 int bracketsdepth = 0;
195 int startpos = start-1;
196 for (int i = start, n = end; i < n; i++) {
197 if (input.charAt(i) == STARTING_BRACKET
198 // Start BracketProgLangReader
199 && (i == 0 || input.charAt(i - 1) != '\\')
200 // end BracketProgLangReader
201
202 ) {
203 if (bracketsdepth == 0) {
204 startpos = i;
205 }
206 bracketsdepth++;
207 } else if (input.charAt(i) == CLOSING_BRACKET
208 // Start BracketProgLangReader
209 && (i == 0 || input.charAt(i - 1) != '\\')
210 // end BracketProgLangReader
211 ) {
212 bracketsdepth--;
213 if (bracketsdepth == 0) {
214 extract(phraseStructure, startpos+1, i, parent);
215 }
216 }
217 }
218 }
219
220 private void extract(PhraseStructure phraseStructure, int begin, int end, PhraseStructureNode parent) throws MaltChainedException {
221 int index = -1;
222 for (int i = begin; i < end; i++) {
223 if (input.charAt(i) == STARTING_BRACKET
224 // Start BracketProgLangReader
225 && (i == begin || input.charAt(i - 1) != '\\')
226 // end BracketProgLangReader
227 ) {
228 index = i;
229 break;
230 }
231 }
232 if (index == -1) {
233 TokenNode t = phraseStructure.addTokenNode(terminalCounter);
234 if (t == null) {
235 close();
236 throw new MaltChainedException("Bracket Reader error: could not create a terminal node. ");
237 }
238
239 terminalCounter++;
240 Edge e = null;
241
242 if (parent != null) {
243 e = phraseStructure.addPhraseStructureEdge(parent, (PhraseStructureNode)t);
244 } else {
245 close();
246 throw new MaltChainedException("Bracket Reader error: could not find the parent node. ");
247 }
248
249 int start = begin;
250
251 Iterator<String> inputColumnsIterator = inputColumns.keySet().iterator();
252 Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator();
253 boolean noneNode = false;
254 boolean edgeLabels = false;
255 for (int i = begin; i < end; i++) {
256 if (input.charAt(i) == EDGELABEL_SEPARATOR || (input.charAt(i) == INPUT_SEPARATOR
257 // Start BracketProgLangReader
258 && (i == begin || input.charAt(i - 1) != '\\')
259 // end BracketProgLangReader
260 ) || i == end - 1) {
261 if (i == begin && input.charAt(i) == EDGELABEL_SEPARATOR) {
262 noneNode = true;
263 } else if (start == begin) {
264 if ((noneNode && input.charAt(i) != EDGELABEL_SEPARATOR) || !noneNode) {
265 if (inputColumnsIterator.hasNext()) {
266 t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(),
267
268 // Start BracketProgLangReader
269 decodeString(
270 // end BracketProgLangReader
271 (i == end - 1)?input.substring(start,end):input.substring(start, i)
272 // Start BracketProgLangReader
273 )
274 // end BracketProgLangReader
275 );
276 }
277 start = i + 1;
278 if (input.charAt(i) == EDGELABEL_SEPARATOR) {
279 edgeLabels = true;
280 }
281 }
282 } else if (edgeLabels && e != null) {
283 if (edgeLabelsColumnsIterator.hasNext()) {
284 e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i));
285 }
286 start = i + 1;
287 if (input.charAt(i) == INPUT_SEPARATOR
288 // Start BracketProgLangReader
289 && (i == begin || input.charAt(i - 1) != '\\')
290 // end BracketProgLangReader
291 ) {
292 edgeLabels = false;
293 }
294 } else if (input.charAt(i) == EDGELABEL_SEPARATOR && i != end - 1 && (input.charAt(i+1) != INPUT_SEPARATOR
295 // Start BracketProgLangReader
296 && (i == begin || input.charAt(i - 1) != '\\')
297 // end BracketProgLangReader
298 )
299 ) {
300 } else {
301 if (inputColumnsIterator.hasNext()) {
302 t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i));
303 }
304 start = i + 1;
305 }
306 }
307 }
308 } else {
309 PhraseStructureNode nt;
310 Edge e = null;
311 if (parent == null) {
312 nt = phraseStructure.getPhraseStructureRoot();
313 } else {
314 nt = phraseStructure.addNonTerminalNode(nonTerminalCounter);
315 if (nt == null) {
316 close();
317 throw new MaltChainedException("Bracket Reader error: could not create a nonterminal node. ");
318 }
319 nonTerminalCounter++;
320
321 e = phraseStructure.addPhraseStructureEdge(parent, nt);
322 }
323 Iterator<String> phraseLabelColumnsIterator = phraseLabelColumns.keySet().iterator();
324 Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator();
325 int newbegin = begin;
326 int start = begin;
327
328 for (int i = begin; i < index; i++) {
329 if (input.charAt(i) == EDGELABEL_SEPARATOR || i == index - 1) {
330 if (start == newbegin) {
331 if (phraseLabelColumnsIterator.hasNext()) {
332 nt.addLabel(phraseLabelColumns.get(phraseLabelColumnsIterator.next()).getSymbolTable(), (i == index - 1)?input.substring(start,index):input.substring(start, i));
333 }
334 start = i + 1;
335 } else if (e != null) {
336 if (edgeLabelsColumnsIterator.hasNext()) {
337 e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == index - 1)?input.substring(start,index):input.substring(start, i));
338 }
339 start = i + 1;
340 }
341 } else if (input.charAt(i) == BLANK) {
342 start++;
343 newbegin++;
344 }
345 }
346
347 bracketing(phraseStructure, index, end, nt);
348 }
349 }
350
351 private String decodeString(String string) {
352 return string.replace("\\(", "(").replace("\\)", ")").replace("\\ ", " ");
353 }
354
355 public void readEpilog() throws MaltChainedException {
356
357 }
358
359 public BufferedReader getReader() {
360 return reader;
361 }
362
363 public void setReader(BufferedReader reader) {
364 this.reader = reader;
365 }
366
367 public int getSentenceCount() throws MaltChainedException {
368 return sentenceCount;
369 }
370
371 public void setSentenceCount(int sentenceCount) {
372 this.sentenceCount = sentenceCount;
373 }
374
375 public DataFormatInstance getDataFormatInstance() {
376 return dataFormatInstance;
377 }
378
379 public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
380 this.dataFormatInstance = inputDataFormatInstance;
381 inputColumns = dataFormatInstance.getInputColumnDescriptions();
382 edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions();
383 phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions();
384 }
385
386 public String getOptions() {
387 return optionString;
388 }
389
390 public void setOptions(String optionString) throws MaltChainedException {
391 this.optionString = optionString;
392 }
393
394 public String getFileName() {
395 return fileName;
396 }
397
398 public void setFileName(String fileName) {
399 this.fileName = fileName;
400 }
401
402 public URL getUrl() {
403 return url;
404 }
405
406 public void setUrl(URL url) {
407 this.url = url;
408 }
409
410 public String getCharsetName() {
411 return charsetName;
412 }
413
414 public void setCharsetName(String charsetName) {
415 this.charsetName = charsetName;
416 }
417
418 public int getNIterations() {
419 return nIterations;
420 }
421
422 public void setNIterations(int iterations) {
423 nIterations = iterations;
424 }
425
426 public int getIterationCounter() {
427 return cIterations;
428 }
429
430 public void close() throws MaltChainedException {
431 try {
432 if (reader != null) {
433 if (closeStream) {
434 reader.close();
435 }
436 reader = null;
437 }
438 } catch (IOException e) {
439 throw new DataFormatException("Error when closing the input file.", e);
440 }
441 }
442 }