001    package org.maltparser.core.syntaxgraph.reader;
002    
003    import java.io.BufferedReader;
004    import java.io.FileInputStream;
005    import java.io.FileNotFoundException;
006    import java.io.IOException;
007    import java.io.InputStream;
008    import java.io.InputStreamReader;
009    import java.io.UnsupportedEncodingException;
010    import java.net.URL;
011    import java.util.Iterator;
012    import java.util.SortedMap;
013    
014    import org.maltparser.core.exception.MaltChainedException;
015    import org.maltparser.core.io.dataformat.ColumnDescription;
016    import org.maltparser.core.io.dataformat.DataFormatException;
017    import org.maltparser.core.io.dataformat.DataFormatInstance;
018    import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
019    import org.maltparser.core.syntaxgraph.PhraseStructure;
020    import org.maltparser.core.syntaxgraph.TokenStructure;
021    import org.maltparser.core.syntaxgraph.edge.Edge;
022    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
023    import org.maltparser.core.syntaxgraph.node.TokenNode;
024    /**
025    *
026    *
027    * @author Johan Hall
028    */
029    public class BracketReader implements SyntaxGraphReader {
030            private BufferedReader reader;
031            private DataFormatInstance dataFormatInstance;
032            private int sentenceCount;
033            private StringBuilder input;
034            private int terminalCounter; 
035            private int nonTerminalCounter;
036            private String optionString;
037            private SortedMap<String,ColumnDescription> inputColumns;
038            private SortedMap<String,ColumnDescription> edgeLabelColumns;
039            private SortedMap<String,ColumnDescription> phraseLabelColumns;
040            
041            private String fileName = null;
042            private URL url = null;
043            private String charsetName;
044            private int nIterations;
045            private int cIterations;
046            private boolean closeStream = true;
047            
048            private char STARTING_BRACKET = '(';
049            private char CLOSING_BRACKET = ')';
050            private char INPUT_SEPARATOR = ' ';
051            private char EDGELABEL_SEPARATOR = '-';
052            private char SENTENCE_SEPARATOR = '\n';
053            private char BLANK = ' ';
054            private char CARRIAGE_RETURN = '\r';
055            private char TAB = '\t';
056            
057            public BracketReader() { 
058                    input = new StringBuilder();
059                    nIterations = 1;
060                    cIterations = 1;
061            }
062            
063            private void reopen() throws MaltChainedException {
064                    close();
065                    if (fileName != null) {
066                            open(fileName, charsetName);
067                    } else if (url != null) {
068                            open(url, charsetName);
069                    } else {
070                            throw new DataFormatException("The input stream cannot be reopen. ");
071                    }
072            }
073            
074            public void open(String fileName, String charsetName) throws MaltChainedException {
075                    setFileName(fileName);
076                    setCharsetName(charsetName);
077                    try {
078                            open(new FileInputStream(fileName), charsetName);
079                    }catch (FileNotFoundException e) {
080                            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
081                    }
082            }
083            public void open(URL url, String charsetName) throws MaltChainedException {
084                    setUrl(url);
085                    setCharsetName(charsetName);
086                    try {
087                            open(url.openStream(), charsetName);
088                    } catch (IOException e) {
089                            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
090                    }
091            }
092            
093            public void open(InputStream is, String charsetName) throws MaltChainedException {
094                    try {
095                            if (is == System.in) {
096                                    closeStream = false;
097                            }
098                            open(new InputStreamReader(is, charsetName));
099                    } catch (UnsupportedEncodingException e) {
100                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
101                    }
102            }
103            
104            private void open(InputStreamReader isr) throws MaltChainedException {
105                    setReader(new BufferedReader(isr));
106                    setSentenceCount(0);
107            }
108            
109            public void readProlog() throws MaltChainedException {
110                    
111            }
112            
113            public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
114                    if (syntaxGraph == null || dataFormatInstance == null) {
115                            return false;
116                    }
117                    syntaxGraph.clear();
118                    syntaxGraph.getSymbolTables().cleanUp();
119                    int brackets = 0;
120                    try {
121                            int l = reader.read();
122                            char c;
123                            input.setLength(0);
124                    
125                            while (true) {
126                                    if (l == -1) {
127                                            input.setLength(0);
128                                            return false;
129                                    }
130                                    
131                                    c = (char)l; 
132                                    l = reader.read();
133    
134                                    if (c == SENTENCE_SEPARATOR || c == CARRIAGE_RETURN || c == TAB || c == -1) {
135    
136                                    } else if (c == STARTING_BRACKET) {
137                                            input.append(c);
138                                            brackets++;
139                                    } else if (c == CLOSING_BRACKET) {
140                                            input.append(c);
141                                            brackets--;
142                                    } else if (c == INPUT_SEPARATOR) {
143                                            if (l != STARTING_BRACKET && l != CLOSING_BRACKET && l != INPUT_SEPARATOR && l != SENTENCE_SEPARATOR && l != CARRIAGE_RETURN && l != TAB && l != -1) {
144                                                    input.append(c);
145                                            }
146                                    // Start BracketProgLangReader
147                                    } else if (c == '\\') {
148                                            c = (char) l;
149                                            l = reader.read();
150                                            if (c != ' ' && c != '(' && c != ')' && c != '\\' && c != 'n' && c != 'r' && c != 't' && c != '\"' && c != '\'') {
151    //                                              System.out.println("Error");
152                                                    System.exit(1);
153                                            } else {
154                                                    input.append("\\" + c);
155                                            }
156                                    // End BracketProgLangReader
157                                    } else if (brackets != 0){
158                                            input.append(c);
159                                    }
160                                    if (brackets == 0 && input.length() != 0) {
161                                            sentenceCount++;
162                                            terminalCounter = 1; 
163                                            nonTerminalCounter = 1;
164                                            if (syntaxGraph instanceof PhraseStructure) {
165                                                    bracketing((PhraseStructure)syntaxGraph, 0, input.length(), null);
166                                                    if (syntaxGraph instanceof MappablePhraseStructureGraph) {
167                                                            ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
168                                                    }
169                                            }
170                                            return true;
171                                    }
172                                    
173                                    if (c == -1) {
174                                            if (brackets != 0) {
175                                                    close();
176                                                    throw new MaltChainedException("Error when reading from the input file. ");
177                                            }
178                                            if (cIterations < nIterations) {
179                                                    cIterations++;
180                                                    reopen();
181                                                    return true;
182                                            }
183                                            return false;
184                                    }
185                            }
186                    }  catch (IOException e) {
187                            close();
188                            throw new MaltChainedException("Error when reading from the input file. ", e);
189                    } 
190                    
191            }
192                    
193            private void bracketing(PhraseStructure phraseStructure, int start, int end, PhraseStructureNode parent) throws MaltChainedException {
194                    int bracketsdepth = 0;
195                    int startpos = start-1;
196                    for (int i = start, n = end; i < n; i++) {
197                            if (input.charAt(i) == STARTING_BRACKET
198                                            // Start BracketProgLangReader
199                                            && (i == 0 || input.charAt(i - 1) != '\\')
200                                            // end BracketProgLangReader
201                            
202                            ) {
203                                    if (bracketsdepth == 0) {
204                                            startpos = i;
205                                    }
206                                    bracketsdepth++;
207                            } else if (input.charAt(i) == CLOSING_BRACKET
208                                            // Start BracketProgLangReader
209                                            && (i == 0 || input.charAt(i - 1) != '\\')
210                                            // end BracketProgLangReader
211                            ) {
212                                    bracketsdepth--;
213                                    if (bracketsdepth == 0) {
214                                            extract(phraseStructure, startpos+1, i, parent);
215                                    }       
216                            }
217                    }
218            }
219    
220            private void extract(PhraseStructure phraseStructure, int begin, int end,  PhraseStructureNode parent) throws MaltChainedException {
221                    int index = -1;
222                    for (int i = begin; i < end; i++) {
223                            if (input.charAt(i) == STARTING_BRACKET
224                                            // Start BracketProgLangReader
225                                            && (i == begin || input.charAt(i - 1) != '\\')
226                                            // end BracketProgLangReader            
227                            ) {
228                                    index = i;
229                                    break;
230                            }
231                    }
232                    if (index == -1) {
233                            TokenNode t = phraseStructure.addTokenNode(terminalCounter);
234                            if (t == null) {
235                                    close();
236                                    throw new MaltChainedException("Bracket Reader error: could not create a terminal node. ");
237                            }
238    
239                            terminalCounter++;
240                            Edge e = null;
241    
242                            if (parent != null) {
243                                    e = phraseStructure.addPhraseStructureEdge(parent, (PhraseStructureNode)t);
244                            } else {
245                                    close();
246                                    throw new MaltChainedException("Bracket Reader error: could not find the parent node. ");
247                            }
248    
249                            int start = begin;
250    
251                            Iterator<String> inputColumnsIterator = inputColumns.keySet().iterator();
252                            Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator();
253                            boolean noneNode = false;
254                            boolean edgeLabels = false;
255                            for (int i = begin; i < end; i++) {
256                                    if (input.charAt(i) == EDGELABEL_SEPARATOR || (input.charAt(i) == INPUT_SEPARATOR 
257                                                    // Start BracketProgLangReader
258                                                    && (i == begin || input.charAt(i - 1) != '\\')
259                                                    // end BracketProgLangReader    
260                                            ) || i == end - 1) {
261                                            if (i == begin && input.charAt(i) == EDGELABEL_SEPARATOR) {
262                                                    noneNode = true;
263                                            } else if (start == begin) {
264                                                    if ((noneNode && input.charAt(i) != EDGELABEL_SEPARATOR) || !noneNode) {
265                                                            if (inputColumnsIterator.hasNext()) { 
266                                                                    t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(), 
267                                                                                    
268                                                                                    // Start BracketProgLangReader
269                                                                                    decodeString(
270                                                                                    // end BracketProgLangReader
271                                                                                    (i == end - 1)?input.substring(start,end):input.substring(start, i)
272                                                                                    // Start BracketProgLangReader
273                                                                                    )
274                                                                                    // end BracketProgLangReader            
275                                                                                    );
276                                                            }
277                                                            start = i + 1;
278                                                            if (input.charAt(i) == EDGELABEL_SEPARATOR) {
279                                                                    edgeLabels = true;
280                                                            }
281                                                    }
282                                            } else if (edgeLabels && e != null) {
283                                                    if (edgeLabelsColumnsIterator.hasNext()) { 
284                                                            e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i));
285                                                    }
286                                                    start = i + 1;
287                                                    if (input.charAt(i) == INPUT_SEPARATOR
288                                                                    // Start BracketProgLangReader
289                                                                    && (i == begin || input.charAt(i - 1) != '\\')
290                                                                    // end BracketProgLangReader            
291                                                    ) {
292                                                            edgeLabels = false;
293                                                    }
294                                            } else if (input.charAt(i) == EDGELABEL_SEPARATOR && i != end - 1 && (input.charAt(i+1) != INPUT_SEPARATOR
295                                                            // Start BracketProgLangReader
296                                                            && (i == begin || input.charAt(i - 1) != '\\')
297                                                            // end BracketProgLangReader
298                                                            )
299                                            ) {     
300                                            } else {
301                                                    if (inputColumnsIterator.hasNext()) { 
302                                                            t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i));
303                                                    }
304                                                    start = i + 1;
305                                            }
306                                    }
307                            }
308                    } else {
309                            PhraseStructureNode nt;
310                            Edge e = null;
311                            if (parent == null) {
312                                    nt = phraseStructure.getPhraseStructureRoot();
313                            } else {
314                                    nt = phraseStructure.addNonTerminalNode(nonTerminalCounter);
315                                    if (nt == null) {
316                                            close();
317                                            throw new MaltChainedException("Bracket Reader error: could not create a nonterminal node. ");
318                                    } 
319                                    nonTerminalCounter++;
320    
321                                    e = phraseStructure.addPhraseStructureEdge(parent, nt);
322                            }
323                            Iterator<String> phraseLabelColumnsIterator = phraseLabelColumns.keySet().iterator();
324                            Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator();
325                            int newbegin = begin;
326                            int start = begin;
327                            
328                            for (int i = begin; i < index; i++) {
329                                    if (input.charAt(i) == EDGELABEL_SEPARATOR || i == index - 1) {
330                                            if (start == newbegin) {
331                                                    if (phraseLabelColumnsIterator.hasNext()) { 
332                                                            nt.addLabel(phraseLabelColumns.get(phraseLabelColumnsIterator.next()).getSymbolTable(), (i == index - 1)?input.substring(start,index):input.substring(start, i));
333                                                    }
334                                                    start = i + 1;
335                                            } else if (e != null) {
336                                                    if (edgeLabelsColumnsIterator.hasNext()) { 
337                                                            e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == index - 1)?input.substring(start,index):input.substring(start, i));
338                                                    }
339                                                    start = i + 1;
340                                            }
341                                    } else if (input.charAt(i) == BLANK) {
342                                            start++;
343                                            newbegin++;
344                                    }
345                            }
346    
347                            bracketing(phraseStructure, index, end, nt);
348                    }
349            }
350            
351            private String decodeString(String string) {
352                    return string.replace("\\(", "(").replace("\\)", ")").replace("\\ ", " ");
353            }
354            
355            public void readEpilog() throws MaltChainedException {
356                    
357            }
358            
359            public BufferedReader getReader() {
360                    return reader;
361            }
362    
363            public void setReader(BufferedReader reader) {
364                    this.reader = reader;
365            }
366            
367            public int getSentenceCount() throws MaltChainedException {
368                    return sentenceCount;
369            }
370            
371            public void setSentenceCount(int sentenceCount) {
372                    this.sentenceCount = sentenceCount;
373            }
374            
375            public DataFormatInstance getDataFormatInstance() {
376                    return dataFormatInstance;
377            }
378            
379            public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
380                    this.dataFormatInstance = inputDataFormatInstance;
381                    inputColumns = dataFormatInstance.getInputColumnDescriptions();
382                    edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions();
383                    phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions();
384            }
385            
386            public String getOptions() {
387                    return optionString;
388            }
389            
390            public void setOptions(String optionString) throws MaltChainedException {
391                    this.optionString = optionString;
392            }
393            
394            public String getFileName() {
395                    return fileName;
396            }
397    
398            public void setFileName(String fileName) {
399                    this.fileName = fileName;
400            }
401    
402            public URL getUrl() {
403                    return url;
404            }
405    
406            public void setUrl(URL url) {
407                    this.url = url;
408            }
409    
410            public String getCharsetName() {
411                    return charsetName;
412            }
413    
414            public void setCharsetName(String charsetName) {
415                    this.charsetName = charsetName;
416            }
417    
418            public int getNIterations() {
419                    return nIterations;
420            }
421    
422            public void setNIterations(int iterations) {
423                    nIterations = iterations;
424            }
425    
426            public int getIterationCounter() {
427                    return cIterations;
428            }
429            
430            public void close() throws MaltChainedException {
431                    try {
432                            if (reader != null) {
433                                    if (closeStream) {
434                                            reader.close();
435                                    }
436                                    reader = null;
437                            }
438                    }   catch (IOException e) {
439                            throw new DataFormatException("Error when closing the input file.", e);
440                    } 
441            }
442    }