001package org.maltparser.core.syntaxgraph.reader;
002
003import java.io.BufferedReader;
004import java.io.FileInputStream;
005import java.io.FileNotFoundException;
006import java.io.IOException;
007import java.io.InputStream;
008import java.io.InputStreamReader;
009import java.io.UnsupportedEncodingException;
010import java.net.URL;
011import java.util.Iterator;
012import java.util.SortedMap;
013
014import org.maltparser.core.exception.MaltChainedException;
015import org.maltparser.core.io.dataformat.ColumnDescription;
016import org.maltparser.core.io.dataformat.DataFormatException;
017import org.maltparser.core.io.dataformat.DataFormatInstance;
018import org.maltparser.core.symbol.SymbolTableHandler;
019import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
020import org.maltparser.core.syntaxgraph.PhraseStructure;
021import org.maltparser.core.syntaxgraph.TokenStructure;
022import org.maltparser.core.syntaxgraph.edge.Edge;
023import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
024import org.maltparser.core.syntaxgraph.node.TokenNode;
025/**
026*
027*
028* @author Johan Hall
029*/
030public class BracketReader implements SyntaxGraphReader {
031        private BufferedReader reader;
032        private DataFormatInstance dataFormatInstance;
033        private int sentenceCount;
034        private StringBuilder input;
035        private int terminalCounter; 
036        private int nonTerminalCounter;
037        private String optionString;
038        private SortedMap<String,ColumnDescription> inputColumns;
039        private SortedMap<String,ColumnDescription> edgeLabelColumns;
040        private SortedMap<String,ColumnDescription> phraseLabelColumns;
041        
042        private String fileName = null;
043        private URL url = null;
044        private String charsetName;
045        private int nIterations;
046        private int cIterations;
047        private boolean closeStream = true;
048        
049        private char STARTING_BRACKET = '(';
050        private char CLOSING_BRACKET = ')';
051        private char INPUT_SEPARATOR = ' ';
052        private char EDGELABEL_SEPARATOR = '-';
053        private char SENTENCE_SEPARATOR = '\n';
054        private char BLANK = ' ';
055        private char CARRIAGE_RETURN = '\r';
056        private char TAB = '\t';
057        
058        public BracketReader() { 
059                input = new StringBuilder();
060                nIterations = 1;
061                cIterations = 1;
062        }
063        
064        private void reopen() throws MaltChainedException {
065                close();
066                if (fileName != null) {
067                        open(fileName, charsetName);
068                } else if (url != null) {
069                        open(url, charsetName);
070                } else {
071                        throw new DataFormatException("The input stream cannot be reopen. ");
072                }
073        }
074        
075        public void open(String fileName, String charsetName) throws MaltChainedException {
076                setFileName(fileName);
077                setCharsetName(charsetName);
078                try {
079                        open(new FileInputStream(fileName), charsetName);
080                }catch (FileNotFoundException e) {
081                        throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
082                }
083        }
084        public void open(URL url, String charsetName) throws MaltChainedException {
085                setUrl(url);
086                setCharsetName(charsetName);
087                try {
088                        open(url.openStream(), charsetName);
089                } catch (IOException e) {
090                        throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
091                }
092        }
093        
094        public void open(InputStream is, String charsetName) throws MaltChainedException {
095                try {
096                        if (is == System.in) {
097                                closeStream = false;
098                        }
099                        open(new InputStreamReader(is, charsetName));
100                } catch (UnsupportedEncodingException e) {
101                        throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
102                }
103        }
104        
105        private void open(InputStreamReader isr) throws MaltChainedException {
106                setReader(new BufferedReader(isr));
107                setSentenceCount(0);
108        }
109        
110        public void readProlog() throws MaltChainedException {
111                
112        }
113        
114        public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
115                if (syntaxGraph == null || dataFormatInstance == null) {
116                        return false;
117                }
118                syntaxGraph.clear();
119                int brackets = 0;
120                try {
121                        int l = reader.read();
122                        char c;
123                        input.setLength(0);
124                
125                        while (true) {
126                                if (l == -1) {
127                                        input.setLength(0);
128                                        return false;
129                                }
130                                
131                                c = (char)l; 
132                                l = reader.read();
133
134                                if (c == SENTENCE_SEPARATOR || c == CARRIAGE_RETURN || c == TAB || c == -1) {
135
136                                } else if (c == STARTING_BRACKET) {
137                                        input.append(c);
138                                        brackets++;
139                                } else if (c == CLOSING_BRACKET) {
140                                        input.append(c);
141                                        brackets--;
142                                } else if (c == INPUT_SEPARATOR) {
143                                        if (l != STARTING_BRACKET && l != CLOSING_BRACKET && l != INPUT_SEPARATOR && l != SENTENCE_SEPARATOR && l != CARRIAGE_RETURN && l != TAB && l != -1) {
144                                                input.append(c);
145                                        }
146                                // Start BracketProgLangReader
147                                } else if (c == '\\') {
148                                        c = (char) l;
149                                        l = reader.read();
150                                        if (c != ' ' && c != '(' && c != ')' && c != '\\' && c != 'n' && c != 'r' && c != 't' && c != '\"' && c != '\'') {
151//                                              System.out.println("Error");
152                                                System.exit(1);
153                                        } else {
154                                                input.append("\\" + c);
155                                        }
156                                // End BracketProgLangReader
157                                } else if (brackets != 0){
158                                        input.append(c);
159                                }
160                                if (brackets == 0 && input.length() != 0) {
161                                        sentenceCount++;
162                                        terminalCounter = 1; 
163                                        nonTerminalCounter = 1;
164                                        if (syntaxGraph instanceof PhraseStructure) {
165                                                bracketing((PhraseStructure)syntaxGraph, 0, input.length(), null);
166                                                if (syntaxGraph instanceof MappablePhraseStructureGraph) {
167                                                        ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
168                                                }
169                                        }
170                                        return true;
171                                }
172                                
173                                if (c == -1) {
174                                        if (brackets != 0) {
175                                                close();
176                                                throw new MaltChainedException("Error when reading from the input file. ");
177                                        }
178                                        if (cIterations < nIterations) {
179                                                cIterations++;
180                                                reopen();
181                                                return true;
182                                        }
183                                        return false;
184                                }
185                        }
186                }  catch (IOException e) {
187                        close();
188                        throw new MaltChainedException("Error when reading from the input file. ", e);
189                } 
190                
191        }
192                
193        private void bracketing(PhraseStructure phraseStructure, int start, int end, PhraseStructureNode parent) throws MaltChainedException {
194                int bracketsdepth = 0;
195                int startpos = start-1;
196                for (int i = start, n = end; i < n; i++) {
197                        if (input.charAt(i) == STARTING_BRACKET
198                                        // Start BracketProgLangReader
199                                        && (i == 0 || input.charAt(i - 1) != '\\')
200                                        // end BracketProgLangReader
201                        
202                        ) {
203                                if (bracketsdepth == 0) {
204                                        startpos = i;
205                                }
206                                bracketsdepth++;
207                        } else if (input.charAt(i) == CLOSING_BRACKET
208                                        // Start BracketProgLangReader
209                                        && (i == 0 || input.charAt(i - 1) != '\\')
210                                        // end BracketProgLangReader
211                        ) {
212                                bracketsdepth--;
213                                if (bracketsdepth == 0) {
214                                        extract(phraseStructure, startpos+1, i, parent);
215                                }       
216                        }
217                }
218        }
219
220        private void extract(PhraseStructure phraseStructure, int begin, int end,  PhraseStructureNode parent) throws MaltChainedException {
221                SymbolTableHandler symbolTables = phraseStructure.getSymbolTables();
222                int index = -1;
223                for (int i = begin; i < end; i++) {
224                        if (input.charAt(i) == STARTING_BRACKET
225                                        // Start BracketProgLangReader
226                                        && (i == begin || input.charAt(i - 1) != '\\')
227                                        // end BracketProgLangReader            
228                        ) {
229                                index = i;
230                                break;
231                        }
232                }
233                if (index == -1) {
234                        TokenNode t = phraseStructure.addTokenNode(terminalCounter);
235                        if (t == null) {
236                                close();
237                                throw new MaltChainedException("Bracket Reader error: could not create a terminal node. ");
238                        }
239
240                        terminalCounter++;
241                        Edge e = null;
242
243                        if (parent != null) {
244                                e = phraseStructure.addPhraseStructureEdge(parent, (PhraseStructureNode)t);
245                        } else {
246                                close();
247                                throw new MaltChainedException("Bracket Reader error: could not find the parent node. ");
248                        }
249
250                        int start = begin;
251
252                        Iterator<String> inputColumnsIterator = inputColumns.keySet().iterator();
253                        Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator();
254                        boolean noneNode = false;
255                        boolean edgeLabels = false;
256                        for (int i = begin; i < end; i++) {
257                                if (input.charAt(i) == EDGELABEL_SEPARATOR || (input.charAt(i) == INPUT_SEPARATOR 
258                                                // Start BracketProgLangReader
259                                                && (i == begin || input.charAt(i - 1) != '\\')
260                                                // end BracketProgLangReader    
261                                        ) || i == end - 1) {
262                                        if (i == begin && input.charAt(i) == EDGELABEL_SEPARATOR) {
263                                                noneNode = true;
264                                        } else if (start == begin) {
265                                                if ((noneNode && input.charAt(i) != EDGELABEL_SEPARATOR) || !noneNode) {
266                                                        if (inputColumnsIterator.hasNext()) { 
267                                                                
268                                                                t.addLabel(symbolTables.getSymbolTable(inputColumns.get(inputColumnsIterator.next()).getName()), 
269                                                                                                
270                                                                                // Start BracketProgLangReader
271                                                                                decodeString(
272                                                                                // end BracketProgLangReader
273                                                                                (i == end - 1)?input.substring(start,end):input.substring(start, i)
274                                                                                // Start BracketProgLangReader
275                                                                                )
276                                                                                // end BracketProgLangReader            
277                                                                                );
278                                                        }
279                                                        start = i + 1;
280                                                        if (input.charAt(i) == EDGELABEL_SEPARATOR) {
281                                                                edgeLabels = true;
282                                                        }
283                                                }
284                                        } else if (edgeLabels && e != null) {
285                                                if (edgeLabelsColumnsIterator.hasNext()) { 
286                                                        e.addLabel(symbolTables.getSymbolTable(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getName()), (i == end - 1)?input.substring(start,end):input.substring(start, i));
287                                                }
288                                                start = i + 1;
289                                                if (input.charAt(i) == INPUT_SEPARATOR
290                                                                // Start BracketProgLangReader
291                                                                && (i == begin || input.charAt(i - 1) != '\\')
292                                                                // end BracketProgLangReader            
293                                                ) {
294                                                        edgeLabels = false;
295                                                }
296                                        } else if (input.charAt(i) == EDGELABEL_SEPARATOR && i != end - 1 && (input.charAt(i+1) != INPUT_SEPARATOR
297                                                        // Start BracketProgLangReader
298                                                        && (i == begin || input.charAt(i - 1) != '\\')
299                                                        // end BracketProgLangReader
300                                                        )
301                                        ) {     
302                                        } else {
303                                                if (inputColumnsIterator.hasNext()) { 
304                                                        t.addLabel(symbolTables.getSymbolTable(inputColumns.get(inputColumnsIterator.next()).getName()), (i == end - 1)?input.substring(start,end):input.substring(start, i));
305                                                }
306                                                start = i + 1;
307                                        }
308                                }
309                        }
310                } else {
311                        PhraseStructureNode nt;
312                        Edge e = null;
313                        if (parent == null) {
314                                nt = phraseStructure.getPhraseStructureRoot();
315                        } else {
316                                nt = phraseStructure.addNonTerminalNode(nonTerminalCounter);
317                                if (nt == null) {
318                                        close();
319                                        throw new MaltChainedException("Bracket Reader error: could not create a nonterminal node. ");
320                                } 
321                                nonTerminalCounter++;
322
323                                e = phraseStructure.addPhraseStructureEdge(parent, nt);
324                        }
325                        Iterator<String> phraseLabelColumnsIterator = phraseLabelColumns.keySet().iterator();
326                        Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator();
327                        int newbegin = begin;
328                        int start = begin;
329                        
330                        for (int i = begin; i < index; i++) {
331                                if (input.charAt(i) == EDGELABEL_SEPARATOR || i == index - 1) {
332                                        if (start == newbegin) {
333                                                if (phraseLabelColumnsIterator.hasNext()) { 
334                                                        nt.addLabel(symbolTables.getSymbolTable(phraseLabelColumns.get(phraseLabelColumnsIterator.next()).getName()), (i == index - 1)?input.substring(start,index):input.substring(start, i));
335                                                }
336                                                start = i + 1;
337                                        } else if (e != null) {
338                                                if (edgeLabelsColumnsIterator.hasNext()) { 
339                                                        e.addLabel(symbolTables.getSymbolTable(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getName()), (i == index - 1)?input.substring(start,index):input.substring(start, i));
340                                                }
341                                                start = i + 1;
342                                        }
343                                } else if (input.charAt(i) == BLANK) {
344                                        start++;
345                                        newbegin++;
346                                }
347                        }
348
349                        bracketing(phraseStructure, index, end, nt);
350                }
351        }
352        
353        private String decodeString(String string) {
354                return string.replace("\\(", "(").replace("\\)", ")").replace("\\ ", " ");
355        }
356        
357        public void readEpilog() throws MaltChainedException {
358                
359        }
360        
361        public BufferedReader getReader() {
362                return reader;
363        }
364
365        public void setReader(BufferedReader reader) {
366                this.reader = reader;
367        }
368        
369        public int getSentenceCount() throws MaltChainedException {
370                return sentenceCount;
371        }
372        
373        public void setSentenceCount(int sentenceCount) {
374                this.sentenceCount = sentenceCount;
375        }
376        
377        public DataFormatInstance getDataFormatInstance() {
378                return dataFormatInstance;
379        }
380        
381        public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
382                this.dataFormatInstance = inputDataFormatInstance;
383                inputColumns = dataFormatInstance.getInputColumnDescriptions();
384                edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions();
385                phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions();
386        }
387        
388        public String getOptions() {
389                return optionString;
390        }
391        
392        public void setOptions(String optionString) throws MaltChainedException {
393                this.optionString = optionString;
394        }
395        
396        public String getFileName() {
397                return fileName;
398        }
399
400        public void setFileName(String fileName) {
401                this.fileName = fileName;
402        }
403
404        public URL getUrl() {
405                return url;
406        }
407
408        public void setUrl(URL url) {
409                this.url = url;
410        }
411
412        public String getCharsetName() {
413                return charsetName;
414        }
415
416        public void setCharsetName(String charsetName) {
417                this.charsetName = charsetName;
418        }
419
420        public int getNIterations() {
421                return nIterations;
422        }
423
424        public void setNIterations(int iterations) {
425                nIterations = iterations;
426        }
427
428        public int getIterationCounter() {
429                return cIterations;
430        }
431        
432        public void close() throws MaltChainedException {
433                try {
434                        if (reader != null) {
435                                if (closeStream) {
436                                        reader.close();
437                                }
438                                reader = null;
439                        }
440                }   catch (IOException e) {
441                        throw new DataFormatException("Error when closing the input file.", e);
442                } 
443        }
444}