001    package org.maltparser.core.syntaxgraph.reader;
002    
003    import java.io.BufferedReader;
004    import java.io.FileInputStream;
005    import java.io.FileNotFoundException;
006    import java.io.IOException;
007    import java.io.InputStream;
008    import java.io.InputStreamReader;
009    import java.io.UnsupportedEncodingException;
010    import java.net.URL;
011    import java.util.Iterator;
012    import java.util.SortedMap;
013    
014    import org.maltparser.core.exception.MaltChainedException;
015    import org.maltparser.core.io.dataformat.ColumnDescription;
016    import org.maltparser.core.io.dataformat.DataFormatException;
017    import org.maltparser.core.io.dataformat.DataFormatInstance;
018    import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
019    import org.maltparser.core.syntaxgraph.PhraseStructure;
020    import org.maltparser.core.syntaxgraph.TokenStructure;
021    import org.maltparser.core.syntaxgraph.edge.Edge;
022    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
023    import org.maltparser.core.syntaxgraph.node.TokenNode;
024    /**
025    *
026    *
027    * @author Johan Hall
028    */
029    public class BracketReader implements SyntaxGraphReader {
030            private BufferedReader reader;
031            private DataFormatInstance dataFormatInstance;
032            private int sentenceCount;
033            private StringBuilder input;
034            private int terminalCounter; 
035            private int nonTerminalCounter;
036            private String optionString;
037            private SortedMap<String,ColumnDescription> inputColumns;
038            private SortedMap<String,ColumnDescription> edgeLabelColumns;
039            private SortedMap<String,ColumnDescription> phraseLabelColumns;
040            
041            private String fileName = null;
042            private URL url = null;
043            private String charsetName;
044            private int nIterations;
045            private int cIterations;
046            
047            private char STARTING_BRACKET = '(';
048            private char CLOSING_BRACKET = ')';
049            private char INPUT_SEPARATOR = ' ';
050            private char EDGELABEL_SEPARATOR = '-';
051            private char SENTENCE_SEPARATOR = '\n';
052            private char BLANK = ' ';
053            private char CARRIAGE_RETURN = '\r';
054            private char TAB = '\t';
055            
056            public BracketReader() { 
057                    input = new StringBuilder();
058                    nIterations = 1;
059                    cIterations = 1;
060            }
061            
062            private void reopen() throws MaltChainedException {
063                    close();
064                    if (fileName != null) {
065                            open(fileName, charsetName);
066                    } else if (url != null) {
067                            open(url, charsetName);
068                    } else {
069                            throw new DataFormatException("The input stream cannot be reopen. ");
070                    }
071            }
072            
073            public void open(String fileName, String charsetName) throws MaltChainedException {
074                    setFileName(fileName);
075                    setCharsetName(charsetName);
076                    try {
077                            open(new FileInputStream(fileName), charsetName);
078                    }catch (FileNotFoundException e) {
079                            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
080                    }
081            }
082            public void open(URL url, String charsetName) throws MaltChainedException {
083                    setUrl(url);
084                    setCharsetName(charsetName);
085                    try {
086                            open(url.openStream(), charsetName);
087                    } catch (IOException e) {
088                            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
089                    }
090            }
091            
092            public void open(InputStream is, String charsetName) throws MaltChainedException {
093                    try {
094                            open(new InputStreamReader(is, charsetName));
095                    } catch (UnsupportedEncodingException e) {
096                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
097                    }
098            }
099            
100            public void open(InputStreamReader isr) throws MaltChainedException {
101                    setReader(new BufferedReader(isr));
102                    setSentenceCount(0);
103            }
104            
105            public void readProlog() throws MaltChainedException {
106                    
107            }
108            
109            public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
110                    if (syntaxGraph == null || dataFormatInstance == null) {
111                            return false;
112                    }
113                    syntaxGraph.clear();
114                    int brackets = 0;
115                    try {
116                            int l = reader.read();
117                            char c;
118                            input.setLength(0);
119                    
120                            while (true) {
121                                    if (l == -1) {
122                                            input.setLength(0);
123                                            return false;
124                                    }
125                                    
126                                    c = (char)l; 
127                                    l = reader.read();
128    
129                                    if (c == SENTENCE_SEPARATOR || c == CARRIAGE_RETURN || c == TAB || c == -1) {
130    
131                                    } else if (c == STARTING_BRACKET) {
132                                            input.append(c);
133                                            brackets++;
134                                    } else if (c == CLOSING_BRACKET) {
135                                            input.append(c);
136                                            brackets--;
137                                    } else if (c == INPUT_SEPARATOR) {
138                                            if (l != STARTING_BRACKET && l != CLOSING_BRACKET && l != INPUT_SEPARATOR && l != SENTENCE_SEPARATOR && l != CARRIAGE_RETURN && l != TAB && l != -1) {
139                                                    input.append(c);
140                                            }
141                                    // Start BracketProgLangReader
142                                    } else if (c == '\\') {
143                                            c = (char) l;
144                                            l = reader.read();
145                                            if (c != ' ' && c != '(' && c != ')' && c != '\\' && c != 'n' && c != 'r' && c != 't' && c != '\"' && c != '\'') {
146                                                    System.out.println("Error");
147                                                    System.exit(1);
148                                            } else {
149                                                    input.append("\\" + c);
150                                            }
151                                    // End BracketProgLangReader
152                                    } else if (brackets != 0){
153                                            input.append(c);
154                                    }
155                                    if (brackets == 0 && input.length() != 0) {
156                                            sentenceCount++;
157                                            terminalCounter = 1; 
158                                            nonTerminalCounter = 1;
159                                            if (syntaxGraph instanceof PhraseStructure) {
160                                                    bracketing((PhraseStructure)syntaxGraph, 0, input.length(), null);
161                                                    if (syntaxGraph instanceof MappablePhraseStructureGraph) {
162                                                            ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
163                                                    }
164                                            }
165                                            return true;
166                                    }
167                                    
168                                    if (c == -1) {
169                                            if (brackets != 0) {
170                                                    close();
171                                                    throw new MaltChainedException("Error when reading from the input file. ");
172                                            }
173                                            if (cIterations < nIterations) {
174                                                    cIterations++;
175                                                    reopen();
176                                                    return true;
177                                            }
178                                            return false;
179                                    }
180                            }
181                    }  catch (IOException e) {
182                            close();
183                            throw new MaltChainedException("Error when reading from the input file. ", e);
184                    } 
185                    
186            }
187                    
188            private void bracketing(PhraseStructure phraseStructure, int start, int end, PhraseStructureNode parent) throws MaltChainedException {
189                    int bracketsdepth = 0;
190                    int startpos = start-1;
191                    for (int i = start, n = end; i < n; i++) {
192                            if (input.charAt(i) == STARTING_BRACKET
193                                            // Start BracketProgLangReader
194                                            && (i == 0 || input.charAt(i - 1) != '\\')
195                                            // end BracketProgLangReader
196                            
197                            ) {
198                                    if (bracketsdepth == 0) {
199                                            startpos = i;
200                                    }
201                                    bracketsdepth++;
202                            } else if (input.charAt(i) == CLOSING_BRACKET
203                                            // Start BracketProgLangReader
204                                            && (i == 0 || input.charAt(i - 1) != '\\')
205                                            // end BracketProgLangReader
206                            ) {
207                                    bracketsdepth--;
208                                    if (bracketsdepth == 0) {
209                                            extract(phraseStructure, startpos+1, i, parent);
210                                    }       
211                            }
212                    }
213            }
214    
215            private void extract(PhraseStructure phraseStructure, int begin, int end,  PhraseStructureNode parent) throws MaltChainedException {
216                    int index = -1;
217                    for (int i = begin; i < end; i++) {
218                            if (input.charAt(i) == STARTING_BRACKET
219                                            // Start BracketProgLangReader
220                                            && (i == begin || input.charAt(i - 1) != '\\')
221                                            // end BracketProgLangReader            
222                            ) {
223                                    index = i;
224                                    break;
225                            }
226                    }
227                    if (index == -1) {
228                            TokenNode t = phraseStructure.addTokenNode(terminalCounter);
229                            if (t == null) {
230                                    close();
231                                    throw new MaltChainedException("Bracket Reader error: could not create a terminal node. ");
232                            }
233    
234                            terminalCounter++;
235                            Edge e = null;
236    
237                            if (parent != null) {
238                                    e = phraseStructure.addPhraseStructureEdge(parent, (PhraseStructureNode)t);
239                            } else {
240                                    close();
241                                    throw new MaltChainedException("Bracket Reader error: could not find the parent node. ");
242                            }
243    
244                            int start = begin;
245    
246                            Iterator<String> inputColumnsIterator = inputColumns.keySet().iterator();
247                            Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator();
248                            boolean noneNode = false;
249                            boolean edgeLabels = false;
250                            for (int i = begin; i < end; i++) {
251                                    if (input.charAt(i) == EDGELABEL_SEPARATOR || (input.charAt(i) == INPUT_SEPARATOR 
252                                                    // Start BracketProgLangReader
253                                                    && (i == begin || input.charAt(i - 1) != '\\')
254                                                    // end BracketProgLangReader    
255                                            ) || i == end - 1) {
256                                            if (i == begin && input.charAt(i) == EDGELABEL_SEPARATOR) {
257                                                    noneNode = true;
258                                            } else if (start == begin) {
259                                                    if ((noneNode && input.charAt(i) != EDGELABEL_SEPARATOR) || !noneNode) {
260                                                            if (inputColumnsIterator.hasNext()) { 
261                                                                    t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(), 
262                                                                                    
263                                                                                    // Start BracketProgLangReader
264                                                                                    decodeString(
265                                                                                    // end BracketProgLangReader
266                                                                                    (i == end - 1)?input.substring(start,end):input.substring(start, i)
267                                                                                    // Start BracketProgLangReader
268                                                                                    )
269                                                                                    // end BracketProgLangReader            
270                                                                                    );
271                                                            }
272                                                            start = i + 1;
273                                                            if (input.charAt(i) == EDGELABEL_SEPARATOR) {
274                                                                    edgeLabels = true;
275                                                            }
276                                                    }
277                                            } else if (edgeLabels && e != null) {
278                                                    if (edgeLabelsColumnsIterator.hasNext()) { 
279                                                            e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i));
280                                                    }
281                                                    start = i + 1;
282                                                    if (input.charAt(i) == INPUT_SEPARATOR
283                                                                    // Start BracketProgLangReader
284                                                                    && (i == begin || input.charAt(i - 1) != '\\')
285                                                                    // end BracketProgLangReader            
286                                                    ) {
287                                                            edgeLabels = false;
288                                                    }
289                                            } else if (input.charAt(i) == EDGELABEL_SEPARATOR && i != end - 1 && (input.charAt(i+1) != INPUT_SEPARATOR
290                                                            // Start BracketProgLangReader
291                                                            && (i == begin || input.charAt(i - 1) != '\\')
292                                                            // end BracketProgLangReader
293                                                            )
294                                            ) {     
295                                            } else {
296                                                    if (inputColumnsIterator.hasNext()) { 
297                                                            t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i));
298                                                    }
299                                                    start = i + 1;
300                                            }
301                                    }
302                            }
303                    } else {
304                            PhraseStructureNode nt;
305                            Edge e = null;
306                            if (parent == null) {
307                                    nt = phraseStructure.getPhraseStructureRoot();
308                            } else {
309                                    nt = phraseStructure.addNonTerminalNode(nonTerminalCounter);
310                                    if (nt == null) {
311                                            close();
312                                            throw new MaltChainedException("Bracket Reader error: could not create a nonterminal node. ");
313                                    } 
314                                    nonTerminalCounter++;
315    
316                                    e = phraseStructure.addPhraseStructureEdge(parent, nt);
317                            }
318                            Iterator<String> phraseLabelColumnsIterator = phraseLabelColumns.keySet().iterator();
319                            Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator();
320                            int newbegin = begin;
321                            int start = begin;
322                            
323                            for (int i = begin; i < index; i++) {
324                                    if (input.charAt(i) == EDGELABEL_SEPARATOR || i == index - 1) {
325                                            if (start == newbegin) {
326                                                    if (phraseLabelColumnsIterator.hasNext()) { 
327                                                            nt.addLabel(phraseLabelColumns.get(phraseLabelColumnsIterator.next()).getSymbolTable(), (i == index - 1)?input.substring(start,index):input.substring(start, i));
328                                                    }
329                                                    start = i + 1;
330                                            } else if (e != null) {
331                                                    if (edgeLabelsColumnsIterator.hasNext()) { 
332                                                            e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == index - 1)?input.substring(start,index):input.substring(start, i));
333                                                    }
334                                                    start = i + 1;
335                                            }
336                                    } else if (input.charAt(i) == BLANK) {
337                                            start++;
338                                            newbegin++;
339                                    }
340                            }
341    
342                            bracketing(phraseStructure, index, end, nt);
343                    }
344            }
345            
346            private String decodeString(String string) {
347                    return string.replace("\\(", "(").replace("\\)", ")").replace("\\ ", " ");
348            }
349            
350            public void readEpilog() throws MaltChainedException {
351                    
352            }
353            
354            public BufferedReader getReader() {
355                    return reader;
356            }
357    
358            public void setReader(BufferedReader reader) {
359                    this.reader = reader;
360            }
361            
362            public int getSentenceCount() throws MaltChainedException {
363                    return sentenceCount;
364            }
365            
366            public void setSentenceCount(int sentenceCount) {
367                    this.sentenceCount = sentenceCount;
368            }
369            
370            public DataFormatInstance getDataFormatInstance() {
371                    return dataFormatInstance;
372            }
373            
374            public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
375                    this.dataFormatInstance = inputDataFormatInstance;
376                    inputColumns = dataFormatInstance.getInputColumnDescriptions();
377                    edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions();
378                    phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions();
379            }
380            
381            public String getOptions() {
382                    return optionString;
383            }
384            
385            public void setOptions(String optionString) throws MaltChainedException {
386                    this.optionString = optionString;
387            }
388            
389            public String getFileName() {
390                    return fileName;
391            }
392    
393            public void setFileName(String fileName) {
394                    this.fileName = fileName;
395            }
396    
397            public URL getUrl() {
398                    return url;
399            }
400    
401            public void setUrl(URL url) {
402                    this.url = url;
403            }
404    
405            public String getCharsetName() {
406                    return charsetName;
407            }
408    
409            public void setCharsetName(String charsetName) {
410                    this.charsetName = charsetName;
411            }
412    
413            public int getNIterations() {
414                    return nIterations;
415            }
416    
417            public void setNIterations(int iterations) {
418                    nIterations = iterations;
419            }
420    
421            public int getIterationCounter() {
422                    return cIterations;
423            }
424            
425            public void close() throws MaltChainedException {
426                    try {
427                            if (reader != null) {
428                                    reader.close();
429                            }
430                            reader = null;
431                    }   catch (IOException e) {
432                            throw new DataFormatException("Error when closing the input file.", e);
433                    } 
434            }
435    }