001    package org.maltparser.core.syntaxgraph.reader;
002    
003    import java.io.BufferedReader;
004    import java.io.FileInputStream;
005    import java.io.FileNotFoundException;
006    import java.io.IOException;
007    import java.io.InputStream;
008    import java.io.InputStreamReader;
009    import java.io.UnsupportedEncodingException;
010    import java.net.URL;
011    import java.util.Iterator;
012    import java.util.SortedMap;
013    
014    import org.maltparser.core.exception.MaltChainedException;
015    import org.maltparser.core.io.dataformat.ColumnDescription;
016    import org.maltparser.core.io.dataformat.DataFormatException;
017    import org.maltparser.core.io.dataformat.DataFormatInstance;
018    import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
019    import org.maltparser.core.syntaxgraph.PhraseStructure;
020    import org.maltparser.core.syntaxgraph.TokenStructure;
021    import org.maltparser.core.syntaxgraph.edge.Edge;
022    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
023    import org.maltparser.core.syntaxgraph.node.TokenNode;
024    /**
025    *
026    *
027    * @author Johan Hall
028    */
029    public class BracketReader implements SyntaxGraphReader {
030            private BufferedReader reader;
031            private DataFormatInstance dataFormatInstance;
032            private int sentenceCount;
033            private StringBuilder input;
034            private int terminalCounter; 
035            private int nonTerminalCounter;
036            private String optionString;
037            private SortedMap<String,ColumnDescription> inputColumns;
038            private SortedMap<String,ColumnDescription> edgeLabelColumns;
039            private SortedMap<String,ColumnDescription> phraseLabelColumns;
040            private char STARTING_BRACKET = '(';
041            private char CLOSING_BRACKET = ')';
042            private char INPUT_SEPARATOR = ' ';
043            private char EDGELABEL_SEPARATOR = '-';
044            private char SENTENCE_SEPARATOR = '\n';
045            private char BLANK = ' ';
046            private char CARRIAGE_RETURN = '\r';
047            private char TAB = '\t';
048            
049            public BracketReader() { 
050                    input = new StringBuilder();
051            }
052            
053            public void open(String fileName, String charsetName) throws MaltChainedException {
054                    try {
055                            open(new FileInputStream(fileName), charsetName);
056                    }catch (FileNotFoundException e) {
057                            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
058                    }
059            }
060            public void open(URL url, String charsetName) throws MaltChainedException {
061                    try {
062                            open(url.openStream(), charsetName);
063                    } catch (IOException e) {
064                            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
065                    }
066            }
067            
068            public void open(InputStream is, String charsetName) throws MaltChainedException {
069                    try {
070                            open(new InputStreamReader(is, charsetName));
071                    } catch (UnsupportedEncodingException e) {
072                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
073                    }
074            }
075            
076            public void open(InputStreamReader isr) throws MaltChainedException {
077                    setReader(new BufferedReader(isr));
078                    setSentenceCount(0);
079            }
080            
081            public void readProlog() throws MaltChainedException {
082                    
083            }
084            
085            public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
086                    if (syntaxGraph == null || dataFormatInstance == null) {
087                            return false;
088                    }
089                    syntaxGraph.clear();
090                    int brackets = 0;
091                    try {
092                            int l = reader.read();
093                            char c;
094                            input.setLength(0);
095                    
096                            while (true) {
097                                    if (l == -1) {
098                                            input.setLength(0);
099                                            return false;
100                                    }
101                                    
102                                    c = (char)l; 
103                                    l = reader.read();
104    
105                                    if (c == SENTENCE_SEPARATOR || c == CARRIAGE_RETURN || c == TAB || c == -1) {
106    
107                                    } else if (c == STARTING_BRACKET) {
108                                            input.append(c);
109                                            brackets++;
110                                    } else if (c == CLOSING_BRACKET) {
111                                            input.append(c);
112                                            brackets--;
113                                    } else if (c == INPUT_SEPARATOR) {
114                                            if (l != STARTING_BRACKET && l != CLOSING_BRACKET && l != INPUT_SEPARATOR && l != SENTENCE_SEPARATOR && l != CARRIAGE_RETURN && l != TAB && l != -1) {
115                                                    input.append(c);
116                                            }
117                                    } else if (brackets != 0){
118                                            input.append(c);
119                                    }
120                                    if (brackets == 0 && input.length() != 0) {
121                                            sentenceCount++;
122                                            terminalCounter = 1; 
123                                            nonTerminalCounter = 1;
124                                            if (syntaxGraph instanceof PhraseStructure) {
125                                                    bracketing((PhraseStructure)syntaxGraph, 0, input.length(), null);
126                                                    if (syntaxGraph instanceof MappablePhraseStructureGraph) {
127                                                            ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
128                                                    }
129                                            }
130                                            return true;
131                                    }
132                                    
133                                    if (c == -1) {
134                                            if (brackets != 0) {
135                                                    close();
136                                                    throw new MaltChainedException("Error when reading from the input file. ");
137                                            } 
138                                            return false;
139                                    }
140                            }
141                    }  catch (IOException e) {
142                            close();
143                            throw new MaltChainedException("Error when reading from the input file. ", e);
144                    } 
145                    
146            }
147                    
148            private void bracketing(PhraseStructure phraseStructure, int start, int end, PhraseStructureNode parent) throws MaltChainedException {
149                    int bracketsdepth = 0;
150                    int startpos = start-1;
151                    for (int i = start, n = end; i < n; i++) {
152                            if (input.charAt(i) == STARTING_BRACKET) {
153                                    if (bracketsdepth == 0) {
154                                            startpos = i;
155                                    }
156                                    bracketsdepth++;
157                            } else if (input.charAt(i) == CLOSING_BRACKET) {
158                                    bracketsdepth--;
159                                    if (bracketsdepth == 0) {
160                                            extract(phraseStructure, startpos+1, i, parent);
161                                    }       
162                            }
163                    }
164            }
165    
166            private void extract(PhraseStructure phraseStructure, int begin, int end,  PhraseStructureNode parent) throws MaltChainedException {
167                    int index = -1;
168                    for (int i = begin; i < end; i++) {
169                            if (input.charAt(i) == STARTING_BRACKET) {
170                                    index = i;
171                                    break;
172                            }
173                    }
174                    if (index == -1) {
175                            TokenNode t = phraseStructure.addTokenNode(terminalCounter);
176                            if (t == null) {
177                                    close();
178                                    throw new MaltChainedException("Bracket Reader error: could not create a terminal node. ");
179                            }
180    
181                            terminalCounter++;
182                            Edge e = null;
183    
184                            if (parent != null) {
185                                    e = phraseStructure.addPhraseStructureEdge(parent, (PhraseStructureNode)t);
186                            } else {
187                                    close();
188                                    throw new MaltChainedException("Bracket Reader error: could not find the parent node. ");
189                            }
190    
191                            int start = begin;
192    
193                            Iterator<String> inputColumnsIterator = inputColumns.keySet().iterator();
194                            Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator();
195                            boolean noneNode = false;
196                            boolean edgeLabels = false;
197                            for (int i = begin; i < end; i++) {
198                                    if (input.charAt(i) == EDGELABEL_SEPARATOR || input.charAt(i) == INPUT_SEPARATOR || i == end - 1) {
199                                            if (i == begin && input.charAt(i) == EDGELABEL_SEPARATOR) {
200                                                    noneNode = true;
201                                            } else if (start == begin) {
202                                                    if ((noneNode && input.charAt(i) != EDGELABEL_SEPARATOR) || !noneNode) {
203                                                            if (inputColumnsIterator.hasNext()) { 
204                                                                    t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i));
205                                                            }
206                                                            start = i + 1;
207                                                            if (input.charAt(i) == EDGELABEL_SEPARATOR) {
208                                                                    edgeLabels = true;
209                                                            }
210                                                    }
211                                            } else if (edgeLabels && e != null) {
212                                                    if (edgeLabelsColumnsIterator.hasNext()) { 
213                                                            e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i));
214                                                    }
215                                                    start = i + 1;
216                                                    if (input.charAt(i) == INPUT_SEPARATOR) {
217                                                            edgeLabels = false;
218                                                    }
219                                            } else if (input.charAt(i) == EDGELABEL_SEPARATOR && i != end - 1 && input.charAt(i+1) != INPUT_SEPARATOR) {    
220                                            } else {
221                                                    if (inputColumnsIterator.hasNext()) { 
222                                                            t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i));
223                                                    }
224                                                    start = i + 1;
225                                            }
226                                    }
227                            }
228                    } else {
229                            PhraseStructureNode nt;
230                            Edge e = null;
231                            if (parent == null) {
232                                    nt = phraseStructure.getPhraseStructureRoot();
233                            } else {
234                                    nt = phraseStructure.addNonTerminalNode(nonTerminalCounter);
235                                    if (nt == null) {
236                                            close();
237                                            throw new MaltChainedException("Bracket Reader error: could not create a nonterminal node. ");
238                                    } 
239                                    nonTerminalCounter++;
240    
241                                    e = phraseStructure.addPhraseStructureEdge(parent, nt);
242                            }
243                            Iterator<String> phraseLabelColumnsIterator = phraseLabelColumns.keySet().iterator();
244                            Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator();
245                            int newbegin = begin;
246                            int start = begin;
247                            
248                            for (int i = begin; i < index; i++) {
249                                    if (input.charAt(i) == EDGELABEL_SEPARATOR || i == index - 1) {
250                                            if (start == newbegin) {
251                                                    if (phraseLabelColumnsIterator.hasNext()) { 
252                                                            nt.addLabel(phraseLabelColumns.get(phraseLabelColumnsIterator.next()).getSymbolTable(), (i == index - 1)?input.substring(start,index):input.substring(start, i));
253                                                    }
254                                                    start = i + 1;
255                                            } else if (e != null) {
256                                                    if (edgeLabelsColumnsIterator.hasNext()) { 
257                                                            e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == index - 1)?input.substring(start,index):input.substring(start, i));
258                                                    }
259                                                    start = i + 1;
260                                            }
261                                    } else if (input.charAt(i) == BLANK) {
262                                            start++;
263                                            newbegin++;
264                                    }
265                            }
266    
267                            bracketing(phraseStructure, index, end, nt);
268                    }
269            }
270            
271            public void readEpilog() throws MaltChainedException {
272                    
273            }
274            
275            public BufferedReader getReader() {
276                    return reader;
277            }
278    
279            public void setReader(BufferedReader reader) {
280                    this.reader = reader;
281            }
282            
283            public int getSentenceCount() throws MaltChainedException {
284                    return sentenceCount;
285            }
286            
287            public void setSentenceCount(int sentenceCount) {
288                    this.sentenceCount = sentenceCount;
289            }
290            
291            public DataFormatInstance getDataFormatInstance() {
292                    return dataFormatInstance;
293            }
294            
295            public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
296                    this.dataFormatInstance = inputDataFormatInstance;
297                    inputColumns = dataFormatInstance.getInputColumnDescriptions();
298                    edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions();
299                    phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions();
300            }
301            
302            public String getOptions() {
303                    return optionString;
304            }
305            
306            public void setOptions(String optionString) throws MaltChainedException {
307                    this.optionString = optionString;
308            }
309            
310            public void close() throws MaltChainedException {
311                    try {
312                            if (reader != null) {
313                                    reader.close();
314                            }
315                            reader = null;
316                    }   catch (IOException e) {
317                            throw new DataFormatException("Error when closing the input file.", e);
318                    } 
319            }
320    }