001    package org.maltparser.core.syntaxgraph.reader;
002    
003    import java.io.BufferedReader;
004    import java.io.FileInputStream;
005    import java.io.FileNotFoundException;
006    import java.io.IOException;
007    import java.io.InputStream;
008    import java.io.InputStreamReader;
009    import java.io.UnsupportedEncodingException;
010    import java.net.URL;
011    import java.util.Iterator;
012    
013    import org.maltparser.core.exception.MaltChainedException;
014    import org.maltparser.core.io.dataformat.ColumnDescription;
015    import org.maltparser.core.io.dataformat.DataFormatException;
016    import org.maltparser.core.io.dataformat.DataFormatInstance;
017    import org.maltparser.core.syntaxgraph.DependencyStructure;
018    import org.maltparser.core.syntaxgraph.Element;
019    import org.maltparser.core.syntaxgraph.TokenStructure;
020    import org.maltparser.core.syntaxgraph.edge.Edge;
021    /**
022    *
023    *
024    * @author Johan Hall
025    */
026    public class TabReader implements SyntaxGraphReader {
027            private BufferedReader reader;
028            private int sentenceCount;
029            private final StringBuilder input;
030            private DataFormatInstance dataFormatInstance;
031            private static final String IGNORE_COLUMN_SIGN = "_";
032            private static final char TAB = '\t';
033            private static final char NEWLINE = '\n';
034            private static final char CARRIAGE_RETURN = '\r';
035            private String fileName = null;
036            private URL url = null;
037            private String charsetName;
038            private int nIterations;
039            private int cIterations;
040            
041            
042            public TabReader() { 
043                    input = new StringBuilder();
044                    nIterations = 1;
045                    cIterations = 1;
046            }
047            
048            private void reopen() throws MaltChainedException {
049                    close();
050                    if (fileName != null) {
051                            open(fileName, charsetName);
052                    } else if (url != null) {
053                            open(url, charsetName);
054                    } else {
055                            throw new DataFormatException("The input stream cannot be reopen. ");
056                    }
057            }
058            
059            public void open(String fileName, String charsetName) throws MaltChainedException {
060                    setFileName(fileName);
061                    setCharsetName(charsetName);
062                    try {
063                            open(new FileInputStream(fileName), charsetName);
064                    } catch (FileNotFoundException e) {
065                            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
066                    }
067            }
068            
069            public void open(URL url, String charsetName) throws MaltChainedException {
070                    setUrl(url);
071                    setCharsetName(charsetName);
072                    if (url == null) {
073                            throw new DataFormatException("The input file cannot be found. ");
074                    }
075                    try {
076                            open(url.openStream(), charsetName);
077                    } catch (IOException e) {
078                            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
079                    }
080            }
081            
082            public void open(InputStream is, String charsetName) throws MaltChainedException {
083                    try {
084                            open(new InputStreamReader(is, charsetName));
085                    } catch (UnsupportedEncodingException e) {
086                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
087                    }
088            }
089            
090            public void open(InputStreamReader isr) throws MaltChainedException {
091                    setReader(new BufferedReader(isr));
092                    setSentenceCount(0);
093            }
094            
095            public void readProlog() throws MaltChainedException {
096                    
097            }
098            
099            public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
100                    if (syntaxGraph == null || dataFormatInstance == null) {
101                            return false;
102                    }
103                    
104                    Element node = null;
105                    Edge edge = null;
106                    input.setLength(0);
107                    int i = 0;
108                    int terminalCounter = 0;
109                    int nNewLines = 0;
110                    syntaxGraph.clear();
111                    Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
112                    while (true) {
113                            int c;
114    
115                            try {
116                                    c = reader.read();
117                            } catch (IOException e) {
118                                    close();
119                                    throw new DataFormatException("Error when reading from the input file. ", e);
120                            }
121                            if (c == TAB || c == NEWLINE || c == CARRIAGE_RETURN || c == -1) {
122                                    if (input.length() != 0) {                                      
123                                            if (i == 0) {
124                                                    terminalCounter++;
125                                                    node = syntaxGraph.addTokenNode(terminalCounter);
126                                            }
127                                            ColumnDescription column = null;
128                                            if (columns.hasNext()) {
129                                                    column = columns.next();
130                                                    if (column.getCategory() == ColumnDescription.INPUT && node != null) {
131                                                            syntaxGraph.addLabel(node, column.getName(), input.toString());
132                                                    } else if (column.getCategory() == ColumnDescription.HEAD) {
133                                                            if (syntaxGraph instanceof DependencyStructure) {
134                                                                    if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix
135                                                                    //if (!input.toString().equals(IGNORE_COLUMN_SIGN)) {
136                                                                            edge = ((DependencyStructure)syntaxGraph).addDependencyEdge(Integer.parseInt(input.toString()), terminalCounter);
137                                                                    }
138                                                            } 
139                                                            else {
140                                                                    close();
141                                                                    throw new DataFormatException("The input graph is not a dependency graph and therefore it is not possible to add dependncy edges. ");
142                                                            }
143                                                    } else if (column.getCategory() == ColumnDescription.DEPENDENCY_EDGE_LABEL && edge != null) {
144                                                            //if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix not working for everybody
145                                                                    syntaxGraph.addLabel(edge, column.getName(), input.toString());
146                                                            //} // bugfix
147                                                    }
148                                            }
149                                            input.setLength(0);
150                                            nNewLines = 0;
151                                            i++;
152                                    } else if (c == TAB) {
153                                            throw new MaltChainedException("The input file '"+fileName+"' contains a column where the value is an empty string. Please check your input file. ");
154                                    }
155                                    if (c == NEWLINE) {
156                                            nNewLines++;
157                                            i = 0;
158                                            columns = dataFormatInstance.iterator();
159                                    }
160                            } else {
161                                    input.append((char)c);
162                            }
163                            
164                            if (nNewLines == 2 && c == NEWLINE) {
165                                    if (syntaxGraph.hasTokens()) {
166                                            sentenceCount++;
167                                    }
168                                    return true;
169                            } else if (c == -1) {
170                                    if (syntaxGraph.hasTokens()) {
171                                            sentenceCount++;
172                                    }
173                                    if (cIterations < nIterations) {
174                                            cIterations++;
175                                            reopen();
176                                            return true;
177                                    }
178                                    
179                                    return false;                                   
180                            }
181                    }
182            }
183            
184            public void readEpilog() throws MaltChainedException {
185                    
186            }
187            
188            public BufferedReader getReader() {
189                    return reader;
190            }
191    
192            public void setReader(BufferedReader reader) throws MaltChainedException {
193                    close();
194                    this.reader = reader;
195            }
196            
197            public DataFormatInstance getDataFormatInstance() {
198                    return dataFormatInstance;
199            }
200    
201            public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
202                    this.dataFormatInstance = dataFormatInstance;
203            }
204    
205            public int getSentenceCount() throws MaltChainedException {
206                    return sentenceCount;
207            }
208            
209            public void setSentenceCount(int sentenceCount) {
210                    this.sentenceCount = sentenceCount;
211            }
212            
213            public String getOptions() {
214                    return null;
215            }
216            
217            public void setOptions(String optionString) throws MaltChainedException {
218                    
219            }
220            
221            public String getFileName() {
222                    return fileName;
223            }
224    
225            public void setFileName(String fileName) {
226                    this.fileName = fileName;
227            }
228    
229            public URL getUrl() {
230                    return url;
231            }
232    
233            public void setUrl(URL url) {
234                    this.url = url;
235            }
236    
237            public String getCharsetName() {
238                    return charsetName;
239            }
240    
241            public void setCharsetName(String charsetName) {
242                    this.charsetName = charsetName;
243            }
244    
245            public int getNIterations() {
246                    return nIterations;
247            }
248    
249            public void setNIterations(int iterations) {
250                    nIterations = iterations;
251            }
252    
253            public int getIterationCounter() {
254                    return cIterations;
255            }
256    
257            public void close() throws MaltChainedException {
258                    try {
259                            if (reader != null) {
260                                    reader.close();
261                                    reader = null;
262                            }
263                    } catch (IOException e) {
264                            throw new DataFormatException("Error when closing the input file. ", e);
265                    } 
266            }
267            
268            public void clear() throws MaltChainedException {
269                    close();
270                    input.setLength(0);
271                    dataFormatInstance = null;
272                    sentenceCount = 0;
273            }
274    }