001package org.maltparser.core.syntaxgraph.reader;
002
003import java.io.BufferedReader;
004import java.io.FileInputStream;
005import java.io.FileNotFoundException;
006import java.io.IOException;
007import java.io.InputStream;
008import java.io.InputStreamReader;
009import java.io.UnsupportedEncodingException;
010import java.net.URL;
011import java.util.Iterator;
012
013import org.maltparser.core.exception.MaltChainedException;
014import org.maltparser.core.io.dataformat.ColumnDescription;
015import org.maltparser.core.io.dataformat.DataFormatException;
016import org.maltparser.core.io.dataformat.DataFormatInstance;
017import org.maltparser.core.syntaxgraph.DependencyStructure;
018import org.maltparser.core.syntaxgraph.Element;
019import org.maltparser.core.syntaxgraph.TokenStructure;
020import org.maltparser.core.syntaxgraph.edge.Edge;
021/**
022*
023*
024* @author Johan Hall
025*/
026public class TabReader implements SyntaxGraphReader {
027        private BufferedReader reader;
028        private int sentenceCount;
029        private final StringBuilder input;
030        private DataFormatInstance dataFormatInstance;
031        private static final String IGNORE_COLUMN_SIGN = "_";
032        private static final char TAB = '\t';
033        private static final char NEWLINE = '\n';
034        private static final char CARRIAGE_RETURN = '\r';
035        private String fileName = null;
036        private URL url = null;
037        private String charsetName;
038        private int nIterations;
039        private int cIterations;
040        private boolean closeStream = true;
041        
042        public TabReader() { 
043                input = new StringBuilder();
044                nIterations = 1;
045                cIterations = 1;
046        }
047        
048        private void reopen() throws MaltChainedException {
049                close();
050                if (fileName != null) {
051                        open(fileName, charsetName);
052                } else if (url != null) {
053                        open(url, charsetName);
054                } else {
055                        throw new DataFormatException("The input stream cannot be reopen. ");
056                }
057        }
058        
059        public void open(String fileName, String charsetName) throws MaltChainedException {
060                setFileName(fileName);
061                setCharsetName(charsetName);
062                try {
063                        open(new FileInputStream(fileName), charsetName);
064                } catch (FileNotFoundException e) {
065                        throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
066                }
067        }
068        
069        public void open(URL url, String charsetName) throws MaltChainedException {
070                setUrl(url);
071                setCharsetName(charsetName);
072                if (url == null) {
073                        throw new DataFormatException("The input file cannot be found. ");
074                }
075                try {
076                        open(url.openStream(), charsetName);
077                } catch (IOException e) {
078                        throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
079                }
080        }
081        
082        public void open(InputStream is, String charsetName) throws MaltChainedException {
083                try {
084                        if (is == System.in) {
085                                closeStream = false;
086                        }
087                        open(new InputStreamReader(is, charsetName));
088                } catch (UnsupportedEncodingException e) {
089                        throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
090                }
091        }
092        
093        private void open(InputStreamReader isr) throws MaltChainedException {
094                setReader(new BufferedReader(isr));
095                setSentenceCount(0);
096        }
097        
098        public void readProlog() throws MaltChainedException {
099                
100        }
101        
102        public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
103                if (syntaxGraph == null || dataFormatInstance == null) {
104                        return false;
105                }
106                
107                Element node = null;
108                Edge edge = null;
109                input.setLength(0);
110                int i = 0;
111                int terminalCounter = 0;
112                int nNewLines = 0;
113                syntaxGraph.clear();
114                syntaxGraph.getSymbolTables().cleanUp();
115                Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
116                while (true) {
117                        int c;
118
119                        try {
120                                c = reader.read();
121                        } catch (IOException e) {
122                                close();
123                                throw new DataFormatException("Error when reading from the input file. ", e);
124                        }
125                        if (c == TAB || c == NEWLINE || c == CARRIAGE_RETURN || c == -1) {
126                                if (input.length() != 0) {                                      
127                                        if (i == 0) {
128                                                terminalCounter++;
129                                                node = syntaxGraph.addTokenNode(terminalCounter);
130                                        }
131                                        if (columns.hasNext()) {
132                                                ColumnDescription column = columns.next();
133                                                if (column.getCategory() == ColumnDescription.INPUT && node != null) {
134                                                        syntaxGraph.addLabel(node, column.getName(), input.toString());
135                                                } else if (column.getCategory() == ColumnDescription.HEAD) {
136                                                        if (syntaxGraph instanceof DependencyStructure) {
137                                                                if (column.getCategory() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) {
138//                                                              if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix
139                                                                //if (!input.toString().equals(IGNORE_COLUMN_SIGN)) {
140                                                                        edge = ((DependencyStructure)syntaxGraph).addDependencyEdge(Integer.parseInt(input.toString()), terminalCounter);
141                                                                }
142                                                        } 
143                                                        else {
144                                                                close();
145                                                                throw new DataFormatException("The input graph is not a dependency graph and therefore it is not possible to add dependncy edges. ");
146                                                        }
147                                                } else if (column.getCategory() == ColumnDescription.DEPENDENCY_EDGE_LABEL && edge != null) {
148                                                        //if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix not working for everybody
149                                                                syntaxGraph.addLabel(edge, column.getName(), input.toString());
150                                                        //} // bugfix
151                                                }
152                                        }
153                                        input.setLength(0);
154                                        nNewLines = 0;
155                                        i++;
156                                } else if (c == TAB) {
157                                        throw new MaltChainedException("The input file '"+fileName+"' contains a column where the value is an empty string. Please check your input file. ");
158                                }
159                                if (c == NEWLINE) {
160                                        nNewLines++;
161                                        i = 0;
162                                        columns = dataFormatInstance.iterator();
163                                }
164                        } else {
165                                input.append((char)c);
166                        }
167                        
168                        if (nNewLines == 2 && c == NEWLINE) {
169                                if (syntaxGraph.hasTokens()) {
170                                        sentenceCount++;
171                                }
172                                return true;
173                        } else if (c == -1) {
174                                if (syntaxGraph.hasTokens()) {
175                                        sentenceCount++;
176                                }
177                                if (cIterations < nIterations) {
178                                        cIterations++;
179                                        reopen();
180                                        return true;
181                                }
182                                
183                                return false;                                   
184                        }
185                }
186        }
187        
188        public void readEpilog() throws MaltChainedException {
189                
190        }
191        
192        public BufferedReader getReader() {
193                return reader;
194        }
195
196        public void setReader(BufferedReader reader) throws MaltChainedException {
197                close();
198                this.reader = reader;
199        }
200        
201        public DataFormatInstance getDataFormatInstance() {
202                return dataFormatInstance;
203        }
204
205        public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
206                this.dataFormatInstance = dataFormatInstance;
207        }
208
209        public int getSentenceCount() throws MaltChainedException {
210                return sentenceCount;
211        }
212        
213        public void setSentenceCount(int sentenceCount) {
214                this.sentenceCount = sentenceCount;
215        }
216        
217        public String getOptions() {
218                return null;
219        }
220        
221        public void setOptions(String optionString) throws MaltChainedException {
222                
223        }
224        
225        public String getFileName() {
226                return fileName;
227        }
228
229        public void setFileName(String fileName) {
230                this.fileName = fileName;
231        }
232
233        public URL getUrl() {
234                return url;
235        }
236
237        public void setUrl(URL url) {
238                this.url = url;
239        }
240
241        public String getCharsetName() {
242                return charsetName;
243        }
244
245        public void setCharsetName(String charsetName) {
246                this.charsetName = charsetName;
247        }
248
249        public int getNIterations() {
250                return nIterations;
251        }
252
253        public void setNIterations(int iterations) {
254                nIterations = iterations;
255        }
256
257        public int getIterationCounter() {
258                return cIterations;
259        }
260
261        public void close() throws MaltChainedException {
262                try {
263                        if (reader != null) {
264                                if (closeStream) {
265                                        reader.close();
266                                }
267                                reader = null;
268                        }
269                } catch (IOException e) {
270                        throw new DataFormatException("Error when closing the input file. ", e);
271                } 
272        }
273        
274        public void clear() throws MaltChainedException {
275                close();
276                input.setLength(0);
277                dataFormatInstance = null;
278                sentenceCount = 0;
279        }
280}