001 package org.maltparser.core.syntaxgraph.reader; 002 003 import java.io.BufferedReader; 004 import java.io.FileInputStream; 005 import java.io.FileNotFoundException; 006 import java.io.IOException; 007 import java.io.InputStream; 008 import java.io.InputStreamReader; 009 import java.io.UnsupportedEncodingException; 010 import java.net.URL; 011 import java.util.Iterator; 012 013 import org.maltparser.core.exception.MaltChainedException; 014 import org.maltparser.core.io.dataformat.ColumnDescription; 015 import org.maltparser.core.io.dataformat.DataFormatException; 016 import org.maltparser.core.io.dataformat.DataFormatInstance; 017 import org.maltparser.core.syntaxgraph.DependencyStructure; 018 import org.maltparser.core.syntaxgraph.Element; 019 import org.maltparser.core.syntaxgraph.TokenStructure; 020 import org.maltparser.core.syntaxgraph.edge.Edge; 021 /** 022 * 023 * 024 * @author Johan Hall 025 */ 026 public class TabReader implements SyntaxGraphReader { 027 private BufferedReader reader; 028 private int sentenceCount; 029 private final StringBuilder input; 030 private DataFormatInstance dataFormatInstance; 031 private static final String IGNORE_COLUMN_SIGN = "_"; 032 private static final char TAB = '\t'; 033 private static final char NEWLINE = '\n'; 034 private static final char CARRIAGE_RETURN = '\r'; 035 private String fileName = null; 036 private URL url = null; 037 private String charsetName; 038 private int nIterations; 039 private int cIterations; 040 041 042 public TabReader() { 043 input = new StringBuilder(); 044 nIterations = 1; 045 cIterations = 1; 046 } 047 048 private void reopen() throws MaltChainedException { 049 close(); 050 if (fileName != null) { 051 open(fileName, charsetName); 052 } else if (url != null) { 053 open(url, charsetName); 054 } else { 055 throw new DataFormatException("The input stream cannot be reopen. "); 056 } 057 } 058 059 public void open(String fileName, String charsetName) throws MaltChainedException { 060 setFileName(fileName); 061 setCharsetName(charsetName); 062 try { 063 open(new FileInputStream(fileName), charsetName); 064 } catch (FileNotFoundException e) { 065 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e); 066 } 067 } 068 069 public void open(URL url, String charsetName) throws MaltChainedException { 070 setUrl(url); 071 setCharsetName(charsetName); 072 if (url == null) { 073 throw new DataFormatException("The input file cannot be found. "); 074 } 075 try { 076 open(url.openStream(), charsetName); 077 } catch (IOException e) { 078 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e); 079 } 080 } 081 082 public void open(InputStream is, String charsetName) throws MaltChainedException { 083 try { 084 open(new InputStreamReader(is, charsetName)); 085 } catch (UnsupportedEncodingException e) { 086 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e); 087 } 088 } 089 090 public void open(InputStreamReader isr) throws MaltChainedException { 091 setReader(new BufferedReader(isr)); 092 setSentenceCount(0); 093 } 094 095 public void readProlog() throws MaltChainedException { 096 097 } 098 099 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException { 100 if (syntaxGraph == null || dataFormatInstance == null) { 101 return false; 102 } 103 104 Element node = null; 105 Edge edge = null; 106 input.setLength(0); 107 int i = 0; 108 int terminalCounter = 0; 109 int nNewLines = 0; 110 syntaxGraph.clear(); 111 Iterator<ColumnDescription> columns = dataFormatInstance.iterator(); 112 while (true) { 113 int c; 114 115 try { 116 c = reader.read(); 117 } catch (IOException e) { 118 close(); 119 throw new DataFormatException("Error when reading from the input file. ", e); 120 } 121 if (c == TAB || c == NEWLINE || c == CARRIAGE_RETURN || c == -1) { 122 if (input.length() != 0) { 123 if (i == 0) { 124 terminalCounter++; 125 node = syntaxGraph.addTokenNode(terminalCounter); 126 } 127 ColumnDescription column = null; 128 if (columns.hasNext()) { 129 column = columns.next(); 130 if (column.getCategory() == ColumnDescription.INPUT && node != null) { 131 syntaxGraph.addLabel(node, column.getName(), input.toString()); 132 } else if (column.getCategory() == ColumnDescription.HEAD) { 133 if (syntaxGraph instanceof DependencyStructure) { 134 if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix 135 //if (!input.toString().equals(IGNORE_COLUMN_SIGN)) { 136 edge = ((DependencyStructure)syntaxGraph).addDependencyEdge(Integer.parseInt(input.toString()), terminalCounter); 137 } 138 } 139 else { 140 close(); 141 throw new DataFormatException("The input graph is not a dependency graph and therefore it is not possible to add dependncy edges. "); 142 } 143 } else if (column.getCategory() == ColumnDescription.DEPENDENCY_EDGE_LABEL && edge != null) { 144 //if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix not working for everybody 145 syntaxGraph.addLabel(edge, column.getName(), input.toString()); 146 //} // bugfix 147 } 148 } 149 input.setLength(0); 150 nNewLines = 0; 151 i++; 152 } else if (c == TAB) { 153 throw new MaltChainedException("The input file '"+fileName+"' contains a column where the value is an empty string. Please check your input file. "); 154 } 155 if (c == NEWLINE) { 156 nNewLines++; 157 i = 0; 158 columns = dataFormatInstance.iterator(); 159 } 160 } else { 161 input.append((char)c); 162 } 163 164 if (nNewLines == 2 && c == NEWLINE) { 165 if (syntaxGraph.hasTokens()) { 166 sentenceCount++; 167 } 168 return true; 169 } else if (c == -1) { 170 if (syntaxGraph.hasTokens()) { 171 sentenceCount++; 172 } 173 if (cIterations < nIterations) { 174 cIterations++; 175 reopen(); 176 return true; 177 } 178 179 return false; 180 } 181 } 182 } 183 184 public void readEpilog() throws MaltChainedException { 185 186 } 187 188 public BufferedReader getReader() { 189 return reader; 190 } 191 192 public void setReader(BufferedReader reader) throws MaltChainedException { 193 close(); 194 this.reader = reader; 195 } 196 197 public DataFormatInstance getDataFormatInstance() { 198 return dataFormatInstance; 199 } 200 201 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) { 202 this.dataFormatInstance = dataFormatInstance; 203 } 204 205 public int getSentenceCount() throws MaltChainedException { 206 return sentenceCount; 207 } 208 209 public void setSentenceCount(int sentenceCount) { 210 this.sentenceCount = sentenceCount; 211 } 212 213 public String getOptions() { 214 return null; 215 } 216 217 public void setOptions(String optionString) throws MaltChainedException { 218 219 } 220 221 public String getFileName() { 222 return fileName; 223 } 224 225 public void setFileName(String fileName) { 226 this.fileName = fileName; 227 } 228 229 public URL getUrl() { 230 return url; 231 } 232 233 public void setUrl(URL url) { 234 this.url = url; 235 } 236 237 public String getCharsetName() { 238 return charsetName; 239 } 240 241 public void setCharsetName(String charsetName) { 242 this.charsetName = charsetName; 243 } 244 245 public int getNIterations() { 246 return nIterations; 247 } 248 249 public void setNIterations(int iterations) { 250 nIterations = iterations; 251 } 252 253 public int getIterationCounter() { 254 return cIterations; 255 } 256 257 public void close() throws MaltChainedException { 258 try { 259 if (reader != null) { 260 reader.close(); 261 reader = null; 262 } 263 } catch (IOException e) { 264 throw new DataFormatException("Error when closing the input file. ", e); 265 } 266 } 267 268 public void clear() throws MaltChainedException { 269 close(); 270 input.setLength(0); 271 dataFormatInstance = null; 272 sentenceCount = 0; 273 } 274 }