001 package org.maltparser.core.syntaxgraph.reader; 002 003 import java.io.BufferedReader; 004 import java.io.FileInputStream; 005 import java.io.FileNotFoundException; 006 import java.io.IOException; 007 import java.io.InputStream; 008 import java.io.InputStreamReader; 009 import java.io.UnsupportedEncodingException; 010 import java.net.URL; 011 import java.util.Iterator; 012 013 import org.maltparser.core.exception.MaltChainedException; 014 import org.maltparser.core.io.dataformat.ColumnDescription; 015 import org.maltparser.core.io.dataformat.DataFormatException; 016 import org.maltparser.core.io.dataformat.DataFormatInstance; 017 import org.maltparser.core.syntaxgraph.DependencyStructure; 018 import org.maltparser.core.syntaxgraph.Element; 019 import org.maltparser.core.syntaxgraph.TokenStructure; 020 import org.maltparser.core.syntaxgraph.edge.Edge; 021 /** 022 * 023 * 024 * @author Johan Hall 025 */ 026 public class TabReader implements SyntaxGraphReader { 027 private BufferedReader reader; 028 private int sentenceCount; 029 private final StringBuilder input; 030 private DataFormatInstance dataFormatInstance; 031 private static final String IGNORE_COLUMN_SIGN = "_"; 032 private static final char TAB = '\t'; 033 private static final char NEWLINE = '\n'; 034 private static final char CARRIAGE_RETURN = '\r'; 035 private String fileName = null; 036 private URL url = null; 037 private String charsetName; 038 private int nIterations; 039 private int cIterations; 040 private boolean closeStream = true; 041 042 public TabReader() { 043 input = new StringBuilder(); 044 nIterations = 1; 045 cIterations = 1; 046 } 047 048 private void reopen() throws MaltChainedException { 049 close(); 050 if (fileName != null) { 051 open(fileName, charsetName); 052 } else if (url != null) { 053 open(url, charsetName); 054 } else { 055 throw new DataFormatException("The input stream cannot be reopen. "); 056 } 057 } 058 059 public void open(String fileName, String charsetName) throws MaltChainedException { 060 setFileName(fileName); 061 setCharsetName(charsetName); 062 try { 063 open(new FileInputStream(fileName), charsetName); 064 } catch (FileNotFoundException e) { 065 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e); 066 } 067 } 068 069 public void open(URL url, String charsetName) throws MaltChainedException { 070 setUrl(url); 071 setCharsetName(charsetName); 072 if (url == null) { 073 throw new DataFormatException("The input file cannot be found. "); 074 } 075 try { 076 open(url.openStream(), charsetName); 077 } catch (IOException e) { 078 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e); 079 } 080 } 081 082 public void open(InputStream is, String charsetName) throws MaltChainedException { 083 try { 084 if (is == System.in) { 085 closeStream = false; 086 } 087 open(new InputStreamReader(is, charsetName)); 088 } catch (UnsupportedEncodingException e) { 089 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e); 090 } 091 } 092 093 private void open(InputStreamReader isr) throws MaltChainedException { 094 setReader(new BufferedReader(isr)); 095 setSentenceCount(0); 096 } 097 098 public void readProlog() throws MaltChainedException { 099 100 } 101 102 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException { 103 if (syntaxGraph == null || dataFormatInstance == null) { 104 return false; 105 } 106 107 Element node = null; 108 Edge edge = null; 109 input.setLength(0); 110 int i = 0; 111 int terminalCounter = 0; 112 int nNewLines = 0; 113 syntaxGraph.clear(); 114 syntaxGraph.getSymbolTables().cleanUp(); 115 Iterator<ColumnDescription> columns = dataFormatInstance.iterator(); 116 while (true) { 117 int c; 118 119 try { 120 c = reader.read(); 121 } catch (IOException e) { 122 close(); 123 throw new DataFormatException("Error when reading from the input file. ", e); 124 } 125 if (c == TAB || c == NEWLINE || c == CARRIAGE_RETURN || c == -1) { 126 if (input.length() != 0) { 127 if (i == 0) { 128 terminalCounter++; 129 node = syntaxGraph.addTokenNode(terminalCounter); 130 } 131 if (columns.hasNext()) { 132 ColumnDescription column = columns.next(); 133 if (column.getCategory() == ColumnDescription.INPUT && node != null) { 134 syntaxGraph.addLabel(node, column.getName(), input.toString()); 135 } else if (column.getCategory() == ColumnDescription.HEAD) { 136 if (syntaxGraph instanceof DependencyStructure) { 137 if (column.getCategory() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { 138 // if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix 139 //if (!input.toString().equals(IGNORE_COLUMN_SIGN)) { 140 edge = ((DependencyStructure)syntaxGraph).addDependencyEdge(Integer.parseInt(input.toString()), terminalCounter); 141 } 142 } 143 else { 144 close(); 145 throw new DataFormatException("The input graph is not a dependency graph and therefore it is not possible to add dependncy edges. "); 146 } 147 } else if (column.getCategory() == ColumnDescription.DEPENDENCY_EDGE_LABEL && edge != null) { 148 //if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix not working for everybody 149 syntaxGraph.addLabel(edge, column.getName(), input.toString()); 150 //} // bugfix 151 } 152 } 153 input.setLength(0); 154 nNewLines = 0; 155 i++; 156 } else if (c == TAB) { 157 throw new MaltChainedException("The input file '"+fileName+"' contains a column where the value is an empty string. Please check your input file. "); 158 } 159 if (c == NEWLINE) { 160 nNewLines++; 161 i = 0; 162 columns = dataFormatInstance.iterator(); 163 } 164 } else { 165 input.append((char)c); 166 } 167 168 if (nNewLines == 2 && c == NEWLINE) { 169 if (syntaxGraph.hasTokens()) { 170 sentenceCount++; 171 } 172 return true; 173 } else if (c == -1) { 174 if (syntaxGraph.hasTokens()) { 175 sentenceCount++; 176 } 177 if (cIterations < nIterations) { 178 cIterations++; 179 reopen(); 180 return true; 181 } 182 183 return false; 184 } 185 } 186 } 187 188 public void readEpilog() throws MaltChainedException { 189 190 } 191 192 public BufferedReader getReader() { 193 return reader; 194 } 195 196 public void setReader(BufferedReader reader) throws MaltChainedException { 197 close(); 198 this.reader = reader; 199 } 200 201 public DataFormatInstance getDataFormatInstance() { 202 return dataFormatInstance; 203 } 204 205 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) { 206 this.dataFormatInstance = dataFormatInstance; 207 } 208 209 public int getSentenceCount() throws MaltChainedException { 210 return sentenceCount; 211 } 212 213 public void setSentenceCount(int sentenceCount) { 214 this.sentenceCount = sentenceCount; 215 } 216 217 public String getOptions() { 218 return null; 219 } 220 221 public void setOptions(String optionString) throws MaltChainedException { 222 223 } 224 225 public String getFileName() { 226 return fileName; 227 } 228 229 public void setFileName(String fileName) { 230 this.fileName = fileName; 231 } 232 233 public URL getUrl() { 234 return url; 235 } 236 237 public void setUrl(URL url) { 238 this.url = url; 239 } 240 241 public String getCharsetName() { 242 return charsetName; 243 } 244 245 public void setCharsetName(String charsetName) { 246 this.charsetName = charsetName; 247 } 248 249 public int getNIterations() { 250 return nIterations; 251 } 252 253 public void setNIterations(int iterations) { 254 nIterations = iterations; 255 } 256 257 public int getIterationCounter() { 258 return cIterations; 259 } 260 261 public void close() throws MaltChainedException { 262 try { 263 if (reader != null) { 264 if (closeStream) { 265 reader.close(); 266 } 267 reader = null; 268 } 269 } catch (IOException e) { 270 throw new DataFormatException("Error when closing the input file. ", e); 271 } 272 } 273 274 public void clear() throws MaltChainedException { 275 close(); 276 input.setLength(0); 277 dataFormatInstance = null; 278 sentenceCount = 0; 279 } 280 }