001package org.maltparser.core.syntaxgraph.reader; 002 003import java.io.BufferedReader; 004import java.io.FileInputStream; 005import java.io.FileNotFoundException; 006import java.io.IOException; 007import java.io.InputStream; 008import java.io.InputStreamReader; 009import java.io.UnsupportedEncodingException; 010import java.net.URL; 011import java.util.ArrayList; 012import java.util.Iterator; 013 014import org.maltparser.core.exception.MaltChainedException; 015import org.maltparser.core.io.dataformat.ColumnDescription; 016import org.maltparser.core.io.dataformat.DataFormatException; 017import org.maltparser.core.io.dataformat.DataFormatInstance; 018import org.maltparser.core.syntaxgraph.DependencyStructure; 019import org.maltparser.core.syntaxgraph.Element; 020import org.maltparser.core.syntaxgraph.TokenStructure; 021import org.maltparser.core.syntaxgraph.edge.Edge; 022/** 023* 024* 025* @author Johan Hall 026*/ 027public class TabReader implements SyntaxGraphReader { 028 private BufferedReader reader; 029 private int sentenceCount; 030// private final StringBuilder input; 031 private DataFormatInstance dataFormatInstance; 032 private static final String IGNORE_COLUMN_SIGN = "_"; 033// private static final char TAB = '\t'; 034// private static final char NEWLINE = '\n'; 035// private static final char CARRIAGE_RETURN = '\r'; 036 private String fileName = null; 037 private URL url = null; 038 private String charsetName; 039 private int nIterations; 040 private int cIterations; 041 private boolean closeStream = true; 042 043 public TabReader() { 044// input = new StringBuilder(); 045 nIterations = 1; 046 cIterations = 1; 047 } 048 049// private void reopen() throws MaltChainedException { 050// close(); 051// if (fileName != null) { 052// open(fileName, charsetName); 053// } else if (url != null) { 054// open(url, charsetName); 055// } else { 056// throw new DataFormatException("The input stream cannot be reopen. "); 057// } 058// } 059 060 public void open(String fileName, String charsetName) throws MaltChainedException { 061 setFileName(fileName); 062 setCharsetName(charsetName); 063 try { 064 open(new FileInputStream(fileName), charsetName); 065 } catch (FileNotFoundException e) { 066 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e); 067 } 068 } 069 070 public void open(URL url, String charsetName) throws MaltChainedException { 071 setUrl(url); 072 setCharsetName(charsetName); 073 if (url == null) { 074 throw new DataFormatException("The input file cannot be found. "); 075 } 076 try { 077 open(url.openStream(), charsetName); 078 } catch (IOException e) { 079 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e); 080 } 081 } 082 083 public void open(InputStream is, String charsetName) throws MaltChainedException { 084 try { 085 if (is == System.in) { 086 closeStream = false; 087 } 088 open(new InputStreamReader(is, charsetName)); 089 } catch (UnsupportedEncodingException e) { 090 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e); 091 } 092 } 093 094 private void open(InputStreamReader isr) throws MaltChainedException { 095 setReader(new BufferedReader(isr)); 096 setSentenceCount(0); 097 } 098 099 public void readProlog() throws MaltChainedException { 100 101 } 102 103 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException { 104 if (syntaxGraph == null || dataFormatInstance == null) { 105 return false; 106 } 107 syntaxGraph.clear(); 108 syntaxGraph.getSymbolTables().cleanUp(); 109 Element node = null; 110 Edge edge = null; 111 112 113 ArrayList<String> tokens = new ArrayList<String>(); 114 try { 115 String line; 116 while ((line = reader.readLine()) != null) { 117 if (line.trim().length() == 0) { 118 break; 119 } else { 120 tokens.add(line.trim()); 121 } 122 } 123 } catch (IOException e) { 124 close(); 125 throw new DataFormatException("Error when reading from the input file. ", e); 126 } 127 128 int terminalCounter = 0; 129 for (int i = 0; i < tokens.size(); i++) { 130 String token = tokens.get(i); 131 132 if (token.charAt(0) == '#') { 133 syntaxGraph.addComment(token, terminalCounter+1); 134 continue; 135 } 136 String[] columns = token.split("\t"); 137 if (columns[0].contains("-") || columns[0].contains(".")) { 138 syntaxGraph.addComment(token, terminalCounter+1); 139 continue; 140 } 141 terminalCounter++; 142 node = syntaxGraph.addTokenNode(terminalCounter); 143 144 Iterator<ColumnDescription> columnDescriptions = dataFormatInstance.iterator(); 145 for (int j = 0; j < columns.length; j++) { 146 ColumnDescription columnDescription = columnDescriptions.next(); 147 148 if (columnDescription.getCategory() == ColumnDescription.INPUT && node != null) { 149 syntaxGraph.addLabel(node, columnDescription.getName(), columns[j]); 150 } else if (columnDescription.getCategory() == ColumnDescription.HEAD) { 151 if (syntaxGraph instanceof DependencyStructure) { 152 if (columnDescription.getCategory() != ColumnDescription.IGNORE && !columns[j].equals(IGNORE_COLUMN_SIGN)) { 153 edge = ((DependencyStructure)syntaxGraph).addDependencyEdge(Integer.parseInt(columns[j]), terminalCounter); 154 } 155 } 156 else { 157 close(); 158 throw new DataFormatException("The input graph is not a dependency graph and therefore it is not possible to add dependncy edges. "); 159 } 160 } else if (columnDescription.getCategory() == ColumnDescription.DEPENDENCY_EDGE_LABEL && edge != null) { 161 syntaxGraph.addLabel(edge, columnDescription.getName(), columns[j]); 162 } 163 } 164 } 165 166 if (!syntaxGraph.hasTokens()) { 167 return false; 168 } 169 sentenceCount++; 170 return true; 171 } 172 173// public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException { 174// if (syntaxGraph == null || dataFormatInstance == null) { 175// return false; 176// } 177// 178// Element node = null; 179// Edge edge = null; 180// input.setLength(0); 181// int i = 0; 182// int terminalCounter = 0; 183// int nNewLines = 0; 184// syntaxGraph.clear(); 185// syntaxGraph.getSymbolTables().cleanUp(); 186// Iterator<ColumnDescription> columns = dataFormatInstance.iterator(); 187// while (true) { 188// int c; 189// 190// try { 191// c = reader.read(); 192// } catch (IOException e) { 193// close(); 194// throw new DataFormatException("Error when reading from the input file. ", e); 195// } 196// if (c == TAB || c == NEWLINE || c == CARRIAGE_RETURN || c == -1) { 197// if (input.length() != 0) { 198// if (i == 0) { 199// terminalCounter++; 200// node = syntaxGraph.addTokenNode(terminalCounter); 201// } 202// if (columns.hasNext()) { 203// ColumnDescription column = columns.next(); 204// if (column.getCategory() == ColumnDescription.INPUT && node != null) { 205// syntaxGraph.addLabel(node, column.getName(), input.toString()); 206// } else if (column.getCategory() == ColumnDescription.HEAD) { 207// if (syntaxGraph instanceof DependencyStructure) { 208// if (column.getCategory() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { 209//// if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix 210// //if (!input.toString().equals(IGNORE_COLUMN_SIGN)) { 211// edge = ((DependencyStructure)syntaxGraph).addDependencyEdge(Integer.parseInt(input.toString()), terminalCounter); 212// } 213// } 214// else { 215// close(); 216// throw new DataFormatException("The input graph is not a dependency graph and therefore it is not possible to add dependncy edges. "); 217// } 218// } else if (column.getCategory() == ColumnDescription.DEPENDENCY_EDGE_LABEL && edge != null) { 219// //if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix not working for everybody 220// syntaxGraph.addLabel(edge, column.getName(), input.toString()); 221// //} // bugfix 222// } 223// } 224// input.setLength(0); 225// nNewLines = 0; 226// i++; 227// } else if (c == TAB) { 228// throw new MaltChainedException("The input file '"+fileName+"' contains a column where the value is an empty string. Please check your input file. "); 229// } 230// if (c == NEWLINE) { 231// nNewLines++; 232// i = 0; 233// columns = dataFormatInstance.iterator(); 234// } 235// } else { 236// input.append((char)c); 237// } 238// 239// if (nNewLines == 2 && c == NEWLINE) { 240// if (syntaxGraph.hasTokens()) { 241// sentenceCount++; 242// } 243// return true; 244// } else if (c == -1) { 245// if (syntaxGraph.hasTokens()) { 246// sentenceCount++; 247// } 248// if (cIterations < nIterations) { 249// cIterations++; 250// reopen(); 251// return true; 252// } 253// 254// return false; 255// } 256// } 257// } 258 259 public void readEpilog() throws MaltChainedException { 260 261 } 262 263 public BufferedReader getReader() { 264 return reader; 265 } 266 267 public void setReader(BufferedReader reader) throws MaltChainedException { 268 close(); 269 this.reader = reader; 270 } 271 272 public DataFormatInstance getDataFormatInstance() { 273 return dataFormatInstance; 274 } 275 276 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) { 277 this.dataFormatInstance = dataFormatInstance; 278 } 279 280 public int getSentenceCount() throws MaltChainedException { 281 return sentenceCount; 282 } 283 284 public void setSentenceCount(int sentenceCount) { 285 this.sentenceCount = sentenceCount; 286 } 287 288 public String getOptions() { 289 return null; 290 } 291 292 public void setOptions(String optionString) throws MaltChainedException { 293 294 } 295 296 public String getFileName() { 297 return fileName; 298 } 299 300 public void setFileName(String fileName) { 301 this.fileName = fileName; 302 } 303 304 public URL getUrl() { 305 return url; 306 } 307 308 public void setUrl(URL url) { 309 this.url = url; 310 } 311 312 public String getCharsetName() { 313 return charsetName; 314 } 315 316 public void setCharsetName(String charsetName) { 317 this.charsetName = charsetName; 318 } 319 320 public int getNIterations() { 321 return nIterations; 322 } 323 324 public void setNIterations(int iterations) { 325 nIterations = iterations; 326 } 327 328 public int getIterationCounter() { 329 return cIterations; 330 } 331 332 public void close() throws MaltChainedException { 333 try { 334 if (reader != null) { 335 if (closeStream) { 336 reader.close(); 337 } 338 reader = null; 339 } 340 } catch (IOException e) { 341 throw new DataFormatException("Error when closing the input file. ", e); 342 } 343 } 344 345 public void clear() throws MaltChainedException { 346 close(); 347// input.setLength(0); 348 dataFormatInstance = null; 349 sentenceCount = 0; 350 } 351}