package org.maltparser.core.syntaxgraph.reader;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Iterator;

import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.io.dataformat.ColumnDescription;
import org.maltparser.core.io.dataformat.DataFormatException;
import org.maltparser.core.io.dataformat.DataFormatInstance;
import org.maltparser.core.syntaxgraph.DependencyStructure;
import org.maltparser.core.syntaxgraph.Element;
import org.maltparser.core.syntaxgraph.TokenStructure;
import org.maltparser.core.syntaxgraph.edge.Edge;

/**
 * Reads sentences from character input in which every line holds one token,
 * the fields of a token are separated by tab characters, and an empty line
 * ends a sentence. The meaning of each field is taken, in order, from the
 * columns of the configured {@link DataFormatInstance}.
 *
 * <p>The reader can make several passes over the same input: when the end of
 * the input is reached and fewer than {@link #getNIterations()} passes have
 * been made, the file or URL it was opened from is opened again.
 *
 * @author Johan Hall
 */
public class TabReader implements SyntaxGraphReader {
    private BufferedReader reader;
    private int sentenceCount;
    /** Accumulates the characters of the field currently being read. */
    private final StringBuilder input;
    private DataFormatInstance dataFormatInstance;
    /** Field value that marks a HEAD or edge-label column as "no value". */
    private static final String IGNORE_COLUMN_SIGN = "_";
    private static final char TAB = '\t';
    private static final char NEWLINE = '\n';
    private static final char CARRIAGE_RETURN = '\r';
    /** File the reader was last opened from; used by {@link #reopen()}. */
    private String fileName = null;
    /** URL the reader was last opened from; used by {@link #reopen()}. */
    private URL url = null;
    private String charsetName;
    /** Number of passes to make over the input. */
    private int nIterations;
    /** Number of the pass currently in progress, starting at 1. */
    private int cIterations;

    /**
     * Creates a reader that makes a single pass over its input.
     */
    public TabReader() {
        input = new StringBuilder();
        nIterations = 1;
        cIterations = 1;
    }

    /**
     * Closes the current reader and opens the remembered file or, failing
     * that, the remembered URL again.
     *
     * @throws MaltChainedException if neither a file name nor a URL has been
     *         recorded, or if closing or opening fails
     */
    private void reopen() throws MaltChainedException {
        close();
        if (fileName != null) {
            open(fileName, charsetName);
        } else if (url != null) {
            open(url, charsetName);
        } else {
            throw new DataFormatException("The input stream cannot be reopen. ");
        }
    }

    /**
     * Opens the named file for reading and remembers the name and charset so
     * that the input can be reopened later.
     *
     * @param fileName    path of the file to read
     * @param charsetName name of the character encoding of the file
     * @throws MaltChainedException if the file cannot be found or the
     *         encoding is not supported
     */
    public void open(String fileName, String charsetName) throws MaltChainedException {
        setFileName(fileName);
        setCharsetName(charsetName);
        try {
            open(new FileInputStream(fileName), charsetName);
        } catch (FileNotFoundException e) {
            throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
        }
    }

    /**
     * Opens the given URL for reading and remembers the URL and charset so
     * that the input can be reopened later.
     *
     * @param url         location to read from
     * @param charsetName name of the character encoding of the content
     * @throws MaltChainedException if {@code url} is {@code null}, cannot be
     *         opened, or the encoding is not supported
     */
    public void open(URL url, String charsetName) throws MaltChainedException {
        setUrl(url);
        setCharsetName(charsetName);
        if (url == null) {
            throw new DataFormatException("The input file cannot be found. ");
        }
        try {
            open(url.openStream(), charsetName);
        } catch (IOException e) {
            throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
        }
    }

    /**
     * Wraps the stream in a reader that decodes it with the named charset and
     * starts reading from it.
     *
     * <p>If the charset is not supported the stream is closed before the
     * exception is raised, so that a stream created by one of the other
     * {@code open} overloads is not leaked.
     *
     * @param is          stream to read from
     * @param charsetName name of the character encoding of the stream
     * @throws MaltChainedException if the encoding is not supported
     */
    public void open(InputStream is, String charsetName) throws MaltChainedException {
        InputStreamReader isr;
        try {
            isr = new InputStreamReader(is, charsetName);
        } catch (UnsupportedEncodingException e) {
            closeQuietly(is);
            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
        }
        open(isr);
    }

    /**
     * Starts reading from the given reader and resets the sentence counter.
     *
     * @param isr source of characters
     * @throws MaltChainedException if closing the previously used reader fails
     */
    public void open(InputStreamReader isr) throws MaltChainedException {
        setReader(new BufferedReader(isr));
        setSentenceCount(0);
    }

    /**
     * Does nothing; this format has no prolog.
     */
    public void readProlog() throws MaltChainedException {

    }

    /**
     * Reads the next sentence from the input into {@code syntaxGraph}.
     *
     * <p>The graph is cleared first. Every non-empty field is matched against
     * the next column of the data format: INPUT columns become labels of the
     * token node, a HEAD column adds a dependency edge from the given head to
     * the token, and DEPENDENCY_EDGE_LABEL columns become labels of that edge.
     * HEAD and edge-label fields are skipped when the column type is IGNORE or
     * the field is {@code "_"}. Empty fields do not consume a column.
     *
     * @param syntaxGraph graph that receives the tokens of the sentence
     * @return {@code true} if an empty line ended the sentence, or if the end
     *         of the input was reached and the input was reopened for another
     *         pass; {@code false} if the end of the input was reached on the
     *         last pass (the graph may still hold the tokens of a final
     *         sentence), or if {@code syntaxGraph}, the data format instance
     *         or the underlying reader is missing
     * @throws MaltChainedException if reading fails, if a HEAD column is found
     *         but the graph is not a {@link DependencyStructure}, or if a HEAD
     *         field is not an integer
     */
    public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
        // Without a reader there is nothing to read; treat it like the other
        // missing prerequisites instead of failing with a NullPointerException.
        if (syntaxGraph == null || dataFormatInstance == null || reader == null) {
            return false;
        }

        Element node = null;
        Edge edge = null;
        input.setLength(0);
        int columnIndex = 0;      // index of the next field on the current line
        int terminalCounter = 0;  // number of tokens added so far
        int nNewLines = 0;        // consecutive newlines since the last field
        syntaxGraph.clear();
        Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
        while (true) {
            int c;

            try {
                c = reader.read();
            } catch (IOException e) {
                close();
                throw new DataFormatException("Error when reading from the input file. ", e);
            }
            if (c == TAB || c == NEWLINE || c == CARRIAGE_RETURN || c == -1) {
                if (input.length() != 0) {
                    final String value = input.toString();
                    if (columnIndex == 0) {
                        // First field of a line: a new token starts here.
                        terminalCounter++;
                        node = syntaxGraph.addTokenNode(terminalCounter);
                        // Forget the previous token's edge so that this token's
                        // edge labels can never be attached to it.
                        edge = null;
                    }
                    if (columns.hasNext()) {
                        final ColumnDescription column = columns.next();
                        if (column.getCategory() == ColumnDescription.INPUT && node != null) {
                            syntaxGraph.addLabel(node, column.getName(), value);
                        } else if (column.getCategory() == ColumnDescription.HEAD) {
                            if (!(syntaxGraph instanceof DependencyStructure)) {
                                close();
                                throw new DataFormatException("The input graph is not a dependency graph and therefore it is not possible to add dependncy edges. ");
                            }
                            if (column.getType() != ColumnDescription.IGNORE && !value.equals(IGNORE_COLUMN_SIGN)) {
                                edge = ((DependencyStructure)syntaxGraph).addDependencyEdge(parseHead(value), terminalCounter);
                            }
                        } else if (column.getCategory() == ColumnDescription.DEPENDENCY_EDGE_LABEL && edge != null) {
                            if (column.getType() != ColumnDescription.IGNORE && !value.equals(IGNORE_COLUMN_SIGN)) {
                                syntaxGraph.addLabel(edge, column.getName(), value);
                            }
                        }
                    }
                    input.setLength(0);
                    nNewLines = 0;
                    columnIndex++;
                }
                if (c == NEWLINE) {
                    // A new line starts again at the first column.
                    nNewLines++;
                    columnIndex = 0;
                    columns = dataFormatInstance.iterator();
                }
            } else {
                input.append((char)c);
            }

            if (nNewLines == 2 && c == NEWLINE) {
                // Two newlines with no field in between: an empty line.
                if (syntaxGraph.hasTokens()) {
                    sentenceCount++;
                }
                return true;
            } else if (c == -1) {
                if (syntaxGraph.hasTokens()) {
                    sentenceCount++;
                }
                if (cIterations < nIterations) {
                    cIterations++;
                    reopen();
                    return true;
                }

                return false;
            }
        }
    }

    /**
     * Parses the value of a HEAD field, closing the reader and reporting a
     * data format error when the value is not an integer.
     */
    private int parseHead(String value) throws MaltChainedException {
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            close();
            throw new DataFormatException("The head value '"+value+"' in the input file is not an integer. ", e);
        }
    }

    /**
     * Closes a stream that is being abandoned because of another failure.
     */
    private static void closeQuietly(InputStream is) {
        if (is == null) {
            return;
        }
        try {
            is.close();
        } catch (IOException ignored) {
            // The caller is already reporting the failure that made the
            // stream unusable; a secondary close failure adds nothing.
        }
    }

    /**
     * Does nothing; this format has no epilog.
     */
    public void readEpilog() throws MaltChainedException {

    }

    /**
     * @return the reader currently in use, or {@code null} if none is open
     */
    public BufferedReader getReader() {
        return reader;
    }

    /**
     * Closes the reader currently in use, if any, and replaces it.
     *
     * @param reader the reader to read from
     * @throws MaltChainedException if closing the previous reader fails
     */
    public void setReader(BufferedReader reader) throws MaltChainedException {
        close();
        this.reader = reader;
    }

    /**
     * @return the data format instance whose columns describe the fields
     */
    public DataFormatInstance getDataFormatInstance() {
        return dataFormatInstance;
    }

    /**
     * @param dataFormatInstance the data format instance whose columns
     *        describe the fields of each token line
     */
    public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
        this.dataFormatInstance = dataFormatInstance;
    }

    /**
     * @return the number of non-empty sentences read since the counter was
     *         last reset
     */
    public int getSentenceCount() throws MaltChainedException {
        return sentenceCount;
    }

    /**
     * @param sentenceCount the new value of the sentence counter
     */
    public void setSentenceCount(int sentenceCount) {
        this.sentenceCount = sentenceCount;
    }

    /**
     * @return always {@code null}; this reader has no options
     */
    public String getOptions() {
        return null;
    }

    /**
     * Does nothing; this reader has no options.
     */
    public void setOptions(String optionString) throws MaltChainedException {

    }

    /**
     * @return the file name used when the input is reopened, or {@code null}
     */
    public String getFileName() {
        return fileName;
    }

    /**
     * @param fileName the file name to use when the input is reopened
     */
    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    /**
     * @return the URL used when the input is reopened, or {@code null}
     */
    public URL getUrl() {
        return url;
    }

    /**
     * @param url the URL to use when the input is reopened and no file name
     *        is set
     */
    public void setUrl(URL url) {
        this.url = url;
    }

    /**
     * @return the name of the character encoding used when opening input
     */
    public String getCharsetName() {
        return charsetName;
    }

    /**
     * @param charsetName the name of the character encoding to use when the
     *        input is reopened
     */
    public void setCharsetName(String charsetName) {
        this.charsetName = charsetName;
    }

    /**
     * @return the number of passes to make over the input
     */
    public int getNIterations() {
        return nIterations;
    }

    /**
     * @param iterations the number of passes to make over the input
     */
    public void setNIterations(int iterations) {
        nIterations = iterations;
    }

    /**
     * @return the number of the pass currently in progress, starting at 1
     */
    public int getIterationCounter() {
        return cIterations;
    }

    /**
     * Closes the reader currently in use, if any, and forgets it.
     *
     * @throws MaltChainedException if closing the reader fails
     */
    public void close() throws MaltChainedException {
        try {
            if (reader != null) {
                reader.close();
                reader = null;
            }
        } catch (IOException e) {
            throw new DataFormatException("Error when closing the input file. ", e);
        }
    }

    /**
     * Closes the reader, empties the field buffer, drops the data format
     * instance and resets the sentence counter.
     *
     * @throws MaltChainedException if closing the reader fails
     */
    public void clear() throws MaltChainedException {
        close();
        input.setLength(0);
        dataFormatInstance = null;
        sentenceCount = 0;
    }
}