001 package org.maltparser.core.syntaxgraph.reader; 002 003 import java.io.BufferedReader; 004 import java.io.FileInputStream; 005 import java.io.FileNotFoundException; 006 import java.io.IOException; 007 import java.io.InputStream; 008 import java.io.InputStreamReader; 009 import java.io.UnsupportedEncodingException; 010 import java.net.URL; 011 import java.util.Iterator; 012 013 import org.maltparser.core.exception.MaltChainedException; 014 import org.maltparser.core.io.dataformat.ColumnDescription; 015 import org.maltparser.core.io.dataformat.DataFormatException; 016 import org.maltparser.core.io.dataformat.DataFormatInstance; 017 import org.maltparser.core.syntaxgraph.DependencyStructure; 018 import org.maltparser.core.syntaxgraph.Element; 019 import org.maltparser.core.syntaxgraph.TokenStructure; 020 import org.maltparser.core.syntaxgraph.edge.Edge; 021 /** 022 * 023 * 024 * @author Johan Hall 025 */ 026 public class TabReader implements SyntaxGraphReader { 027 private BufferedReader reader; 028 private int sentenceCount; 029 private final StringBuilder input; 030 private DataFormatInstance dataFormatInstance; 031 private static final String IGNORE_COLUMN_SIGN = "_"; 032 private static final char TAB = '\t'; 033 private static final char NEWLINE = '\n'; 034 private static final char CARRIAGE_RETURN = '\r'; 035 036 037 public TabReader() { 038 input = new StringBuilder(); 039 } 040 041 public void open(String fileName, String charsetName) throws MaltChainedException { 042 try { 043 open(new FileInputStream(fileName), charsetName); 044 }catch (FileNotFoundException e) { 045 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e); 046 } 047 } 048 049 public void open(URL url, String charsetName) throws MaltChainedException { 050 try { 051 open(url.openStream(), charsetName); 052 } catch (IOException e) { 053 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e); 054 } 055 } 056 057 public void open(InputStream is, String charsetName) throws MaltChainedException { 058 try { 059 open(new InputStreamReader(is, charsetName)); 060 } catch (UnsupportedEncodingException e) { 061 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e); 062 } 063 } 064 065 public void open(InputStreamReader isr) throws MaltChainedException { 066 setReader(new BufferedReader(isr)); 067 setSentenceCount(0); 068 } 069 070 public void readProlog() throws MaltChainedException { 071 072 } 073 074 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException { 075 if (syntaxGraph == null || dataFormatInstance == null) { 076 return false; 077 } 078 079 Element node = null; 080 Edge edge = null; 081 input.setLength(0); 082 int i = 0; 083 int terminalCounter = 0; 084 int nNewLines = 0; 085 086 syntaxGraph.clear(); 087 Iterator<ColumnDescription> columns = dataFormatInstance.iterator(); 088 while (true) { 089 int c; 090 091 try { 092 c = reader.read(); 093 } catch (IOException e) { 094 close(); 095 throw new DataFormatException("Error when reading from the input file. ", e); 096 } 097 if (c == TAB || c == NEWLINE || c == CARRIAGE_RETURN || c == -1) { 098 if (input.length() != 0) { 099 if (i == 0) { 100 terminalCounter++; 101 node = syntaxGraph.addTokenNode(terminalCounter); 102 } 103 ColumnDescription column = null; 104 if (columns.hasNext()) { 105 column = columns.next(); 106 if (column.getCategory() == ColumnDescription.INPUT && node != null) { 107 syntaxGraph.addLabel(node, column.getName(), input.toString()); 108 } else if (column.getCategory() == ColumnDescription.HEAD) { 109 if (syntaxGraph instanceof DependencyStructure) { 110 if (!input.toString().equals(IGNORE_COLUMN_SIGN)) { 111 edge = ((DependencyStructure)syntaxGraph).addDependencyEdge(Integer.parseInt(input.toString()), terminalCounter); 112 } 113 } 114 else { 115 close(); 116 throw new DataFormatException("The input graph is not a dependency graph and therefore it is not possible to add dependncy edges. "); 117 } 118 } else if (column.getCategory() == ColumnDescription.DEPENDENCY_EDGE_LABEL && edge != null) { 119 syntaxGraph.addLabel(edge, column.getName(), input.toString()); 120 } 121 } 122 input.setLength(0); 123 nNewLines = 0; 124 i++; 125 } 126 if (c == NEWLINE) { 127 nNewLines++; 128 i = 0; 129 columns = dataFormatInstance.iterator(); 130 } 131 } else { 132 input.append((char)c); 133 } 134 135 if (nNewLines == 2 && c == NEWLINE) { 136 if (syntaxGraph.hasTokens()) { 137 sentenceCount++; 138 } 139 return true; 140 } else if (c == -1) { 141 if (syntaxGraph.hasTokens()) { 142 sentenceCount++; 143 } 144 return false; 145 } 146 } 147 } 148 149 public void readEpilog() throws MaltChainedException { 150 151 } 152 153 public BufferedReader getReader() { 154 return reader; 155 } 156 157 public void setReader(BufferedReader reader) throws MaltChainedException { 158 close(); 159 this.reader = reader; 160 } 161 162 public DataFormatInstance getDataFormatInstance() { 163 return dataFormatInstance; 164 } 165 166 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) { 167 this.dataFormatInstance = dataFormatInstance; 168 } 169 170 public int getSentenceCount() throws MaltChainedException { 171 return sentenceCount; 172 } 173 174 public void setSentenceCount(int sentenceCount) { 175 this.sentenceCount = sentenceCount; 176 } 177 178 public String getOptions() { 179 return null; 180 } 181 182 public void setOptions(String optionString) throws MaltChainedException { 183 184 } 185 186 public void close() throws MaltChainedException { 187 try { 188 if (reader != null) { 189 reader.close(); 190 reader = null; 191 } 192 } catch (IOException e) { 193 throw new DataFormatException("Error when closing the input file. ", e); 194 } 195 } 196 197 public void clear() throws MaltChainedException { 198 close(); 199 input.setLength(0); 200 dataFormatInstance = null; 201 sentenceCount = 0; 202 } 203 }