001 package org.maltparser.core.symbol.trie;
002
003 import java.io.BufferedReader;
004 import java.io.BufferedWriter;
005 import java.io.FileInputStream;
006 import java.io.FileNotFoundException;
007 import java.io.FileOutputStream;
008 import java.io.UnsupportedEncodingException;
009
010 import java.io.IOException;
011 import java.io.InputStreamReader;
012 import java.io.OutputStreamWriter;
013 import java.util.Set;
014 import java.util.regex.Pattern;
015 import java.util.regex.PatternSyntaxException;
016
017 import org.apache.log4j.Logger;
018
019 import org.maltparser.core.exception.MaltChainedException;
020 import org.maltparser.core.helper.HashMap;
021 import org.maltparser.core.symbol.SymbolException;
022 import org.maltparser.core.symbol.SymbolTable;
023 import org.maltparser.core.symbol.SymbolTableHandler;
024
025
026 /**
027
028 @author Johan Hall
029 @since 1.0
030 */
031 public class TrieSymbolTableHandler implements SymbolTableHandler {
032 private final Trie trie;
033 private final HashMap<String, TrieSymbolTable> symbolTables;
034
035 public final static int ADD_NEW_TO_TRIE = 1;
036 public final static int ADD_NEW_TO_TMP_STORAGE = 2;
037 private final int symbolTableMode;
038
039 public TrieSymbolTableHandler(int symbolTableMode) {
040 trie = new Trie();
041 symbolTables = new HashMap<String, TrieSymbolTable>();
042 this.symbolTableMode = symbolTableMode;
043 }
044
045 public TrieSymbolTable addSymbolTable(String tableName) throws MaltChainedException {
046 TrieSymbolTable symbolTable = symbolTables.get(tableName);
047 if (symbolTable == null) {
048 symbolTable = new TrieSymbolTable(tableName, trie, symbolTableMode);
049 symbolTables.put(tableName, symbolTable);
050 }
051 return symbolTable;
052 }
053
054 public TrieSymbolTable addSymbolTable(String tableName, SymbolTable parentTable) throws MaltChainedException {
055 TrieSymbolTable symbolTable = symbolTables.get(tableName);
056 if (symbolTable == null) {
057 TrieSymbolTable trieParentTable = (TrieSymbolTable)parentTable;
058 symbolTable = new TrieSymbolTable(tableName, trie, trieParentTable.getColumnCategory(), trieParentTable.getNullValueStrategy(), symbolTableMode);
059 symbolTables.put(tableName, symbolTable);
060 }
061 return symbolTable;
062 }
063
064 public TrieSymbolTable addSymbolTable(String tableName, int columnCategory, String nullValueStrategy) throws MaltChainedException {
065 TrieSymbolTable symbolTable = symbolTables.get(tableName);
066 if (symbolTable == null) {
067 symbolTable = new TrieSymbolTable(tableName, trie, columnCategory, nullValueStrategy, symbolTableMode);
068 symbolTables.put(tableName, symbolTable);
069 }
070 return symbolTable;
071 }
072
073 public TrieSymbolTable getSymbolTable(String tableName) {
074 return symbolTables.get(tableName);
075 }
076
077 public Set<String> getSymbolTableNames() {
078 return symbolTables.keySet();
079 }
080
081 public void cleanUp() {
082 if (symbolTableMode == TrieSymbolTableHandler.ADD_NEW_TO_TMP_STORAGE) {
083 for (TrieSymbolTable table : symbolTables.values()) {
084 table.clearTmpStorage();
085 }
086 }
087 }
088
089 public void save(OutputStreamWriter osw) throws MaltChainedException {
090 try {
091 BufferedWriter bout = new BufferedWriter(osw);
092 for (TrieSymbolTable table : symbolTables.values()) {
093 table.saveHeader(bout);
094 }
095 bout.write('\n');
096 for (TrieSymbolTable table : symbolTables.values()) {
097 table.save(bout);
098 }
099 bout.close();
100 } catch (IOException e) {
101 throw new SymbolException("Could not save the symbol tables. ", e);
102 }
103 }
104
105 public void save(String fileName, String charSet) throws MaltChainedException {
106 try {
107 save(new OutputStreamWriter(new FileOutputStream(fileName), charSet));
108 } catch (FileNotFoundException e) {
109 throw new SymbolException("The symbol table file '"+fileName+"' cannot be created. ", e);
110 } catch (UnsupportedEncodingException e) {
111 throw new SymbolException("The char set '"+charSet+"' is not supported. ", e);
112 }
113 }
114
115 public void loadHeader(BufferedReader bin) throws MaltChainedException {
116 String fileLine = "";
117 Pattern tabPattern = Pattern.compile("\t");
118 try {
119 while ((fileLine = bin.readLine()) != null) {
120 if (fileLine.length() == 0 || fileLine.charAt(0) != '\t') {
121 break;
122 }
123 String items[];
124 try {
125 items = tabPattern.split(fileLine.substring(1));
126 } catch (PatternSyntaxException e) {
127 throw new SymbolException("The header line of the symbol table '"+fileLine.substring(1)+"' could not split into atomic parts. ", e);
128 }
129 if (items.length != 3) {
130 throw new SymbolException("The header line of the symbol table '"+fileLine.substring(1)+"' must contain four columns. ");
131 }
132 addSymbolTable(items[0], Integer.parseInt(items[1]), items[2]);
133 }
134 } catch (NumberFormatException e) {
135 throw new SymbolException("The symbol table file (.sym) contains a non-integer value in the header. ", e);
136 } catch (IOException e) {
137 throw new SymbolException("Could not load the symbol table. ", e);
138 }
139 }
140
141
142 public void load(InputStreamReader isr) throws MaltChainedException {
143 try {
144 BufferedReader bin = new BufferedReader(isr);
145 String fileLine;
146 SymbolTable table = null;
147 bin.mark(2);
148 if (bin.read() == '\t') {
149 bin.reset();
150 loadHeader(bin);
151 } else {
152 bin.reset();
153 }
154 while ((fileLine = bin.readLine()) != null) {
155 if (fileLine.length() > 0) {
156 table = addSymbolTable(fileLine);
157 table.load(bin);
158 }
159 }
160 bin.close();
161 } catch (IOException e) {
162 throw new SymbolException("Could not load the symbol tables. ", e);
163 }
164 }
165
166 public void load(String fileName, String charSet) throws MaltChainedException {
167 try {
168 load(new InputStreamReader(new FileInputStream(fileName), charSet));
169
170 } catch (FileNotFoundException e) {
171 throw new SymbolException("The symbol table file '"+fileName+"' cannot be found. ", e);
172 } catch (UnsupportedEncodingException e) {
173 throw new SymbolException("The char set '"+charSet+"' is not supported. ", e);
174 }
175 }
176
177
178 public SymbolTable loadTagset(String fileName, String tableName, String charSet, int columnCategory, String nullValueStrategy) throws MaltChainedException {
179 try {
180 BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), charSet));
181 String fileLine;
182 TrieSymbolTable table = addSymbolTable(tableName, columnCategory, nullValueStrategy);
183
184 while ((fileLine = br.readLine()) != null) {
185 table.addSymbol(fileLine.trim());
186 }
187 return table;
188 } catch (FileNotFoundException e) {
189 throw new SymbolException("The tagset file '"+fileName+"' cannot be found. ", e);
190 } catch (UnsupportedEncodingException e) {
191 throw new SymbolException("The char set '"+charSet+"' is not supported. ", e);
192 } catch (IOException e) {
193 throw new SymbolException("The tagset file '"+fileName+"' cannot be loaded. ", e);
194 }
195 }
196
197 public void printSymbolTables(Logger logger) throws MaltChainedException {
198 for (TrieSymbolTable table : symbolTables.values()) {
199 table.printSymbolTable(logger);
200 }
201 }
202 }