001 package org.maltparser.core.symbol.trie;
002
003 import java.io.BufferedReader;
004 import java.io.BufferedWriter;
005 import java.io.FileInputStream;
006 import java.io.FileNotFoundException;
007 import java.io.FileOutputStream;
008 import java.io.UnsupportedEncodingException;
009
010 import java.io.IOException;
011 import java.io.InputStreamReader;
012 import java.io.OutputStreamWriter;
013 import java.util.Set;
014 import java.util.regex.Pattern;
015 import java.util.regex.PatternSyntaxException;
016
017 import org.apache.log4j.Logger;
018
019 import org.maltparser.core.exception.MaltChainedException;
020 import org.maltparser.core.helper.HashMap;
021 import org.maltparser.core.symbol.SymbolException;
022 import org.maltparser.core.symbol.SymbolTable;
023 import org.maltparser.core.symbol.SymbolTableHandler;
024
025
026 /**
027
028 @author Johan Hall
029 @since 1.0
030 */
031 public class TrieSymbolTableHandler implements SymbolTableHandler {
032 private final Trie trie;
033 private final HashMap<String, TrieSymbolTable> symbolTables;
034
035 public TrieSymbolTableHandler() {
036 trie = new Trie();
037 symbolTables = new HashMap<String, TrieSymbolTable>();
038 }
039
040 public TrieSymbolTable addSymbolTable(String tableName) throws MaltChainedException {
041 TrieSymbolTable symbolTable = symbolTables.get(tableName);
042 if (symbolTable == null) {
043 symbolTable = new TrieSymbolTable(tableName, trie);
044 symbolTables.put(tableName, symbolTable);
045 }
046 return symbolTable;
047 }
048
049 public TrieSymbolTable addSymbolTable(String tableName, SymbolTable parentTable) throws MaltChainedException {
050 TrieSymbolTable symbolTable = symbolTables.get(tableName);
051 if (symbolTable == null) {
052 TrieSymbolTable trieParentTable = (TrieSymbolTable)parentTable;
053 symbolTable = new TrieSymbolTable(tableName, trie, trieParentTable.getColumnCategory(), trieParentTable.getNullValueStrategy());
054 symbolTables.put(tableName, symbolTable);
055 }
056 return symbolTable;
057 }
058
059 public TrieSymbolTable addSymbolTable(String tableName, int columnCategory, String nullValueStrategy) throws MaltChainedException {
060 TrieSymbolTable symbolTable = symbolTables.get(tableName);
061 if (symbolTable == null) {
062 symbolTable = new TrieSymbolTable(tableName, trie, columnCategory, nullValueStrategy);
063 symbolTables.put(tableName, symbolTable);
064 }
065 return symbolTable;
066 }
067
068 public TrieSymbolTable getSymbolTable(String tableName) {
069 return symbolTables.get(tableName);
070 }
071
072 public Set<String> getSymbolTableNames() {
073 return symbolTables.keySet();
074 }
075
076 public void save(OutputStreamWriter osw) throws MaltChainedException {
077 try {
078 BufferedWriter bout = new BufferedWriter(osw);
079 for (TrieSymbolTable table : symbolTables.values()) {
080 table.saveHeader(bout);
081 }
082 bout.write('\n');
083 for (TrieSymbolTable table : symbolTables.values()) {
084 table.save(bout);
085 }
086 bout.close();
087 } catch (IOException e) {
088 throw new SymbolException("Could not save the symbol tables. ", e);
089 }
090 }
091
092 public void save(String fileName, String charSet) throws MaltChainedException {
093 try {
094 save(new OutputStreamWriter(new FileOutputStream(fileName), charSet));
095 } catch (FileNotFoundException e) {
096 throw new SymbolException("The symbol table file '"+fileName+"' cannot be created. ", e);
097 } catch (UnsupportedEncodingException e) {
098 throw new SymbolException("The char set '"+charSet+"' is not supported. ", e);
099 }
100 }
101
102 public void loadHeader(BufferedReader bin) throws MaltChainedException {
103 String fileLine = "";
104 Pattern tabPattern = Pattern.compile("\t");
105 try {
106 while ((fileLine = bin.readLine()) != null) {
107 if (fileLine.length() == 0 || fileLine.charAt(0) != '\t') {
108 break;
109 }
110 String items[];
111 try {
112 items = tabPattern.split(fileLine.substring(1));
113 } catch (PatternSyntaxException e) {
114 throw new SymbolException("The header line of the symbol table '"+fileLine.substring(1)+"' could not split into atomic parts. ", e);
115 }
116 if (items.length != 3) {
117 throw new SymbolException("The header line of the symbol table '"+fileLine.substring(1)+"' must contain four columns. ");
118 }
119 addSymbolTable(items[0], Integer.parseInt(items[1]), items[2]);
120 }
121 } catch (NumberFormatException e) {
122 throw new SymbolException("The symbol table file (.sym) contains a non-integer value in the header. ", e);
123 } catch (IOException e) {
124 throw new SymbolException("Could not load the symbol table. ", e);
125 }
126 }
127
128
129 public void load(InputStreamReader isr) throws MaltChainedException {
130 try {
131 BufferedReader bin = new BufferedReader(isr);
132 String fileLine;
133 SymbolTable table = null;
134 bin.mark(2);
135 if (bin.read() == '\t') {
136 bin.reset();
137 loadHeader(bin);
138 } else {
139 bin.reset();
140 }
141 while ((fileLine = bin.readLine()) != null) {
142 if (fileLine.length() > 0) {
143 table = addSymbolTable(fileLine);
144 table.load(bin);
145 }
146 }
147 bin.close();
148 } catch (IOException e) {
149 throw new SymbolException("Could not load the symbol tables. ", e);
150 }
151 }
152
153 public void load(String fileName, String charSet) throws MaltChainedException {
154 try {
155 load(new InputStreamReader(new FileInputStream(fileName), charSet));
156
157 } catch (FileNotFoundException e) {
158 throw new SymbolException("The symbol table file '"+fileName+"' cannot be found. ", e);
159 } catch (UnsupportedEncodingException e) {
160 throw new SymbolException("The char set '"+charSet+"' is not supported. ", e);
161 }
162 }
163
164
165 public SymbolTable loadTagset(String fileName, String tableName, String charSet, int columnCategory, String nullValueStrategy) throws MaltChainedException {
166 try {
167 BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), charSet));
168 String fileLine;
169 TrieSymbolTable table = addSymbolTable(tableName, columnCategory, nullValueStrategy);
170
171 while ((fileLine = br.readLine()) != null) {
172 table.addSymbol(fileLine.trim());
173 }
174 return table;
175 } catch (FileNotFoundException e) {
176 throw new SymbolException("The tagset file '"+fileName+"' cannot be found. ", e);
177 } catch (UnsupportedEncodingException e) {
178 throw new SymbolException("The char set '"+charSet+"' is not supported. ", e);
179 } catch (IOException e) {
180 throw new SymbolException("The tagset file '"+fileName+"' cannot be loaded. ", e);
181 }
182 }
183
184 public void printSymbolTables(Logger logger) throws MaltChainedException {
185 for (TrieSymbolTable table : symbolTables.values()) {
186 table.printSymbolTable(logger);
187 }
188 }
189 }