001 package org.maltparser.core.symbol.trie; 002 003 import java.io.BufferedReader; 004 import java.io.BufferedWriter; 005 import java.io.FileInputStream; 006 import java.io.FileNotFoundException; 007 import java.io.FileOutputStream; 008 import java.io.UnsupportedEncodingException; 009 010 import java.io.IOException; 011 import java.io.InputStreamReader; 012 import java.io.OutputStreamWriter; 013 import java.util.Set; 014 import java.util.regex.Pattern; 015 import java.util.regex.PatternSyntaxException; 016 017 import org.apache.log4j.Logger; 018 019 import org.maltparser.core.exception.MaltChainedException; 020 import org.maltparser.core.helper.HashMap; 021 import org.maltparser.core.symbol.SymbolException; 022 import org.maltparser.core.symbol.SymbolTable; 023 import org.maltparser.core.symbol.SymbolTableHandler; 024 025 026 /** 027 028 @author Johan Hall 029 @since 1.0 030 */ 031 public class TrieSymbolTableHandler implements SymbolTableHandler { 032 private final Trie trie; 033 private final HashMap<String, TrieSymbolTable> symbolTables; 034 035 public final static int ADD_NEW_TO_TRIE = 1; 036 public final static int ADD_NEW_TO_TMP_STORAGE = 2; 037 private final int symbolTableMode; 038 039 public TrieSymbolTableHandler(int symbolTableMode) { 040 trie = new Trie(); 041 symbolTables = new HashMap<String, TrieSymbolTable>(); 042 this.symbolTableMode = symbolTableMode; 043 } 044 045 public TrieSymbolTable addSymbolTable(String tableName) throws MaltChainedException { 046 TrieSymbolTable symbolTable = symbolTables.get(tableName); 047 if (symbolTable == null) { 048 symbolTable = new TrieSymbolTable(tableName, trie, symbolTableMode); 049 symbolTables.put(tableName, symbolTable); 050 } 051 return symbolTable; 052 } 053 054 public TrieSymbolTable addSymbolTable(String tableName, SymbolTable parentTable) throws MaltChainedException { 055 TrieSymbolTable symbolTable = symbolTables.get(tableName); 056 if (symbolTable == null) { 057 TrieSymbolTable trieParentTable = (TrieSymbolTable)parentTable; 058 symbolTable = new TrieSymbolTable(tableName, trie, trieParentTable.getColumnCategory(), trieParentTable.getNullValueStrategy(), symbolTableMode); 059 symbolTables.put(tableName, symbolTable); 060 } 061 return symbolTable; 062 } 063 064 public TrieSymbolTable addSymbolTable(String tableName, int columnCategory, String nullValueStrategy) throws MaltChainedException { 065 TrieSymbolTable symbolTable = symbolTables.get(tableName); 066 if (symbolTable == null) { 067 symbolTable = new TrieSymbolTable(tableName, trie, columnCategory, nullValueStrategy, symbolTableMode); 068 symbolTables.put(tableName, symbolTable); 069 } 070 return symbolTable; 071 } 072 073 public TrieSymbolTable getSymbolTable(String tableName) { 074 return symbolTables.get(tableName); 075 } 076 077 public Set<String> getSymbolTableNames() { 078 return symbolTables.keySet(); 079 } 080 081 public void cleanUp() { 082 if (symbolTableMode == TrieSymbolTableHandler.ADD_NEW_TO_TMP_STORAGE) { 083 for (TrieSymbolTable table : symbolTables.values()) { 084 table.clearTmpStorage(); 085 } 086 } 087 } 088 089 public void save(OutputStreamWriter osw) throws MaltChainedException { 090 try { 091 BufferedWriter bout = new BufferedWriter(osw); 092 for (TrieSymbolTable table : symbolTables.values()) { 093 table.saveHeader(bout); 094 } 095 bout.write('\n'); 096 for (TrieSymbolTable table : symbolTables.values()) { 097 table.save(bout); 098 } 099 bout.close(); 100 } catch (IOException e) { 101 throw new SymbolException("Could not save the symbol tables. ", e); 102 } 103 } 104 105 public void save(String fileName, String charSet) throws MaltChainedException { 106 try { 107 save(new OutputStreamWriter(new FileOutputStream(fileName), charSet)); 108 } catch (FileNotFoundException e) { 109 throw new SymbolException("The symbol table file '"+fileName+"' cannot be created. ", e); 110 } catch (UnsupportedEncodingException e) { 111 throw new SymbolException("The char set '"+charSet+"' is not supported. ", e); 112 } 113 } 114 115 public void loadHeader(BufferedReader bin) throws MaltChainedException { 116 String fileLine = ""; 117 Pattern tabPattern = Pattern.compile("\t"); 118 try { 119 while ((fileLine = bin.readLine()) != null) { 120 if (fileLine.length() == 0 || fileLine.charAt(0) != '\t') { 121 break; 122 } 123 String items[]; 124 try { 125 items = tabPattern.split(fileLine.substring(1)); 126 } catch (PatternSyntaxException e) { 127 throw new SymbolException("The header line of the symbol table '"+fileLine.substring(1)+"' could not split into atomic parts. ", e); 128 } 129 if (items.length != 3) { 130 throw new SymbolException("The header line of the symbol table '"+fileLine.substring(1)+"' must contain four columns. "); 131 } 132 addSymbolTable(items[0], Integer.parseInt(items[1]), items[2]); 133 } 134 } catch (NumberFormatException e) { 135 throw new SymbolException("The symbol table file (.sym) contains a non-integer value in the header. ", e); 136 } catch (IOException e) { 137 throw new SymbolException("Could not load the symbol table. ", e); 138 } 139 } 140 141 142 public void load(InputStreamReader isr) throws MaltChainedException { 143 try { 144 BufferedReader bin = new BufferedReader(isr); 145 String fileLine; 146 SymbolTable table = null; 147 bin.mark(2); 148 if (bin.read() == '\t') { 149 bin.reset(); 150 loadHeader(bin); 151 } else { 152 bin.reset(); 153 } 154 while ((fileLine = bin.readLine()) != null) { 155 if (fileLine.length() > 0) { 156 table = addSymbolTable(fileLine); 157 table.load(bin); 158 } 159 } 160 bin.close(); 161 } catch (IOException e) { 162 throw new SymbolException("Could not load the symbol tables. ", e); 163 } 164 } 165 166 public void load(String fileName, String charSet) throws MaltChainedException { 167 try { 168 load(new InputStreamReader(new FileInputStream(fileName), charSet)); 169 170 } catch (FileNotFoundException e) { 171 throw new SymbolException("The symbol table file '"+fileName+"' cannot be found. ", e); 172 } catch (UnsupportedEncodingException e) { 173 throw new SymbolException("The char set '"+charSet+"' is not supported. ", e); 174 } 175 } 176 177 178 public SymbolTable loadTagset(String fileName, String tableName, String charSet, int columnCategory, String nullValueStrategy) throws MaltChainedException { 179 try { 180 BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), charSet)); 181 String fileLine; 182 TrieSymbolTable table = addSymbolTable(tableName, columnCategory, nullValueStrategy); 183 184 while ((fileLine = br.readLine()) != null) { 185 table.addSymbol(fileLine.trim()); 186 } 187 return table; 188 } catch (FileNotFoundException e) { 189 throw new SymbolException("The tagset file '"+fileName+"' cannot be found. ", e); 190 } catch (UnsupportedEncodingException e) { 191 throw new SymbolException("The char set '"+charSet+"' is not supported. ", e); 192 } catch (IOException e) { 193 throw new SymbolException("The tagset file '"+fileName+"' cannot be loaded. ", e); 194 } 195 } 196 197 public void printSymbolTables(Logger logger) throws MaltChainedException { 198 for (TrieSymbolTable table : symbolTables.values()) { 199 table.printSymbolTable(logger); 200 } 201 } 202 }