001    package org.maltparser.core.symbol.trie;
002    
003    import java.io.BufferedReader;
004    import java.io.BufferedWriter;
005    import java.io.FileInputStream;
006    import java.io.FileNotFoundException;
007    import java.io.FileOutputStream;
008    import java.io.UnsupportedEncodingException;
009    
010    import java.io.IOException;
011    import java.io.InputStreamReader;
012    import java.io.OutputStreamWriter;
013    import java.util.Set;
014    import java.util.regex.Pattern;
015    import java.util.regex.PatternSyntaxException;
016    
017    import org.apache.log4j.Logger;
018    
019    import org.maltparser.core.exception.MaltChainedException;
020    import org.maltparser.core.helper.HashMap;
021    import org.maltparser.core.symbol.SymbolException;
022    import org.maltparser.core.symbol.SymbolTable;
023    import org.maltparser.core.symbol.SymbolTableHandler;
024    
025    
026    /**
027    
028    @author Johan Hall
029    @since 1.0
030    */
031    public class TrieSymbolTableHandler implements SymbolTableHandler {
032            private final Trie trie;
033            private final HashMap<String, TrieSymbolTable> symbolTables;
034            
035            public final static int ADD_NEW_TO_TRIE = 1;
036            public final static int ADD_NEW_TO_TMP_STORAGE = 2;
037            private final int symbolTableMode;
038    
039            public TrieSymbolTableHandler(int symbolTableMode) {
040                    trie = new Trie();
041                    symbolTables = new HashMap<String, TrieSymbolTable>();
042                    this.symbolTableMode = symbolTableMode;
043            }
044    
045            public TrieSymbolTable addSymbolTable(String tableName) throws MaltChainedException {
046                    TrieSymbolTable symbolTable = symbolTables.get(tableName);
047                    if (symbolTable == null) {
048                            symbolTable = new TrieSymbolTable(tableName, trie, symbolTableMode);
049                            symbolTables.put(tableName, symbolTable);
050                    }
051                    return symbolTable;
052            }
053            
054            public TrieSymbolTable addSymbolTable(String tableName, SymbolTable parentTable) throws MaltChainedException {
055                    TrieSymbolTable symbolTable = symbolTables.get(tableName);
056                    if (symbolTable == null) {
057                            TrieSymbolTable trieParentTable = (TrieSymbolTable)parentTable;
058                            symbolTable = new TrieSymbolTable(tableName, trie, trieParentTable.getColumnCategory(), trieParentTable.getNullValueStrategy(), symbolTableMode);
059                            symbolTables.put(tableName, symbolTable);
060                    }
061                    return symbolTable;
062            }
063            
064            public TrieSymbolTable addSymbolTable(String tableName, int columnCategory, String nullValueStrategy) throws MaltChainedException {
065                    TrieSymbolTable symbolTable = symbolTables.get(tableName);
066                    if (symbolTable == null) {
067                            symbolTable = new TrieSymbolTable(tableName, trie, columnCategory, nullValueStrategy, symbolTableMode);
068                            symbolTables.put(tableName, symbolTable);
069                    }
070                    return symbolTable;
071            }
072            
073            public TrieSymbolTable getSymbolTable(String tableName) {
074                    return symbolTables.get(tableName);
075            }
076            
077            public Set<String> getSymbolTableNames() {
078                    return symbolTables.keySet();
079            }
080            
081            public void cleanUp() {
082                    if (symbolTableMode == TrieSymbolTableHandler.ADD_NEW_TO_TMP_STORAGE) {
083                            for (TrieSymbolTable table : symbolTables.values()) {
084                                    table.clearTmpStorage();
085                            }
086                    }
087            }
088            
089            public void save(OutputStreamWriter osw) throws MaltChainedException  {
090                    try {
091                            BufferedWriter bout = new BufferedWriter(osw);
092                            for (TrieSymbolTable table : symbolTables.values()) {
093                                    table.saveHeader(bout);
094                            }
095                            bout.write('\n');
096                            for (TrieSymbolTable table : symbolTables.values()) {
097                                    table.save(bout);
098                            }
099                            bout.close();
100                    } catch (IOException e) {
101                            throw new SymbolException("Could not save the symbol tables. ", e);
102                    }               
103            }
104            
105            public void save(String fileName, String charSet) throws MaltChainedException  {
106                    try {
107                            save(new OutputStreamWriter(new FileOutputStream(fileName), charSet));
108                    } catch (FileNotFoundException e) {
109                            throw new SymbolException("The symbol table file '"+fileName+"' cannot be created. ", e);
110                    } catch (UnsupportedEncodingException e) {
111                            throw new SymbolException("The char set '"+charSet+"' is not supported. ", e);
112                    }
113            }
114            
115            public void loadHeader(BufferedReader bin) throws MaltChainedException {
116                    String fileLine = "";
117                    Pattern tabPattern = Pattern.compile("\t");
118                    try {
119                            while ((fileLine = bin.readLine()) != null) {
120                                    if (fileLine.length() == 0 || fileLine.charAt(0) != '\t') {
121                                            break;
122                                    }
123                                    String items[];
124                                    try {
125                                            items = tabPattern.split(fileLine.substring(1));
126                                    } catch (PatternSyntaxException e) {
127                                            throw new SymbolException("The header line of the symbol table  '"+fileLine.substring(1)+"' could not split into atomic parts. ", e);
128                                    }
129                                    if (items.length != 3) {
130                                            throw new SymbolException("The header line of the symbol table  '"+fileLine.substring(1)+"' must contain four columns. ");
131                                    }
132                                    addSymbolTable(items[0], Integer.parseInt(items[1]), items[2]);
133                            }
134                    } catch (NumberFormatException e) {
135                            throw new SymbolException("The symbol table file (.sym) contains a non-integer value in the header. ", e);
136                    } catch (IOException e) {
137                            throw new SymbolException("Could not load the symbol table. ", e);
138                    }
139            }
140            
141            
142            public void load(InputStreamReader isr) throws MaltChainedException  {
143                    try {
144                            BufferedReader bin = new BufferedReader(isr);
145                            String fileLine;
146                            SymbolTable table = null;
147                            bin.mark(2);
148                            if (bin.read() == '\t') {
149                                    bin.reset();
150                                    loadHeader(bin);
151                            } else {
152                                    bin.reset();
153                            }
154                            while ((fileLine = bin.readLine()) != null) {
155                                    if (fileLine.length() > 0) {
156                                            table = addSymbolTable(fileLine);
157                                            table.load(bin);
158                                    }
159                            }
160                            bin.close();
161                    } catch (IOException e) {
162                            throw new SymbolException("Could not load the symbol tables. ", e);
163                    }                       
164            }
165            
166            public void load(String fileName, String charSet) throws MaltChainedException  {
167                    try {
168                            load(new InputStreamReader(new FileInputStream(fileName), charSet));
169    
170                    } catch (FileNotFoundException e) {
171                            throw new SymbolException("The symbol table file '"+fileName+"' cannot be found. ", e);
172                    } catch (UnsupportedEncodingException e) {
173                            throw new SymbolException("The char set '"+charSet+"' is not supported. ", e);
174                    }               
175            }
176            
177            
178            public SymbolTable loadTagset(String fileName, String tableName, String charSet, int columnCategory, String nullValueStrategy) throws MaltChainedException {
179                    try {
180                            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), charSet));
181                            String fileLine;
182                            TrieSymbolTable table = addSymbolTable(tableName, columnCategory, nullValueStrategy);
183    
184                            while ((fileLine = br.readLine()) != null) {
185                                    table.addSymbol(fileLine.trim());
186                            }
187                            return table;
188                    } catch (FileNotFoundException e) {
189                            throw new SymbolException("The tagset file '"+fileName+"' cannot be found. ", e);
190                    } catch (UnsupportedEncodingException e) {
191                            throw new SymbolException("The char set '"+charSet+"' is not supported. ", e);
192                    } catch (IOException e) {
193                            throw new SymbolException("The tagset file '"+fileName+"' cannot be loaded. ", e);
194                    }
195            }
196            
197            public void printSymbolTables(Logger logger) throws MaltChainedException  {
198                    for (TrieSymbolTable table : symbolTables.values()) {
199                            table.printSymbolTable(logger);
200                    }       
201            }
202    }