001package org.maltparser.core.syntaxgraph.writer; 002 003import java.io.BufferedWriter; 004import java.io.FileNotFoundException; 005import java.io.FileOutputStream; 006import java.io.IOException; 007import java.io.OutputStream; 008import java.io.OutputStreamWriter; 009import java.io.UnsupportedEncodingException; 010import java.util.SortedMap; 011import java.util.regex.PatternSyntaxException; 012 013import org.maltparser.core.exception.MaltChainedException; 014import org.maltparser.core.io.dataformat.ColumnDescription; 015import org.maltparser.core.io.dataformat.DataFormatException; 016import org.maltparser.core.io.dataformat.DataFormatInstance; 017import org.maltparser.core.symbol.SymbolTable; 018import org.maltparser.core.symbol.SymbolTableHandler; 019import org.maltparser.core.syntaxgraph.PhraseStructure; 020import org.maltparser.core.syntaxgraph.TokenStructure; 021import org.maltparser.core.syntaxgraph.node.NonTerminalNode; 022import org.maltparser.core.syntaxgraph.node.PhraseStructureNode; 023import org.maltparser.core.syntaxgraph.node.TokenNode; 024/** 025* 026* 027* @author Johan Hall 028*/ 029public class BracketWriter implements SyntaxGraphWriter { 030 private enum PennWriterFormat { 031 DEFAULT, PRETTY 032 }; 033 private PennWriterFormat format; 034 private BufferedWriter writer; 035 private DataFormatInstance dataFormatInstance; 036 private SortedMap<String,ColumnDescription> inputColumns; 037 private SortedMap<String,ColumnDescription> edgeLabelColumns; 038 private SortedMap<String,ColumnDescription> phraseLabelColumns; 039 private char STARTING_BRACKET = '('; 040 private String EMPTY_EDGELABEL = "??"; 041 private char CLOSING_BRACKET = ')'; 042 private char INPUT_SEPARATOR = ' '; 043 private char EDGELABEL_SEPARATOR = '-'; 044 private char SENTENCE_SEPARATOR = '\n'; 045 private String optionString; 046 private boolean closeStream = true; 047 048 public BracketWriter() { 049 } 050 051 public void open(String fileName, String charsetName) throws MaltChainedException { 052 try { 053 open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName)); 054 } catch (FileNotFoundException e) { 055 throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e); 056 } catch (UnsupportedEncodingException e) { 057 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e); 058 } 059 } 060 061 public void open(OutputStream os, String charsetName) throws MaltChainedException { 062 try { 063 if (os == System.out || os == System.err) { 064 closeStream = false; 065 } 066 open(new OutputStreamWriter(os, charsetName)); 067 } catch (UnsupportedEncodingException e) { 068 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e); 069 } 070 } 071 072 private void open(OutputStreamWriter osw) throws MaltChainedException { 073 setWriter(new BufferedWriter(osw)); 074 } 075 076 public void writeEpilog() throws MaltChainedException { 077 078 } 079 080 public void writeProlog() throws MaltChainedException { 081 082 } 083 084 public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException { 085 if (syntaxGraph == null || dataFormatInstance == null) { 086 return; 087 } 088 if (syntaxGraph instanceof PhraseStructure && syntaxGraph.hasTokens()) { 089// PhraseStructure phraseStructure = ((PhraseStructure) syntaxGraph); 090 if (format == PennWriterFormat.PRETTY) { 091 writeElement(syntaxGraph.getSymbolTables(), ((PhraseStructure) syntaxGraph).getPhraseStructureRoot(), 0); 092 } else { 093 writeElement(syntaxGraph.getSymbolTables(), ((PhraseStructure) syntaxGraph).getPhraseStructureRoot()); 094 } 095 try { 096 writer.write(SENTENCE_SEPARATOR); 097 writer.flush(); 098 } catch (IOException e) { 099 close(); 100 throw new DataFormatException("Could not write to the output file. ", e); 101 } 102 } 103 } 104 105 private void writeElement(SymbolTableHandler symbolTables, PhraseStructureNode element) throws MaltChainedException { 106 try { 107 if (element instanceof TokenNode) { 108 PhraseStructureNode t = (PhraseStructureNode)element; 109 SymbolTable table = null; 110 writer.write(STARTING_BRACKET); 111 int i = 0; 112 for (String inputColumn : inputColumns.keySet()) { 113 if (i != 0) { 114 writer.write(INPUT_SEPARATOR); 115 } 116 table = symbolTables.getSymbolTable(inputColumns.get(inputColumn).getName()); 117 if (t.hasLabel(table)) { 118 writer.write(t.getLabelSymbol(table)); 119 } 120 if (i == 0) { 121 for (String edgeLabelColumn : edgeLabelColumns.keySet()) { 122 table = symbolTables.getSymbolTable(edgeLabelColumns.get(edgeLabelColumn).getName()); 123 if (t.hasParentEdgeLabel(table) && !t.getParent().isRoot() && !t.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) { 124 writer.write(EDGELABEL_SEPARATOR); 125 writer.write(t.getParentEdgeLabelSymbol(table)); 126 } 127 } 128 } 129 i++; 130 } 131 writer.write(CLOSING_BRACKET); 132 } else { 133 NonTerminalNode nt = (NonTerminalNode)element; 134 writer.write(STARTING_BRACKET); 135 SymbolTable table = null; 136 int i = 0; 137 for (String phraseLabelColumn : phraseLabelColumns.keySet()) { 138 if (i != 0) { 139 writer.write(INPUT_SEPARATOR); 140 } 141 table = symbolTables.getSymbolTable(phraseLabelColumns.get(phraseLabelColumn).getName()); 142 if (nt.hasLabel(table)) { 143 writer.write(nt.getLabelSymbol(table)); 144 } 145 if (i == 0) { 146 for (String edgeLabelColumn : edgeLabelColumns.keySet()) { 147 table = symbolTables.getSymbolTable(edgeLabelColumns.get(edgeLabelColumn).getName()); 148 if (nt.hasParentEdgeLabel(table) && !nt.getParent().isRoot() && !nt.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) { 149 writer.write(EDGELABEL_SEPARATOR); 150 writer.write(nt.getParentEdgeLabelSymbol(table)); 151 } 152 } 153 } 154 i++; 155 } 156 for (PhraseStructureNode node : ((NonTerminalNode)element).getChildren()) { 157 writeElement(symbolTables, node); 158 } 159 writer.write(CLOSING_BRACKET); 160 } 161 } catch (IOException e) { 162 throw new DataFormatException("Could not write to the output file. ", e); 163 } 164 } 165 166 private String getIndentation(int depth) { 167 StringBuilder sb = new StringBuilder(""); 168 for (int i = 0; i < depth; i++) { 169 sb.append("\t"); 170 } 171 return sb.toString(); 172 } 173 174 private void writeElement(SymbolTableHandler symbolTables, PhraseStructureNode element, int depth) throws MaltChainedException { 175 try { 176 if (element instanceof TokenNode) { 177 PhraseStructureNode t = (PhraseStructureNode)element; 178 SymbolTable table = null; 179 writer.write("\n" + getIndentation(depth) + STARTING_BRACKET); 180 int i = 0; 181 for (String inputColumn : inputColumns.keySet()) { 182 if (i != 0) { 183 writer.write(INPUT_SEPARATOR); 184 } 185 table = symbolTables.getSymbolTable(inputColumns.get(inputColumn).getName()); 186 if (t.hasLabel(table)) { 187 writer.write(encodeString(t.getLabelSymbol(table))); 188 } 189 if (i == 0) { 190 for (String edgeLabelColumn : edgeLabelColumns.keySet()) { 191 table = symbolTables.getSymbolTable(edgeLabelColumns.get(edgeLabelColumn).getName()); 192 if (t.hasParentEdgeLabel(table) && !t.getParent().isRoot() && !t.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) { 193 writer.write(EDGELABEL_SEPARATOR); 194 writer.write(t.getParentEdgeLabelSymbol(table)); 195 } 196 } 197 } 198 i++; 199 } 200 writer.write(CLOSING_BRACKET); 201 } else { 202 NonTerminalNode nt = (NonTerminalNode)element; 203 writer.write("\n" + getIndentation(depth) + STARTING_BRACKET); 204 SymbolTable table = null; 205 int i = 0; 206 for (String phraseLabelColumn : phraseLabelColumns.keySet()) { 207 if (i != 0) { 208 writer.write(INPUT_SEPARATOR); 209 } 210 table = symbolTables.getSymbolTable(phraseLabelColumns.get(phraseLabelColumn).getName()); 211 if (nt.hasLabel(table)) { 212 writer.write(nt.getLabelSymbol(table)); 213 } 214 if (i == 0) { 215 for (String edgeLabelColumn : edgeLabelColumns.keySet()) { 216 table = symbolTables.getSymbolTable(edgeLabelColumns.get(edgeLabelColumn).getName()); 217 if (nt.hasParentEdgeLabel(table) && !nt.getParent().isRoot() && !nt.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) { 218 writer.write(EDGELABEL_SEPARATOR); 219 writer.write(nt.getParentEdgeLabelSymbol(table)); 220 } 221 } 222 } 223 i++; 224 } 225 for (PhraseStructureNode node : ((NonTerminalNode)element).getChildren()) { 226 writeElement(symbolTables, node, depth + 1); 227 } 228 writer.write("\n" + getIndentation(depth) + CLOSING_BRACKET); 229 } 230 } catch (IOException e) { 231 throw new DataFormatException("Could not write to the output file. ", e); 232 } 233 } 234 235 public BufferedWriter getWriter() { 236 return writer; 237 } 238 239 public void setWriter(BufferedWriter writer) throws MaltChainedException { 240 close(); 241 this.writer = writer; 242 } 243 244 public DataFormatInstance getDataFormatInstance() { 245 return dataFormatInstance; 246 } 247 248 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) { 249 this.dataFormatInstance = dataFormatInstance; 250 inputColumns = dataFormatInstance.getInputColumnDescriptions(); 251 edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions(); 252 phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions(); 253 } 254 255 public String getOptions() { 256 return optionString; 257 } 258 259 public void setOptions(String optionString) throws MaltChainedException { 260 this.optionString = optionString; 261 format = PennWriterFormat.DEFAULT; 262 263 String[] argv; 264 try { 265 argv = optionString.split("[_\\p{Blank}]"); 266 } catch (PatternSyntaxException e) { 267 throw new DataFormatException("Could not split the bracket writer option '"+optionString+"'. ", e); 268 } 269 for (int i=0; i < argv.length-1; i++) { 270 if(argv[i].charAt(0) != '-') { 271 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0)); 272 } 273 if(++i>=argv.length) { 274 throw new DataFormatException("The last argument does not have any value. "); 275 } 276 switch(argv[i-1].charAt(1)) { 277 case 'f': 278 if (argv[i].equals("p")) { 279 format = PennWriterFormat.PRETTY; 280 } else if (argv[i].equals("p")) { 281 format = PennWriterFormat.DEFAULT; 282 } 283 break; 284 default: 285 throw new DataFormatException("Unknown bracket writer option: '"+argv[i-1]+"' with value '"+argv[i]+"'. "); 286 } 287 } 288 } 289 290 public void close() throws MaltChainedException { 291 try { 292 if (writer != null) { 293 writer.flush(); 294 if (closeStream) { 295 writer.close(); 296 } 297 writer = null; 298 } 299 } catch (IOException e) { 300 throw new DataFormatException("Could not close the output file. ", e); 301 } 302 } 303 304 private String encodeString(String string) { 305 return string.replace("(", "-LRB-").replace(")", "-RRB-").replace("[", "-LSB-").replace("]", "-RSB-").replace("{", "-LCB-").replace("}", "-RCB-"); 306 } 307}