001 package org.maltparser.core.syntaxgraph.writer; 002 003 import java.io.BufferedWriter; 004 import java.io.FileNotFoundException; 005 import java.io.FileOutputStream; 006 import java.io.IOException; 007 import java.io.OutputStream; 008 import java.io.OutputStreamWriter; 009 import java.io.UnsupportedEncodingException; 010 import java.util.SortedMap; 011 import java.util.regex.PatternSyntaxException; 012 013 import org.maltparser.core.exception.MaltChainedException; 014 import org.maltparser.core.io.dataformat.ColumnDescription; 015 import org.maltparser.core.io.dataformat.DataFormatException; 016 import org.maltparser.core.io.dataformat.DataFormatInstance; 017 import org.maltparser.core.symbol.SymbolTable; 018 import org.maltparser.core.syntaxgraph.PhraseStructure; 019 import org.maltparser.core.syntaxgraph.TokenStructure; 020 import org.maltparser.core.syntaxgraph.node.NonTerminalNode; 021 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode; 022 import org.maltparser.core.syntaxgraph.node.TokenNode; 023 import org.maltparser.ml.libsvm.LibsvmException; 024 /** 025 * 026 * 027 * @author Johan Hall 028 */ 029 public class BracketWriter implements SyntaxGraphWriter { 030 private enum PennWriterFormat { 031 DEFAULT, PRETTY 032 }; 033 private PennWriterFormat format; 034 private BufferedWriter writer; 035 private DataFormatInstance dataFormatInstance; 036 private SortedMap<String,ColumnDescription> inputColumns; 037 private SortedMap<String,ColumnDescription> edgeLabelColumns; 038 private SortedMap<String,ColumnDescription> phraseLabelColumns; 039 private char STARTING_BRACKET = '('; 040 private String EMPTY_EDGELABEL = "??"; 041 private char CLOSING_BRACKET = ')'; 042 private char INPUT_SEPARATOR = ' '; 043 private char EDGELABEL_SEPARATOR = '-'; 044 private char SENTENCE_SEPARATOR = '\n'; 045 private String optionString; 046 047 public BracketWriter() { 048 } 049 050 public void open(String fileName, String charsetName) throws MaltChainedException { 051 try { 052 open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName)); 053 } catch (FileNotFoundException e) { 054 throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e); 055 } catch (UnsupportedEncodingException e) { 056 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e); 057 } 058 } 059 060 public void open(OutputStream os, String charsetName) throws MaltChainedException { 061 try { 062 open(new OutputStreamWriter(os, charsetName)); 063 } catch (UnsupportedEncodingException e) { 064 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e); 065 } 066 } 067 068 public void open(OutputStreamWriter osw) throws MaltChainedException { 069 setWriter(new BufferedWriter(osw)); 070 } 071 072 public void writeEpilog() throws MaltChainedException { 073 074 } 075 076 public void writeProlog() throws MaltChainedException { 077 078 } 079 080 public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException { 081 if (syntaxGraph == null || dataFormatInstance == null) { 082 return; 083 } 084 if (syntaxGraph instanceof PhraseStructure && syntaxGraph.hasTokens()) { 085 // PhraseStructure phraseStructure = ((PhraseStructure) syntaxGraph); 086 if (format == PennWriterFormat.PRETTY) { 087 writeElement(((PhraseStructure) syntaxGraph).getPhraseStructureRoot(), 0); 088 } else { 089 writeElement(((PhraseStructure) syntaxGraph).getPhraseStructureRoot()); 090 } 091 try { 092 writer.write(SENTENCE_SEPARATOR); 093 writer.flush(); 094 } catch (IOException e) { 095 close(); 096 throw new DataFormatException("Could not write to the output file. ", e); 097 } 098 } 099 } 100 101 private void writeElement(PhraseStructureNode element) throws MaltChainedException { 102 try { 103 if (element instanceof TokenNode) { 104 PhraseStructureNode t = (PhraseStructureNode)element; 105 SymbolTable table = null; 106 writer.write(STARTING_BRACKET); 107 int i = 0; 108 for (String inputColumn : inputColumns.keySet()) { 109 if (i != 0) { 110 writer.write(INPUT_SEPARATOR); 111 } 112 table = inputColumns.get(inputColumn).getSymbolTable(); 113 if (t.hasLabel(table)) { 114 writer.write(t.getLabelSymbol(table)); 115 } 116 if (i == 0) { 117 for (String edgeLabelColumn : edgeLabelColumns.keySet()) { 118 table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable(); 119 if (t.hasParentEdgeLabel(table) && !t.getParent().isRoot() && !t.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) { 120 writer.write(EDGELABEL_SEPARATOR); 121 writer.write(t.getParentEdgeLabelSymbol(table)); 122 } 123 } 124 } 125 i++; 126 } 127 writer.write(CLOSING_BRACKET); 128 } else { 129 NonTerminalNode nt = (NonTerminalNode)element; 130 writer.write(STARTING_BRACKET); 131 SymbolTable table = null; 132 int i = 0; 133 for (String phraseLabelColumn : phraseLabelColumns.keySet()) { 134 if (i != 0) { 135 writer.write(INPUT_SEPARATOR); 136 } 137 table = phraseLabelColumns.get(phraseLabelColumn).getSymbolTable(); 138 if (nt.hasLabel(table)) { 139 writer.write(nt.getLabelSymbol(table)); 140 } 141 if (i == 0) { 142 for (String edgeLabelColumn : edgeLabelColumns.keySet()) { 143 table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable(); 144 if (nt.hasParentEdgeLabel(table) && !nt.getParent().isRoot() && !nt.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) { 145 writer.write(EDGELABEL_SEPARATOR); 146 writer.write(nt.getParentEdgeLabelSymbol(table)); 147 } 148 } 149 } 150 i++; 151 } 152 for (PhraseStructureNode node : ((NonTerminalNode)element).getChildren()) { 153 writeElement(node); 154 } 155 writer.write(CLOSING_BRACKET); 156 } 157 } catch (IOException e) { 158 throw new DataFormatException("Could not write to the output file. ", e); 159 } 160 } 161 162 private String getIndentation(int depth) { 163 StringBuilder sb = new StringBuilder(""); 164 for (int i = 0; i < depth; i++) { 165 sb.append("\t"); 166 } 167 return sb.toString(); 168 } 169 170 private void writeElement(PhraseStructureNode element, int depth) throws MaltChainedException { 171 try { 172 if (element instanceof TokenNode) { 173 PhraseStructureNode t = (PhraseStructureNode)element; 174 SymbolTable table = null; 175 writer.write("\n" + getIndentation(depth) + STARTING_BRACKET); 176 int i = 0; 177 for (String inputColumn : inputColumns.keySet()) { 178 if (i != 0) { 179 writer.write(INPUT_SEPARATOR); 180 } 181 table = inputColumns.get(inputColumn).getSymbolTable(); 182 if (t.hasLabel(table)) { 183 writer.write(encodeString(t.getLabelSymbol(table))); 184 } 185 if (i == 0) { 186 for (String edgeLabelColumn : edgeLabelColumns.keySet()) { 187 table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable(); 188 if (t.hasParentEdgeLabel(table) && !t.getParent().isRoot() && !t.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) { 189 writer.write(EDGELABEL_SEPARATOR); 190 writer.write(t.getParentEdgeLabelSymbol(table)); 191 } 192 } 193 } 194 i++; 195 } 196 writer.write(CLOSING_BRACKET); 197 } else { 198 NonTerminalNode nt = (NonTerminalNode)element; 199 writer.write("\n" + getIndentation(depth) + STARTING_BRACKET); 200 SymbolTable table = null; 201 int i = 0; 202 for (String phraseLabelColumn : phraseLabelColumns.keySet()) { 203 if (i != 0) { 204 writer.write(INPUT_SEPARATOR); 205 } 206 table = phraseLabelColumns.get(phraseLabelColumn).getSymbolTable(); 207 if (nt.hasLabel(table)) { 208 writer.write(nt.getLabelSymbol(table)); 209 } 210 if (i == 0) { 211 for (String edgeLabelColumn : edgeLabelColumns.keySet()) { 212 table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable(); 213 if (nt.hasParentEdgeLabel(table) && !nt.getParent().isRoot() && !nt.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) { 214 writer.write(EDGELABEL_SEPARATOR); 215 writer.write(nt.getParentEdgeLabelSymbol(table)); 216 } 217 } 218 } 219 i++; 220 } 221 for (PhraseStructureNode node : ((NonTerminalNode)element).getChildren()) { 222 writeElement(node, depth + 1); 223 } 224 writer.write("\n" + getIndentation(depth) + CLOSING_BRACKET); 225 } 226 } catch (IOException e) { 227 throw new DataFormatException("Could not write to the output file. ", e); 228 } 229 } 230 231 public BufferedWriter getWriter() { 232 return writer; 233 } 234 235 public void setWriter(BufferedWriter writer) throws MaltChainedException { 236 close(); 237 this.writer = writer; 238 } 239 240 public DataFormatInstance getDataFormatInstance() { 241 return dataFormatInstance; 242 } 243 244 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) { 245 this.dataFormatInstance = dataFormatInstance; 246 inputColumns = dataFormatInstance.getInputColumnDescriptions(); 247 edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions(); 248 phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions(); 249 } 250 251 public String getOptions() { 252 return optionString; 253 } 254 255 public void setOptions(String optionString) throws MaltChainedException { 256 this.optionString = optionString; 257 format = PennWriterFormat.DEFAULT; 258 259 String[] argv; 260 try { 261 argv = optionString.split("[_\\p{Blank}]"); 262 } catch (PatternSyntaxException e) { 263 throw new DataFormatException("Could not split the bracket writer option '"+optionString+"'. ", e); 264 } 265 for (int i=0; i < argv.length-1; i++) { 266 if(argv[i].charAt(0) != '-') { 267 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0)); 268 } 269 if(++i>=argv.length) { 270 throw new DataFormatException("The last argument does not have any value. "); 271 } 272 switch(argv[i-1].charAt(1)) { 273 case 'f': 274 if (argv[i].equals("p")) { 275 format = PennWriterFormat.PRETTY; 276 } else if (argv[i].equals("p")) { 277 format = PennWriterFormat.DEFAULT; 278 } 279 break; 280 default: 281 throw new LibsvmException("Unknown bracket writer option: '"+argv[i-1]+"' with value '"+argv[i]+"'. "); 282 } 283 } 284 } 285 286 public void close() throws MaltChainedException { 287 try { 288 if (writer != null) { 289 writer.flush(); 290 writer.close(); 291 writer = null; 292 } 293 } catch (IOException e) { 294 throw new DataFormatException("Could not close the output file. ", e); 295 } 296 } 297 298 private String encodeString(String string) { 299 return string.replace("(", "-LRB-").replace(")", "-RRB-").replace("[", "-LSB-").replace("]", "-RSB-").replace("{", "-LCB-").replace("}", "-RCB-"); 300 } 301 }