001 package org.maltparser.core.syntaxgraph.writer; 002 003 import java.io.BufferedWriter; 004 import java.io.FileNotFoundException; 005 import java.io.FileOutputStream; 006 import java.io.IOException; 007 import java.io.OutputStream; 008 import java.io.OutputStreamWriter; 009 import java.io.UnsupportedEncodingException; 010 import java.util.SortedMap; 011 import java.util.regex.PatternSyntaxException; 012 013 import org.maltparser.core.exception.MaltChainedException; 014 import org.maltparser.core.io.dataformat.ColumnDescription; 015 import org.maltparser.core.io.dataformat.DataFormatException; 016 import org.maltparser.core.io.dataformat.DataFormatInstance; 017 import org.maltparser.core.symbol.SymbolTable; 018 import org.maltparser.core.syntaxgraph.PhraseStructure; 019 import org.maltparser.core.syntaxgraph.TokenStructure; 020 import org.maltparser.core.syntaxgraph.node.NonTerminalNode; 021 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode; 022 import org.maltparser.core.syntaxgraph.node.TokenNode; 023 /** 024 * 025 * 026 * @author Johan Hall 027 */ 028 public class BracketWriter implements SyntaxGraphWriter { 029 private enum PennWriterFormat { 030 DEFAULT, PRETTY 031 }; 032 private PennWriterFormat format; 033 private BufferedWriter writer; 034 private DataFormatInstance dataFormatInstance; 035 private SortedMap<String,ColumnDescription> inputColumns; 036 private SortedMap<String,ColumnDescription> edgeLabelColumns; 037 private SortedMap<String,ColumnDescription> phraseLabelColumns; 038 private char STARTING_BRACKET = '('; 039 private String EMPTY_EDGELABEL = "??"; 040 private char CLOSING_BRACKET = ')'; 041 private char INPUT_SEPARATOR = ' '; 042 private char EDGELABEL_SEPARATOR = '-'; 043 private char SENTENCE_SEPARATOR = '\n'; 044 private String optionString; 045 private boolean closeStream = true; 046 047 public BracketWriter() { 048 } 049 050 public void open(String fileName, String charsetName) throws MaltChainedException { 051 try { 052 open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName)); 053 } catch (FileNotFoundException e) { 054 throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e); 055 } catch (UnsupportedEncodingException e) { 056 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e); 057 } 058 } 059 060 public void open(OutputStream os, String charsetName) throws MaltChainedException { 061 try { 062 if (os == System.out || os == System.err) { 063 closeStream = false; 064 } 065 open(new OutputStreamWriter(os, charsetName)); 066 } catch (UnsupportedEncodingException e) { 067 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e); 068 } 069 } 070 071 private void open(OutputStreamWriter osw) throws MaltChainedException { 072 setWriter(new BufferedWriter(osw)); 073 } 074 075 public void writeEpilog() throws MaltChainedException { 076 077 } 078 079 public void writeProlog() throws MaltChainedException { 080 081 } 082 083 public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException { 084 if (syntaxGraph == null || dataFormatInstance == null) { 085 return; 086 } 087 if (syntaxGraph instanceof PhraseStructure && syntaxGraph.hasTokens()) { 088 // PhraseStructure phraseStructure = ((PhraseStructure) syntaxGraph); 089 if (format == PennWriterFormat.PRETTY) { 090 writeElement(((PhraseStructure) syntaxGraph).getPhraseStructureRoot(), 0); 091 } else { 092 writeElement(((PhraseStructure) syntaxGraph).getPhraseStructureRoot()); 093 } 094 try { 095 writer.write(SENTENCE_SEPARATOR); 096 writer.flush(); 097 } catch (IOException e) { 098 close(); 099 throw new DataFormatException("Could not write to the output file. ", e); 100 } 101 } 102 } 103 104 private void writeElement(PhraseStructureNode element) throws MaltChainedException { 105 try { 106 if (element instanceof TokenNode) { 107 PhraseStructureNode t = (PhraseStructureNode)element; 108 SymbolTable table = null; 109 writer.write(STARTING_BRACKET); 110 int i = 0; 111 for (String inputColumn : inputColumns.keySet()) { 112 if (i != 0) { 113 writer.write(INPUT_SEPARATOR); 114 } 115 table = inputColumns.get(inputColumn).getSymbolTable(); 116 if (t.hasLabel(table)) { 117 writer.write(t.getLabelSymbol(table)); 118 } 119 if (i == 0) { 120 for (String edgeLabelColumn : edgeLabelColumns.keySet()) { 121 table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable(); 122 if (t.hasParentEdgeLabel(table) && !t.getParent().isRoot() && !t.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) { 123 writer.write(EDGELABEL_SEPARATOR); 124 writer.write(t.getParentEdgeLabelSymbol(table)); 125 } 126 } 127 } 128 i++; 129 } 130 writer.write(CLOSING_BRACKET); 131 } else { 132 NonTerminalNode nt = (NonTerminalNode)element; 133 writer.write(STARTING_BRACKET); 134 SymbolTable table = null; 135 int i = 0; 136 for (String phraseLabelColumn : phraseLabelColumns.keySet()) { 137 if (i != 0) { 138 writer.write(INPUT_SEPARATOR); 139 } 140 table = phraseLabelColumns.get(phraseLabelColumn).getSymbolTable(); 141 if (nt.hasLabel(table)) { 142 writer.write(nt.getLabelSymbol(table)); 143 } 144 if (i == 0) { 145 for (String edgeLabelColumn : edgeLabelColumns.keySet()) { 146 table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable(); 147 if (nt.hasParentEdgeLabel(table) && !nt.getParent().isRoot() && !nt.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) { 148 writer.write(EDGELABEL_SEPARATOR); 149 writer.write(nt.getParentEdgeLabelSymbol(table)); 150 } 151 } 152 } 153 i++; 154 } 155 for (PhraseStructureNode node : ((NonTerminalNode)element).getChildren()) { 156 writeElement(node); 157 } 158 writer.write(CLOSING_BRACKET); 159 } 160 } catch (IOException e) { 161 throw new DataFormatException("Could not write to the output file. ", e); 162 } 163 } 164 165 private String getIndentation(int depth) { 166 StringBuilder sb = new StringBuilder(""); 167 for (int i = 0; i < depth; i++) { 168 sb.append("\t"); 169 } 170 return sb.toString(); 171 } 172 173 private void writeElement(PhraseStructureNode element, int depth) throws MaltChainedException { 174 try { 175 if (element instanceof TokenNode) { 176 PhraseStructureNode t = (PhraseStructureNode)element; 177 SymbolTable table = null; 178 writer.write("\n" + getIndentation(depth) + STARTING_BRACKET); 179 int i = 0; 180 for (String inputColumn : inputColumns.keySet()) { 181 if (i != 0) { 182 writer.write(INPUT_SEPARATOR); 183 } 184 table = inputColumns.get(inputColumn).getSymbolTable(); 185 if (t.hasLabel(table)) { 186 writer.write(encodeString(t.getLabelSymbol(table))); 187 } 188 if (i == 0) { 189 for (String edgeLabelColumn : edgeLabelColumns.keySet()) { 190 table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable(); 191 if (t.hasParentEdgeLabel(table) && !t.getParent().isRoot() && !t.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) { 192 writer.write(EDGELABEL_SEPARATOR); 193 writer.write(t.getParentEdgeLabelSymbol(table)); 194 } 195 } 196 } 197 i++; 198 } 199 writer.write(CLOSING_BRACKET); 200 } else { 201 NonTerminalNode nt = (NonTerminalNode)element; 202 writer.write("\n" + getIndentation(depth) + STARTING_BRACKET); 203 SymbolTable table = null; 204 int i = 0; 205 for (String phraseLabelColumn : phraseLabelColumns.keySet()) { 206 if (i != 0) { 207 writer.write(INPUT_SEPARATOR); 208 } 209 table = phraseLabelColumns.get(phraseLabelColumn).getSymbolTable(); 210 if (nt.hasLabel(table)) { 211 writer.write(nt.getLabelSymbol(table)); 212 } 213 if (i == 0) { 214 for (String edgeLabelColumn : edgeLabelColumns.keySet()) { 215 table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable(); 216 if (nt.hasParentEdgeLabel(table) && !nt.getParent().isRoot() && !nt.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) { 217 writer.write(EDGELABEL_SEPARATOR); 218 writer.write(nt.getParentEdgeLabelSymbol(table)); 219 } 220 } 221 } 222 i++; 223 } 224 for (PhraseStructureNode node : ((NonTerminalNode)element).getChildren()) { 225 writeElement(node, depth + 1); 226 } 227 writer.write("\n" + getIndentation(depth) + CLOSING_BRACKET); 228 } 229 } catch (IOException e) { 230 throw new DataFormatException("Could not write to the output file. ", e); 231 } 232 } 233 234 public BufferedWriter getWriter() { 235 return writer; 236 } 237 238 public void setWriter(BufferedWriter writer) throws MaltChainedException { 239 close(); 240 this.writer = writer; 241 } 242 243 public DataFormatInstance getDataFormatInstance() { 244 return dataFormatInstance; 245 } 246 247 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) { 248 this.dataFormatInstance = dataFormatInstance; 249 inputColumns = dataFormatInstance.getInputColumnDescriptions(); 250 edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions(); 251 phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions(); 252 } 253 254 public String getOptions() { 255 return optionString; 256 } 257 258 public void setOptions(String optionString) throws MaltChainedException { 259 this.optionString = optionString; 260 format = PennWriterFormat.DEFAULT; 261 262 String[] argv; 263 try { 264 argv = optionString.split("[_\\p{Blank}]"); 265 } catch (PatternSyntaxException e) { 266 throw new DataFormatException("Could not split the bracket writer option '"+optionString+"'. ", e); 267 } 268 for (int i=0; i < argv.length-1; i++) { 269 if(argv[i].charAt(0) != '-') { 270 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0)); 271 } 272 if(++i>=argv.length) { 273 throw new DataFormatException("The last argument does not have any value. "); 274 } 275 switch(argv[i-1].charAt(1)) { 276 case 'f': 277 if (argv[i].equals("p")) { 278 format = PennWriterFormat.PRETTY; 279 } else if (argv[i].equals("p")) { 280 format = PennWriterFormat.DEFAULT; 281 } 282 break; 283 default: 284 throw new DataFormatException("Unknown bracket writer option: '"+argv[i-1]+"' with value '"+argv[i]+"'. "); 285 } 286 } 287 } 288 289 public void close() throws MaltChainedException { 290 try { 291 if (writer != null) { 292 writer.flush(); 293 if (closeStream) { 294 writer.close(); 295 } 296 writer = null; 297 } 298 } catch (IOException e) { 299 throw new DataFormatException("Could not close the output file. ", e); 300 } 301 } 302 303 private String encodeString(String string) { 304 return string.replace("(", "-LRB-").replace(")", "-RRB-").replace("[", "-LSB-").replace("]", "-RSB-").replace("{", "-LCB-").replace("}", "-RCB-"); 305 } 306 }