package org.maltparser.core.syntaxgraph.writer;

import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.maltparser.core.exception.MaltChainedException;

import org.maltparser.core.helper.Util;
import org.maltparser.core.io.dataformat.ColumnDescription;
import org.maltparser.core.io.dataformat.DataFormatException;
import org.maltparser.core.io.dataformat.DataFormatInstance;
import org.maltparser.core.syntaxgraph.PhraseStructure;
import org.maltparser.core.syntaxgraph.TokenStructure;
import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
import org.maltparser.core.syntaxgraph.node.TokenNode;
import org.maltparser.core.syntaxgraph.reader.TigerXMLHeader;
import org.maltparser.ml.libsvm.LibsvmException;

/**
 * Writes phrase structure graphs in the TigerXML format.
 *
 * <p>Typical call order: {@link #setDataFormatInstance(DataFormatInstance)},
 * {@link #setOptions(String)}, one of the {@code open} methods, {@link #writeProlog()},
 * {@link #writeSentence(TokenStructure)} once per sentence, {@link #writeEpilog()} and
 * finally {@link #close()}.
 *
 * <p>Node identifiers are built as {@code <sentence id>_<n>}. Terminals use their token
 * index; non-terminals use their index shifted by the configurable start id of
 * non-terminals (500 unless changed with the {@code -s} option).
 *
 * @author Johan Hall
 */
public class TigerXMLWriter implements SyntaxGraphWriter {
    /** Selects how the root is treated for sentences that consist of a single token. */
    private enum RootHandling {
        TALBANKEN, NORMAL
    }

    /** Option tokens are separated by an underscore or a blank (space or tab). */
    private static final Pattern OPTION_SEPARATOR = Pattern.compile("[_\\p{Blank}]");

    private BufferedWriter writer;
    private DataFormatInstance dataFormatInstance;
    private String optionString;
    private int sentenceCount;
    private TigerXMLHeader header;
    /** Defaults to NORMAL so that writing works even if {@link #setOptions(String)} is never called. */
    private RootHandling rootHandling = RootHandling.NORMAL;
    private String sentencePrefix = "s";
    private final StringBuilder sentenceID;
    private final StringBuilder rootID;
    /** First number used for non-terminal ids; changed by the {@code -s} option. */
    private int startIdOfNonTerminals = 500;
    /** True when the data format has an input column named "id" or "ID". */
    private boolean labeledTerminalID;
    /** Root label that marks a virtual root; changed by the {@code -v} option. */
    private String vrootSymbol = "VROOT";

    public TigerXMLWriter() {
        sentenceID = new StringBuilder();
        rootID = new StringBuilder();
        labeledTerminalID = false;
    }

    /**
     * Opens the named file for writing with the given character encoding.
     *
     * @param fileName path of the output file
     * @param charsetName name of the character encoding
     * @throws MaltChainedException if the file cannot be opened or the encoding is not supported
     */
    public void open(String fileName, String charsetName) throws MaltChainedException {
        final FileOutputStream fileStream;
        try {
            fileStream = new FileOutputStream(fileName);
        } catch (FileNotFoundException e) {
            throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
        }
        boolean opened = false;
        try {
            open(new OutputStreamWriter(fileStream, charsetName));
            opened = true;
        } catch (UnsupportedEncodingException e) {
            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
        } finally {
            // Do not leak the file handle when the writer could not be created.
            if (!opened) {
                try {
                    fileStream.close();
                } catch (IOException ignored) {
                    // The failure that brought us here is the one worth reporting.
                }
            }
        }
    }

    /**
     * Wraps the given stream in a writer that uses the given character encoding.
     *
     * @param os stream to write to
     * @param charsetName name of the character encoding
     * @throws MaltChainedException if the encoding is not supported
     */
    public void open(OutputStream os, String charsetName) throws MaltChainedException {
        try {
            open(new OutputStreamWriter(os, charsetName));
        } catch (UnsupportedEncodingException e) {
            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
        }
    }

    /**
     * Uses the given writer (buffered) as output and resets the sentence counter to zero.
     *
     * @param osw writer to wrap
     */
    public void open(OutputStreamWriter osw) throws MaltChainedException {
        setWriter(new BufferedWriter(osw));
        setSentenceCount(0);
    }

    /** Writes the TigerXML header. */
    public void writeProlog() throws MaltChainedException {
        writeHeader();
    }

    /**
     * Writes one sentence as an {@code <s>} element containing a {@code <graph>}.
     *
     * <p>Nothing is written when the graph is null, when no data format instance has been
     * set, or when the graph has no tokens. The sentence id is the sentence prefix followed
     * by the graph's own sentence id, or by the running sentence count when that id is 0.
     *
     * @param syntaxGraph the graph to write; it is cast to {@link PhraseStructure}
     * @throws MaltChainedException if writing fails
     */
    public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
        if (syntaxGraph == null || dataFormatInstance == null || !syntaxGraph.hasTokens()) {
            return;
        }
        sentenceCount++;
        final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
        try {
            final int graphSentenceId = phraseStructure.getSentenceID();
            sentenceID.setLength(0);
            sentenceID.append(sentencePrefix);
            sentenceID.append(graphSentenceId != 0 ? graphSentenceId : sentenceCount);

            writer.write(" <s id=\"");
            writer.write(sentenceID.toString());
            writer.write("\">\n");

            setRootID(phraseStructure);
            writer.write(" <graph root=\"");
            writer.write(rootID.toString());
            writer.write("\" ");
            writer.write("discontinuous=\"");
            writer.write(Boolean.toString(!phraseStructure.isContinuous()));
            writer.write("\">\n");

            writeTerminals(phraseStructure);
            // A single-token sentence gets an empty non-terminal section unless
            // TALBANKEN root handling was requested.
            if (phraseStructure.nTokenNode() != 1 || rootHandling == RootHandling.TALBANKEN) {
                writeNonTerminals(phraseStructure);
            } else {
                writer.write(" <nonterminals/>\n");
            }
            writer.write(" </graph>\n");
            writer.write(" </s>\n");
        } catch (IOException e) {
            throw new DataFormatException("The TigerXML writer could not write to file. ", e);
        }
    }

    /**
     * Computes the id of the root node for the current sentence and stores it in {@code rootID}.
     *
     * <ul>
     * <li>root carries the virtual-root symbol as a label: {@code <sentence id>_<symbol>}</li>
     * <li>one token, no non-terminals and an unlabeled root: {@code <sentence id>_1}</li>
     * <li>otherwise: {@code <sentence id>_<start id + number of non-terminals>}</li>
     * </ul>
     */
    private void setRootID(PhraseStructure phraseStructure) throws MaltChainedException {
        final PhraseStructureNode root = phraseStructure.getPhraseStructureRoot();
        boolean useVroot = false;
        for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) {
            if (root.hasLabel(column.getSymbolTable()) && root.getLabelSymbol(column.getSymbolTable()).equals(vrootSymbol)) {
                useVroot = true;
                break;
            }
        }

        rootID.setLength(0);
        rootID.append(sentenceID);
        if (useVroot) {
            rootID.append('_');
            rootID.append(vrootSymbol);
        } else if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 && !root.isLabeled()) {
            rootID.append("_1");
        } else {
            rootID.append('_');
            rootID.append(startIdOfNonTerminals + phraseStructure.nNonTerminals());
        }
    }

    /** Writes the closing tags of the document and flushes the writer. */
    public void writeEpilog() throws MaltChainedException {
        writeTail();
    }

    public BufferedWriter getWriter() {
        return writer;
    }

    public void setWriter(BufferedWriter writer) {
        this.writer = writer;
    }

    /**
     * Closes the writer, if one is open. The writer reference is cleared even when
     * closing fails, so a second call is a no-op.
     *
     * @throws MaltChainedException if flushing or closing the writer fails
     */
    public void close() throws MaltChainedException {
        if (writer == null) {
            return;
        }
        try {
            // BufferedWriter.close() flushes the buffer before closing the underlying stream.
            writer.close();
        } catch (IOException e) {
            throw new DataFormatException("Could not close the output file. ", e);
        } finally {
            writer = null;
        }
    }

    /** Writes the TigerXML header, creating it from the data format's symbol tables on first use. */
    private void writeHeader() throws MaltChainedException {
        try {
            if (header == null) {
                header = new TigerXMLHeader(dataFormatInstance.getSymbolTables());
            }
            writer.write(header.toTigerXML());
        } catch (IOException e) {
            throw new DataFormatException("The TigerXML writer could not write to file. ", e);
        }
    }

    /**
     * Writes the {@code <terminals>} section: one {@code <t>} element per token with one
     * attribute per input column. A generated {@code id} attribute is written first unless
     * the data format already provides an id column.
     */
    private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
        try {
            writer.write(" <terminals>\n");
            for (int index : phraseStructure.getTokenIndices()) {
                final PhraseStructureNode terminal = phraseStructure.getTokenNode(index);
                writer.write(" <t ");
                if (!labeledTerminalID) {
                    writer.write("id=\"");
                    writer.write(nodeId(terminal.getIndex()));
                    writer.write("\" ");
                }
                for (ColumnDescription column : dataFormatInstance.getInputColumnDescriptionSet()) {
                    writer.write(attributeName(column));
                    writer.write("=\"");
                    writer.write(Util.xmlEscape(terminal.getLabelSymbol(column.getSymbolTable())));
                    writer.write("\" ");
                }
                writer.write("/>\n");
            }
            writer.write(" </terminals>\n");
        } catch (IOException e) {
            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
        }
    }

    /**
     * Writes the {@code <nonterminals>} section. Non-terminals are written in order of
     * increasing height, starting at height 1, followed by the root.
     *
     * <p>NOTE(review): the scan stops at the first height for which no node exists, so nodes
     * with height 0 or above a gap in the heights are not written. This mirrors the
     * established behavior; confirm that such graphs cannot occur.
     *
     * @param phraseStructure the graph whose non-terminals are written
     * @throws MaltChainedException if writing fails
     */
    public void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
        try {
            final SortedMap<Integer,Integer> heights = new TreeMap<Integer,Integer>();
            for (int index : phraseStructure.getNonTerminalIndices()) {
                heights.put(index, ((NonTerminalNode)phraseStructure.getNonTerminalNode(index)).getHeight());
            }
            writer.write(" <nonterminals>\n");
            boolean wroteAny = true;
            for (int height = 1; wroteAny; height++) {
                wroteAny = false;
                for (int index : phraseStructure.getNonTerminalIndices()) {
                    if (heights.get(index).intValue() == height) {
                        final NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
                        writeNonTerminal(nt, nodeId(nt.getIndex() + startIdOfNonTerminals - 1));
                        wroteAny = true;
                    }
                }
            }

            writeNonTerminal((NonTerminalNode)phraseStructure.getPhraseStructureRoot(), rootID.toString());
            writer.write(" </nonterminals>\n");
        } catch (IOException e) {
            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
        }
    }

    /**
     * Writes one {@code <nt>} element with its labels and one {@code <edge>} per child.
     *
     * @param nt the non-terminal to write
     * @param id the value of the element's {@code id} attribute (written as given)
     * @throws MaltChainedException if writing fails
     */
    public void writeNonTerminal(NonTerminalNode nt, String id) throws MaltChainedException {
        try {
            writer.write(" <nt");
            writer.write(" id=\"");
            writer.write(id);
            writer.write("\" ");
            for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) {
                if (nt.hasLabel(column.getSymbolTable())) {
                    writer.write(attributeName(column));
                    writer.write("=");
                    writer.write("\"");
                    writer.write(Util.xmlEscape(nt.getLabelSymbol(column.getSymbolTable())));
                    writer.write("\" ");
                }
            }
            writer.write(">\n");

            for (int i = 0, n = nt.nChildren(); i < n; i++) {
                final PhraseStructureNode child = nt.getChild(i);
                writer.write(" <edge ");
                for (ColumnDescription column : dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptionSet()) {
                    if (child.hasParentEdgeLabel(column.getSymbolTable())) {
                        writer.write(attributeName(column));
                        writer.write("=\"");
                        writer.write(Util.xmlEscape(child.getParentEdgeLabelSymbol(column.getSymbolTable())));
                        writer.write("\" ");
                    }
                }
                final String childId = (child instanceof TokenNode)
                        ? terminalId(child)
                        : nodeId(child.getIndex() + startIdOfNonTerminals - 1);
                writer.write(" idref=\"");
                writer.write(childId);
                writer.write("\"");
                writer.write(" />\n");
            }
            writer.write(" </nt>\n");
        } catch (IOException e) {
            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
        }
    }

    /** Builds {@code <sentence id>_<number>} for the sentence currently being written. */
    private String nodeId(int number) {
        return new StringBuilder(sentenceID).append('_').append(number).toString();
    }

    /**
     * Returns the id used to reference a terminal from an edge. When the data format has
     * its own id column, its value is used and escaped exactly like the {@code id}
     * attribute written for the terminal, so the two always match.
     */
    private String terminalId(PhraseStructureNode terminal) throws MaltChainedException {
        if (!labeledTerminalID) {
            return nodeId(terminal.getIndex());
        }
        // setDataFormatInstance accepts either spelling of the column name, so look up both.
        final String idColumn = dataFormatInstance.getInputColumnDescriptions().containsKey("ID") ? "ID" : "id";
        return Util.xmlEscape(terminal.getLabelSymbol(dataFormatInstance.getInputSymbolTables().get(idColumn)));
    }

    /** Lower-cases a column name for use as an XML attribute name, independent of the default locale. */
    private static String attributeName(ColumnDescription column) {
        return column.getName().toLowerCase(Locale.ENGLISH);
    }

    /** Writes the closing {@code </body>} and {@code </corpus>} tags and flushes the writer. */
    private void writeTail() throws MaltChainedException {
        try {
            writer.write(" </body>\n");
            writer.write("</corpus>\n");
            writer.flush();
        } catch (IOException e) {
            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
        }
    }

    public int getSentenceCount() {
        return sentenceCount;
    }

    public void setSentenceCount(int sentenceCount) {
        this.sentenceCount = sentenceCount;
    }

    public DataFormatInstance getDataFormatInstance() {
        return dataFormatInstance;
    }

    /**
     * Sets the data format and records whether it has an input column named "id" or "ID".
     *
     * @param dataFormatInstance the data format to use; may be null, in which case
     *        {@link #writeSentence(TokenStructure)} writes nothing
     */
    public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
        this.dataFormatInstance = dataFormatInstance;
        labeledTerminalID = dataFormatInstance != null
                && (dataFormatInstance.getInputColumnDescriptions().containsKey("id")
                        || dataFormatInstance.getInputColumnDescriptions().containsKey("ID"));
    }

    public String getOptions() {
        return optionString;
    }

    /**
     * Parses the writer options. The string is a sequence of flag/value pairs separated by
     * an underscore or a blank; empty tokens produced by repeated separators are ignored.
     *
     * <ul>
     * <li>{@code -r n|tal}: root handling (NORMAL or TALBANKEN); other values are ignored</li>
     * <li>{@code -s <int>}: first id number used for non-terminals</li>
     * <li>{@code -v <symbol>}: root label that marks a virtual root</li>
     * </ul>
     *
     * Root handling is reset to NORMAL before parsing. A null option string only performs
     * that reset.
     *
     * @param optionString the options to parse, may be null
     * @throws MaltChainedException if a flag does not start with '-', names no option, has no
     *         value, is unknown, or if the {@code -s} value is not an integer
     */
    public void setOptions(String optionString) throws MaltChainedException {
        this.optionString = optionString;
        rootHandling = RootHandling.NORMAL;
        if (optionString == null) {
            return;
        }

        final List<String> tokens = new ArrayList<String>();
        for (String token : OPTION_SEPARATOR.split(optionString)) {
            if (token.length() > 0) {
                tokens.add(token);
            }
        }

        for (int i = 0; i < tokens.size(); i += 2) {
            final String flag = tokens.get(i);
            if (flag.charAt(0) != '-') {
                throw new DataFormatException("The argument flag should start with the following character '-', not with "+flag.charAt(0));
            }
            if (i + 1 >= tokens.size()) {
                throw new DataFormatException("The last argument does not have any value. ");
            }
            if (flag.length() < 2) {
                throw new DataFormatException("The TigerXML writer option flag '"+flag+"' does not name an option. ");
            }
            final String value = tokens.get(i + 1);
            switch (flag.charAt(1)) {
            case 'r':
                if (value.equals("n")) {
                    rootHandling = RootHandling.NORMAL;
                } else if (value.equals("tal")) {
                    rootHandling = RootHandling.TALBANKEN;
                }
                break;
            case 's':
                try {
                    startIdOfNonTerminals = Integer.parseInt(value);
                } catch (NumberFormatException e) {
                    throw new DataFormatException("The TigerXML writer option -s must be an integer value. ", e);
                }
                break;
            case 'v':
                vrootSymbol = value;
                break;
            default:
                throw new DataFormatException("Unknown TigerXML writer option: '"+flag+"' with value '"+value+"'. ");
            }
        }
    }
}