001package org.maltparser.core.syntaxgraph.writer; 002 003import java.io.BufferedWriter; 004import java.io.FileNotFoundException; 005import java.io.FileOutputStream; 006import java.io.IOException; 007import java.io.OutputStream; 008import java.io.OutputStreamWriter; 009import java.io.UnsupportedEncodingException; 010import java.util.SortedMap; 011import java.util.TreeMap; 012import java.util.regex.PatternSyntaxException; 013 014import org.maltparser.core.exception.MaltChainedException; 015 016import org.maltparser.core.helper.Util; 017import org.maltparser.core.io.dataformat.ColumnDescription; 018import org.maltparser.core.io.dataformat.DataFormatException; 019import org.maltparser.core.io.dataformat.DataFormatInstance; 020import org.maltparser.core.symbol.SymbolTableHandler; 021import org.maltparser.core.syntaxgraph.PhraseStructure; 022import org.maltparser.core.syntaxgraph.TokenStructure; 023import org.maltparser.core.syntaxgraph.node.NonTerminalNode; 024import org.maltparser.core.syntaxgraph.node.PhraseStructureNode; 025import org.maltparser.core.syntaxgraph.node.TokenNode; 026import org.maltparser.core.syntaxgraph.reader.TigerXMLHeader; 027/** 028* 029* 030* @author Johan Hall 031*/ 032public class TigerXMLWriter implements SyntaxGraphWriter { 033 private enum RootHandling { 034 TALBANKEN, NORMAL 035 }; 036 037 private BufferedWriter writer; 038 private DataFormatInstance dataFormatInstance; 039 private String optionString; 040 private int sentenceCount; 041 private TigerXMLHeader header; 042// private boolean hasWriteTigerXMLHeader = false; 043 private RootHandling rootHandling; 044 private String sentencePrefix = "s"; 045 private StringBuilder sentenceID; 046 private StringBuilder tmpID; 047 private StringBuilder rootID; 048 private int START_ID_OF_NONTERMINALS = 500; 049 private boolean labeledTerminalID; 050 private String VROOT_SYMBOL = "VROOT"; 051 private boolean useVROOT = false; 052// private String fileName = null; 053// private String charsetName = null; 054 private boolean closeStream = true; 055 056 public TigerXMLWriter() { 057 sentenceID = new StringBuilder(); 058 tmpID = new StringBuilder(); 059 rootID = new StringBuilder(); 060 labeledTerminalID = false; 061 } 062 063 public void open(String fileName, String charsetName) throws MaltChainedException { 064 try { 065// this.fileName = fileName; 066// this.charsetName = charsetName; 067 open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName)); 068 } catch (FileNotFoundException e) { 069 throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e); 070 } catch (UnsupportedEncodingException e) { 071 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e); 072 } 073 } 074 075 public void open(OutputStream os, String charsetName) throws MaltChainedException { 076 try { 077 if (os == System.out || os == System.err) { 078 closeStream = false; 079 } 080 open(new OutputStreamWriter(os, charsetName)); 081 } catch (UnsupportedEncodingException e) { 082 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e); 083 } 084 } 085 086 private void open(OutputStreamWriter osw) throws MaltChainedException { 087 setWriter(new BufferedWriter(osw)); 088 setSentenceCount(0); 089 } 090 091 public void writeProlog() throws MaltChainedException { 092// if (fileName == null || charsetName == null) { 093 writeHeader(); 094// } 095 } 096 097 public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException { 098 if (syntaxGraph == null || dataFormatInstance == null) { 099 return; 100 } 101 if (syntaxGraph.hasTokens()) { 102 sentenceCount++; 103 final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph; 104 try { 105 sentenceID.setLength(0); 106 sentenceID.append(sentencePrefix); 107 if (phraseStructure.getSentenceID() != 0) { 108 sentenceID.append(Integer.toString(phraseStructure.getSentenceID())); 109 } else { 110 sentenceID.append(Integer.toString(sentenceCount)); 111 } 112 writer.write(" <s id=\""); 113 writer.write(sentenceID.toString()); 114 writer.write("\">\n"); 115 116 setRootID(phraseStructure); 117 writer.write(" <graph root=\""); 118 writer.write(rootID.toString()); 119 writer.write("\" "); 120 writer.write("discontinuous=\""); 121 writer.write(Boolean.toString(!phraseStructure.isContinuous())); 122 writer.write("\">\n"); 123 124 writeTerminals(phraseStructure); 125 if (phraseStructure.nTokenNode() != 1 || rootHandling.equals(RootHandling.TALBANKEN)) { 126 writeNonTerminals(phraseStructure); 127 } else { 128 writer.write(" <nonterminals/>\n"); 129 } 130 writer.write(" </graph>\n"); 131 writer.write(" </s>\n"); 132 } catch (IOException e) { 133 throw new DataFormatException("The TigerXML writer could not write to file. ", e); 134 } 135 } 136 } 137 138 private void setRootID(PhraseStructure phraseStructure) throws MaltChainedException { 139 useVROOT = false; 140 PhraseStructureNode root = phraseStructure.getPhraseStructureRoot(); 141 final SymbolTableHandler symbolTables = phraseStructure.getSymbolTables(); 142 for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) { 143 if (root.hasLabel(symbolTables.getSymbolTable(column.getName())) && root.getLabelSymbol(symbolTables.getSymbolTable(column.getName())).equals(VROOT_SYMBOL)) { 144 useVROOT = true; 145 break; 146 } 147 } 148 if (useVROOT) { 149 rootID.setLength(0); 150 rootID.append(sentenceID); 151 rootID.append('_'); 152 rootID.append(VROOT_SYMBOL); 153 } else if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 && !root.isLabeled()) { 154 rootID.setLength(0); 155 rootID.append(sentenceID); 156 rootID.append("_1"); 157 } else { 158 rootID.setLength(0); 159 rootID.append(sentenceID); 160 rootID.append('_'); 161// if (rootHandling.equals(RootHandling.NORMAL)) { 162 rootID.append(Integer.toString(START_ID_OF_NONTERMINALS+phraseStructure.nNonTerminals())); 163// } else if (rootHandling.equals(RootHandling.TALBANKEN)) { 164// rootID.append(Integer.toString(START_ID_OF_NONTERMINALS+1)); 165// } 166 } 167 168 } 169 170 public void writeEpilog() throws MaltChainedException { 171 writeTail(); 172 } 173 174 public BufferedWriter getWriter() { 175 return writer; 176 } 177 178 public void setWriter(BufferedWriter writer) { 179 this.writer = writer; 180 } 181 182 public void close() throws MaltChainedException { 183 try { 184 if (writer != null) { 185 writer.flush(); 186 if (closeStream) { 187 writer.close(); 188 } 189 writer = null; 190 } 191 } catch (IOException e) { 192 throw new DataFormatException("Could not close the output file. ", e); 193 } 194 } 195 196 private void writeHeader() throws MaltChainedException { 197 try { 198 if (header == null) { 199 header = new TigerXMLHeader(); 200 } 201 writer.write(header.toTigerXML()); 202// hasWriteTigerXMLHeader = true; 203 } catch (IOException e) { 204 throw new DataFormatException("The TigerXML writer could not write to file. ", e); 205 } 206 } 207 208 209 private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException { 210 try { 211 writer.write(" <terminals>\n"); 212 for (int index : phraseStructure.getTokenIndices()) { 213 final PhraseStructureNode t = phraseStructure.getTokenNode(index); 214 writer.write(" <t "); 215 if (!labeledTerminalID) { 216 tmpID.setLength(0); 217 tmpID.append(sentenceID); 218 tmpID.append('_'); 219 tmpID.append(Integer.toString(t.getIndex())); 220 writer.write("id=\"");writer.write(tmpID.toString());writer.write("\" "); 221 } 222 223 for (ColumnDescription column : dataFormatInstance.getInputColumnDescriptionSet()) { 224 writer.write(column.getName().toLowerCase()); 225 writer.write("=\""); 226 writer.write(Util.xmlEscape(t.getLabelSymbol(phraseStructure.getSymbolTables().getSymbolTable(column.getName())))); 227 writer.write("\" "); 228 } 229 writer.write("/>\n"); 230 } 231 writer.write(" </terminals>\n"); 232 } catch (IOException e) { 233 throw new DataFormatException("The TigerXML writer is not able to write. ", e); 234 } 235 } 236 237 public void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException { 238 try { 239 SortedMap<Integer,Integer> heights = new TreeMap<Integer,Integer>(); 240 for (int index : phraseStructure.getNonTerminalIndices()) { 241 heights.put(index, ((NonTerminalNode)phraseStructure.getNonTerminalNode(index)).getHeight()); 242 } 243 writer.write(" <nonterminals>\n"); 244 boolean done = false; 245 int h = 1; 246 while (!done) { 247 done = true; 248 for (int index : phraseStructure.getNonTerminalIndices()) { 249 if (heights.get(index) == h) { 250 NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index); 251 tmpID.setLength(0); 252 tmpID.append(sentenceID); 253 tmpID.append('_'); 254 tmpID.append(Integer.toString(nt.getIndex()+START_ID_OF_NONTERMINALS-1)); 255 writeNonTerminal(phraseStructure.getSymbolTables(), nt, tmpID.toString()); 256 done = false; 257 } 258 } 259 h++; 260 } 261 262 writeNonTerminal(phraseStructure.getSymbolTables(), (NonTerminalNode)phraseStructure.getPhraseStructureRoot(),rootID.toString()); 263 writer.write(" </nonterminals>\n"); 264 } catch (IOException e) { 265 throw new DataFormatException("The TigerXML writer is not able to write. ", e); 266 } 267 } 268 269 public void writeNonTerminal(SymbolTableHandler symbolTables, NonTerminalNode nt, String id) throws MaltChainedException { 270 try { 271 writer.write(" <nt"); 272 writer.write(" id=\"");writer.write(id);writer.write("\" "); 273 for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) { 274 if (nt.hasLabel(symbolTables.getSymbolTable(column.getName()))) { 275 writer.write(column.getName().toLowerCase()); 276 writer.write("="); 277 writer.write("\""); 278 writer.write(Util.xmlEscape(nt.getLabelSymbol(symbolTables.getSymbolTable(column.getName())))); 279 writer.write("\" "); 280 } 281 } 282 writer.write(">\n"); 283 284 for (int i = 0, n = nt.nChildren(); i < n; i++) { 285 PhraseStructureNode child = nt.getChild(i); 286 writer.write(" <edge "); 287 288 for (ColumnDescription column : dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptionSet()) { 289 if (child.hasParentEdgeLabel(symbolTables.getSymbolTable(column.getName()))) { 290 writer.write(column.getName().toLowerCase()); 291 writer.write("=\""); 292 writer.write(Util.xmlEscape(child.getParentEdgeLabelSymbol(symbolTables.getSymbolTable(column.getName())))); 293 writer.write("\" "); 294 } 295 } 296 if (child instanceof TokenNode) { 297 if (!labeledTerminalID) { 298 tmpID.setLength(0); 299 tmpID.append(sentenceID); 300 tmpID.append('_'); 301 tmpID.append(Integer.toString(child.getIndex())); 302 writer.write(" idref=\"");writer.write(tmpID.toString());writer.write("\""); 303 } else { 304 writer.write(" idref=\"");writer.write(child.getLabelSymbol(symbolTables.getSymbolTable("ID")));writer.write("\""); 305 } 306 307 } else { 308 tmpID.setLength(0); 309 tmpID.append(sentenceID); 310 tmpID.append('_'); 311 tmpID.append(Integer.toString(child.getIndex()+START_ID_OF_NONTERMINALS-1)); 312 writer.write(" idref=\"");writer.write(tmpID.toString());writer.write("\""); 313 } 314 writer.write(" />\n"); 315 } 316 writer.write(" </nt>\n"); 317 } catch (IOException e) { 318 throw new DataFormatException("The TigerXML writer is not able to write. ", e); 319 } 320 } 321 322 323 private void writeTail() throws MaltChainedException { 324 try { 325 writer.write(" </body>\n"); 326 writer.write("</corpus>\n"); 327 writer.flush(); 328// if (fileName != null && charsetName != null) { 329// writer.close(); 330// writer = null; 331// BufferedWriter headerWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName+".header"),charsetName)); 332// if (header == null) { 333// header = new TigerXMLHeader(dataFormatInstance.getSymbolTables()); 334// } 335// 336// headerWriter.write(header.toTigerXML()); 337// headerWriter.flush(); 338// headerWriter.close(); 339// } 340 } catch (IOException e) { 341 throw new DataFormatException("The TigerXML writer is not able to write. ", e); 342 } 343 } 344 345 public int getSentenceCount() { 346 return sentenceCount; 347 } 348 349 public void setSentenceCount(int sentenceCount) { 350 this.sentenceCount = sentenceCount; 351 } 352 353 public DataFormatInstance getDataFormatInstance() { 354 return dataFormatInstance; 355 } 356 357 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) { 358 this.dataFormatInstance = dataFormatInstance; 359 labeledTerminalID = (dataFormatInstance.getInputColumnDescriptions().containsKey("id") || dataFormatInstance.getInputColumnDescriptions().containsKey("ID")); 360 } 361 362 public String getOptions() { 363 return optionString; 364 } 365 366 public void setOptions(String optionString) throws MaltChainedException { 367 this.optionString = optionString; 368 rootHandling = RootHandling.NORMAL; 369 370 String[] argv; 371 try { 372 argv = optionString.split("[_\\p{Blank}]"); 373 } catch (PatternSyntaxException e) { 374 throw new DataFormatException("Could not split the TigerXML writer option '"+optionString+"'. ", e); 375 } 376 for (int i=0; i < argv.length-1; i++) { 377 if(argv[i].charAt(0) != '-') { 378 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0)); 379 } 380 if(++i>=argv.length) { 381 throw new DataFormatException("The last argument does not have any value. "); 382 } 383 switch(argv[i-1].charAt(1)) { 384 case 'r': 385 if (argv[i].equals("n")) { 386 rootHandling = RootHandling.NORMAL; 387 } else if (argv[i].equals("tal")) { 388 rootHandling = RootHandling.TALBANKEN; 389 } 390 break; 391 case 's': 392 try { 393 START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]); 394 } catch (NumberFormatException e){ 395 throw new MaltChainedException("The TigerXML writer option -s must be an integer value. "); 396 } 397 break; 398 case 'v': 399 VROOT_SYMBOL = argv[i]; 400 break; 401 default: 402 throw new DataFormatException("Unknown TigerXML writer option: '"+argv[i-1]+"' with value '"+argv[i]+"'. "); 403 } 404 } 405 } 406}