package org.maltparser.core.syntaxgraph.writer;

import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.PatternSyntaxException;

import org.maltparser.core.exception.MaltChainedException;

import org.maltparser.core.helper.Util;
import org.maltparser.core.io.dataformat.ColumnDescription;
import org.maltparser.core.io.dataformat.DataFormatException;
import org.maltparser.core.io.dataformat.DataFormatInstance;
import org.maltparser.core.syntaxgraph.PhraseStructure;
import org.maltparser.core.syntaxgraph.TokenStructure;
import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
import org.maltparser.core.syntaxgraph.node.TokenNode;
import org.maltparser.core.syntaxgraph.reader.TigerXMLHeader;

/**
 * Writes phrase structure graphs in the TigerXML format.
 *
 * <p>Typical call order: {@code setDataFormatInstance}, {@code setOptions},
 * one of the {@code open} methods, {@code writeProlog}, any number of
 * {@code writeSentence} calls, {@code writeEpilog} and finally {@code close}.
 *
 * <p>Node identifiers have the form {@code <sentence id>_<number>}. Terminals
 * use their own index; non-terminals use their index shifted by the
 * configurable start id of non-terminals (option {@code -s}, default 500).
 *
 * @author Johan Hall
 */
public class TigerXMLWriter implements SyntaxGraphWriter {
    private enum RootHandling {
        TALBANKEN, NORMAL
    }

    private BufferedWriter writer;
    private DataFormatInstance dataFormatInstance;
    private String optionString;
    private int sentenceCount;
    private TigerXMLHeader header;
    // Defaults to NORMAL so that writeSentence() works even if setOptions() was never called.
    private RootHandling rootHandling = RootHandling.NORMAL;
    private final String sentencePrefix = "s";
    private final StringBuilder sentenceID;
    private final StringBuilder tmpID;
    private final StringBuilder rootID;
    // Offset added to non-terminal indices when building their ids (option -s).
    private int startIdOfNonTerminals = 500;
    // True when the input format has an "ID"/"id" column that supplies the terminal ids.
    private boolean labeledTerminalID;
    // Name of that column ("ID" or "id"), or null when there is none.
    private String terminalIdColumnName;
    // Label symbol that marks a virtual root (option -v).
    private String vrootSymbol = "VROOT";
    private boolean useVROOT = false;
    // False when writing to System.out/System.err, which must not be closed by this writer.
    private boolean closeStream = true;

    public TigerXMLWriter() {
        sentenceID = new StringBuilder();
        tmpID = new StringBuilder();
        rootID = new StringBuilder();
        labeledTerminalID = false;
    }

    /**
     * Opens the named file for writing and resets the sentence counter.
     *
     * @param fileName path of the output file
     * @param charsetName name of the character encoding to write with
     * @throws MaltChainedException if the file cannot be opened or the encoding is not supported
     */
    public void open(String fileName, String charsetName) throws MaltChainedException {
        final FileOutputStream fileStream;
        try {
            fileStream = new FileOutputStream(fileName);
        } catch (FileNotFoundException e) {
            throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
        }
        try {
            // This writer owns the file stream, whatever was opened before.
            closeStream = true;
            open(new OutputStreamWriter(fileStream, charsetName));
        } catch (UnsupportedEncodingException e) {
            // Do not leak the file handle that was opened just above.
            closeQuietly(fileStream);
            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
        }
    }

    /**
     * Starts writing to an existing stream and resets the sentence counter.
     * {@code System.out} and {@code System.err} are flushed but never closed by {@link #close()}.
     *
     * @param os the stream to write to
     * @param charsetName name of the character encoding to write with
     * @throws MaltChainedException if the encoding is not supported
     */
    public void open(OutputStream os, String charsetName) throws MaltChainedException {
        try {
            // Recomputed on every call so that an earlier System.out/System.err
            // target does not prevent later streams from being closed.
            closeStream = !(os == System.out || os == System.err);
            open(new OutputStreamWriter(os, charsetName));
        } catch (UnsupportedEncodingException e) {
            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
        }
    }

    private void open(OutputStreamWriter osw) throws MaltChainedException {
        setWriter(new BufferedWriter(osw));
        setSentenceCount(0);
    }

    /** Closes a stream while another error is already being reported. */
    private static void closeQuietly(OutputStream stream) {
        try {
            stream.close();
        } catch (IOException ignored) {
            // The caller is about to throw the primary error; a close failure adds nothing.
        }
    }

    /**
     * Writes the TigerXML header.
     *
     * @throws MaltChainedException if the header cannot be written
     */
    public void writeProlog() throws MaltChainedException {
        writeHeader();
    }

    /**
     * Writes one sentence as an {@code <s>} element containing its graph.
     * Does nothing when the graph or the data format instance is {@code null},
     * or when the graph has no tokens.
     *
     * @param syntaxGraph the sentence to write; it is cast to {@link PhraseStructure}
     * @throws MaltChainedException if writing fails
     */
    public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
        if (syntaxGraph == null || dataFormatInstance == null) {
            return;
        }
        if (!syntaxGraph.hasTokens()) {
            return;
        }
        sentenceCount++;
        final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
        try {
            // Prefer the sentence id stored in the graph; fall back to the running count.
            sentenceID.setLength(0);
            sentenceID.append(sentencePrefix);
            if (phraseStructure.getSentenceID() != 0) {
                sentenceID.append(Integer.toString(phraseStructure.getSentenceID()));
            } else {
                sentenceID.append(Integer.toString(sentenceCount));
            }
            writer.write(" <s id=\"");
            writer.write(sentenceID.toString());
            writer.write("\">\n");

            setRootID(phraseStructure);
            writer.write(" <graph root=\"");
            writer.write(rootID.toString());
            writer.write("\" ");
            writer.write("discontinuous=\"");
            writer.write(Boolean.toString(!phraseStructure.isContinuous()));
            writer.write("\">\n");

            writeTerminals(phraseStructure);
            if (phraseStructure.nTokenNode() != 1 || rootHandling == RootHandling.TALBANKEN) {
                writeNonTerminals(phraseStructure);
            } else {
                writer.write(" <nonterminals/>\n");
            }
            writer.write(" </graph>\n");
            writer.write(" </s>\n");
        } catch (IOException e) {
            throw new DataFormatException("The TigerXML writer could not write to file. ", e);
        }
    }

    /**
     * Computes the id of the graph root for the current sentence and stores it in {@code rootID}.
     * The root handling option does not influence the id chosen here.
     */
    private void setRootID(PhraseStructure phraseStructure) throws MaltChainedException {
        useVROOT = false;
        final PhraseStructureNode root = phraseStructure.getPhraseStructureRoot();
        for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) {
            if (root.hasLabel(column.getSymbolTable()) && root.getLabelSymbol(column.getSymbolTable()).equals(vrootSymbol)) {
                useVROOT = true;
                break;
            }
        }
        rootID.setLength(0);
        rootID.append(sentenceID);
        if (useVROOT) {
            rootID.append('_');
            rootID.append(vrootSymbol);
        } else if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 && !root.isLabeled()) {
            // A lone token without any phrase above it: the token itself is the root.
            rootID.append("_1");
        } else {
            rootID.append('_');
            rootID.append(Integer.toString(startIdOfNonTerminals+phraseStructure.nNonTerminals()));
        }
    }

    /**
     * Writes the closing tags of the TigerXML document and flushes the writer.
     *
     * @throws MaltChainedException if writing fails
     */
    public void writeEpilog() throws MaltChainedException {
        writeTail();
    }

    public BufferedWriter getWriter() {
        return writer;
    }

    public void setWriter(BufferedWriter writer) {
        this.writer = writer;
    }

    /**
     * Flushes the writer and, unless it targets {@code System.out} or {@code System.err},
     * closes it. The writer reference is cleared even when the operation fails.
     *
     * @throws MaltChainedException if flushing or closing fails
     */
    public void close() throws MaltChainedException {
        if (writer == null) {
            return;
        }
        try {
            if (closeStream) {
                // BufferedWriter.close() flushes first, so the stream is released
                // even when the flush itself fails.
                writer.close();
            } else {
                writer.flush();
            }
        } catch (IOException e) {
            throw new DataFormatException("Could not close the output file. ", e);
        } finally {
            writer = null;
        }
    }

    /** Writes the TigerXML header, building it from the symbol tables on first use. */
    private void writeHeader() throws MaltChainedException {
        try {
            if (header == null) {
                header = new TigerXMLHeader(dataFormatInstance.getSymbolTables());
            }
            writer.write(header.toTigerXML());
        } catch (IOException e) {
            throw new DataFormatException("The TigerXML writer could not write to file. ", e);
        }
    }

    /** Returns {@code <sentence id>_<number>}, reusing the shared scratch buffer. */
    private String nodeId(int number) {
        tmpID.setLength(0);
        tmpID.append(sentenceID);
        tmpID.append('_');
        tmpID.append(Integer.toString(number));
        return tmpID.toString();
    }

    /** Returns the XML attribute name for a column: its name lower-cased independently of the default locale. */
    private static String attributeName(ColumnDescription column) {
        return column.getName().toLowerCase(Locale.ROOT);
    }

    /** Writes the {@code <terminals>} element with one {@code <t>} per token. */
    private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
        try {
            writer.write(" <terminals>\n");
            for (int index : phraseStructure.getTokenIndices()) {
                final PhraseStructureNode t = phraseStructure.getTokenNode(index);
                writer.write(" <t ");
                if (!labeledTerminalID) {
                    // No id column in the input format: generate the id from the token index.
                    writer.write("id=\"");
                    writer.write(nodeId(t.getIndex()));
                    writer.write("\" ");
                }

                for (ColumnDescription column : dataFormatInstance.getInputColumnDescriptionSet()) {
                    writer.write(attributeName(column));
                    writer.write("=\"");
                    writer.write(Util.xmlEscape(t.getLabelSymbol(column.getSymbolTable())));
                    writer.write("\" ");
                }
                writer.write("/>\n");
            }
            writer.write(" </terminals>\n");
        } catch (IOException e) {
            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
        }
    }

    /**
     * Writes the {@code <nonterminals>} element: all non-terminals in order of
     * increasing height (starting at height 1 and stopping at the first height
     * that no node has), followed by the phrase structure root.
     *
     * @param phraseStructure the sentence whose non-terminals are written
     * @throws MaltChainedException if writing fails
     */
    public void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
        try {
            // Bucket the nodes by height once instead of rescanning all of them per height level.
            final SortedMap<Integer,List<NonTerminalNode>> nodesByHeight = new TreeMap<Integer,List<NonTerminalNode>>();
            for (int index : phraseStructure.getNonTerminalIndices()) {
                final NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
                final Integer height = Integer.valueOf(nt.getHeight());
                List<NonTerminalNode> sameHeight = nodesByHeight.get(height);
                if (sameHeight == null) {
                    sameHeight = new ArrayList<NonTerminalNode>();
                    nodesByHeight.put(height, sameHeight);
                }
                sameHeight.add(nt);
            }
            writer.write(" <nonterminals>\n");
            for (int h = 1; nodesByHeight.containsKey(h); h++) {
                for (NonTerminalNode nt : nodesByHeight.get(h)) {
                    writeNonTerminal(nt, nodeId(nt.getIndex()+startIdOfNonTerminals-1));
                }
            }

            writeNonTerminal((NonTerminalNode)phraseStructure.getPhraseStructureRoot(),rootID.toString());
            writer.write(" </nonterminals>\n");
        } catch (IOException e) {
            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
        }
    }

    /**
     * Writes one {@code <nt>} element with its labels and one {@code <edge>} per child.
     *
     * @param nt the non-terminal to write
     * @param id the value of the element's {@code id} attribute
     * @throws MaltChainedException if writing fails
     */
    public void writeNonTerminal(NonTerminalNode nt, String id) throws MaltChainedException {
        try {
            writer.write(" <nt");
            writer.write(" id=\"");
            writer.write(id);
            writer.write("\" ");
            for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) {
                if (nt.hasLabel(column.getSymbolTable())) {
                    writer.write(attributeName(column));
                    writer.write("=");
                    writer.write("\"");
                    writer.write(Util.xmlEscape(nt.getLabelSymbol(column.getSymbolTable())));
                    writer.write("\" ");
                }
            }
            writer.write(">\n");

            for (int i = 0, n = nt.nChildren(); i < n; i++) {
                final PhraseStructureNode child = nt.getChild(i);
                writer.write(" <edge ");

                for (ColumnDescription column : dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptionSet()) {
                    if (child.hasParentEdgeLabel(column.getSymbolTable())) {
                        writer.write(attributeName(column));
                        writer.write("=\"");
                        writer.write(Util.xmlEscape(child.getParentEdgeLabelSymbol(column.getSymbolTable())));
                        writer.write("\" ");
                    }
                }

                final String idref;
                if (!(child instanceof TokenNode)) {
                    idref = nodeId(child.getIndex()+startIdOfNonTerminals-1);
                } else if (!labeledTerminalID) {
                    idref = nodeId(child.getIndex());
                } else {
                    // Must match the id attribute emitted by writeTerminals(), which is
                    // XML-escaped and comes from whichever of "ID"/"id" the format declares.
                    // NOTE(review): assumes the input symbol tables are keyed by column name - confirm.
                    idref = Util.xmlEscape(child.getLabelSymbol(dataFormatInstance.getInputSymbolTables().get(terminalIdColumnName)));
                }
                writer.write(" idref=\"");
                writer.write(idref);
                writer.write("\"");
                writer.write(" />\n");
            }
            writer.write(" </nt>\n");
        } catch (IOException e) {
            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
        }
    }

    /** Writes the closing {@code </body>} and {@code </corpus>} tags and flushes the writer. */
    private void writeTail() throws MaltChainedException {
        try {
            writer.write(" </body>\n");
            writer.write("</corpus>\n");
            writer.flush();
        } catch (IOException e) {
            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
        }
    }

    public int getSentenceCount() {
        return sentenceCount;
    }

    public void setSentenceCount(int sentenceCount) {
        this.sentenceCount = sentenceCount;
    }

    public DataFormatInstance getDataFormatInstance() {
        return dataFormatInstance;
    }

    /**
     * Sets the data format and records whether it declares an {@code ID} (or {@code id})
     * input column; if it does, that column supplies the terminal ids.
     *
     * @param dataFormatInstance the data format to write with; may be {@code null},
     *        in which case {@link #writeSentence(TokenStructure)} writes nothing
     */
    public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
        this.dataFormatInstance = dataFormatInstance;
        terminalIdColumnName = null;
        if (dataFormatInstance != null) {
            // "ID" takes precedence when both spellings are present.
            if (dataFormatInstance.getInputColumnDescriptions().containsKey("ID")) {
                terminalIdColumnName = "ID";
            } else if (dataFormatInstance.getInputColumnDescriptions().containsKey("id")) {
                terminalIdColumnName = "id";
            }
        }
        labeledTerminalID = (terminalIdColumnName != null);
    }

    public String getOptions() {
        return optionString;
    }

    /**
     * Parses the writer options. The string is a sequence of flag/value pairs
     * separated by blanks or underscores; empty tokens are skipped.
     * <ul>
     * <li>{@code -r n|tal} - root handling (normal or Talbanken); other values are ignored</li>
     * <li>{@code -s <int>} - start id of non-terminals</li>
     * <li>{@code -v <symbol>} - label symbol of the virtual root</li>
     * </ul>
     * Root handling is reset to normal on every call. A {@code null} or empty
     * option string leaves all other settings unchanged.
     *
     * @param optionString the options to parse, may be {@code null}
     * @throws MaltChainedException if a flag is malformed, unknown or lacks a value,
     *         or if the value of {@code -s} is not an integer
     */
    public void setOptions(String optionString) throws MaltChainedException {
        this.optionString = optionString;
        rootHandling = RootHandling.NORMAL;
        if (optionString == null) {
            return;
        }

        final String[] tokens;
        try {
            tokens = optionString.split("[_\\p{Blank}]");
        } catch (PatternSyntaxException e) {
            throw new DataFormatException("Could not split the TigerXML writer option '"+optionString+"'. ", e);
        }
        // Leading or repeated separators yield empty tokens; drop them so they
        // are not mistaken for flags or values.
        final List<String> argv = new ArrayList<String>(tokens.length);
        for (String token : tokens) {
            if (token.length() > 0) {
                argv.add(token);
            }
        }

        for (int i = 0; i < argv.size(); i += 2) {
            final String flag = argv.get(i);
            if (flag.charAt(0) != '-') {
                throw new DataFormatException("The argument flag should start with the following character '-', not with "+flag.charAt(0));
            }
            if (i + 1 >= argv.size()) {
                throw new DataFormatException("The last argument does not have any value. ");
            }
            final String value = argv.get(i + 1);
            if (flag.length() < 2) {
                // A bare "-" names no option.
                throw new DataFormatException("Unknown TigerXML writer option: '"+flag+"' with value '"+value+"'. ");
            }
            switch (flag.charAt(1)) {
            case 'r':
                if (value.equals("n")) {
                    rootHandling = RootHandling.NORMAL;
                } else if (value.equals("tal")) {
                    rootHandling = RootHandling.TALBANKEN;
                }
                // Any other value keeps the current root handling.
                break;
            case 's':
                try {
                    startIdOfNonTerminals = Integer.parseInt(value);
                } catch (NumberFormatException e) {
                    throw new DataFormatException("The TigerXML writer option -s must be an integer value. ", e);
                }
                break;
            case 'v':
                vrootSymbol = value;
                break;
            default:
                throw new DataFormatException("Unknown TigerXML writer option: '"+flag+"' with value '"+value+"'. ");
            }
        }
    }
}