001 package org.maltparser.core.syntaxgraph.writer; 002 003 import java.io.BufferedWriter; 004 import java.io.FileNotFoundException; 005 import java.io.FileOutputStream; 006 import java.io.IOException; 007 import java.io.OutputStream; 008 import java.io.OutputStreamWriter; 009 import java.io.UnsupportedEncodingException; 010 import java.util.Iterator; 011 import java.util.LinkedHashMap; 012 import java.util.SortedMap; 013 import java.util.TreeMap; 014 import java.util.regex.PatternSyntaxException; 015 016 import org.maltparser.core.exception.MaltChainedException; 017 import org.maltparser.core.io.dataformat.ColumnDescription; 018 import org.maltparser.core.io.dataformat.DataFormatException; 019 import org.maltparser.core.io.dataformat.DataFormatInstance; 020 import org.maltparser.core.syntaxgraph.PhraseStructure; 021 import org.maltparser.core.syntaxgraph.TokenStructure; 022 import org.maltparser.core.syntaxgraph.edge.Edge; 023 import org.maltparser.core.syntaxgraph.node.NonTerminalNode; 024 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode; 025 /** 026 * 027 * 028 * @author Johan Hall 029 */ 030 public class NegraWriter implements SyntaxGraphWriter { 031 private BufferedWriter writer; 032 private DataFormatInstance dataFormatInstance; 033 private String optionString; 034 private int sentenceCount; 035 private LinkedHashMap<Integer, Integer> nonTerminalIndexMap; 036 private int START_ID_OF_NONTERMINALS = 500; 037 private boolean closeStream = true; 038 039 public NegraWriter() { 040 nonTerminalIndexMap = new LinkedHashMap<Integer, Integer>(); 041 } 042 043 public void open(String fileName, String charsetName) throws MaltChainedException { 044 try { 045 open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName)); 046 } catch (FileNotFoundException e) { 047 throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e); 048 } catch (UnsupportedEncodingException e) { 049 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e); 050 } 051 } 052 053 public void open(OutputStream os, String charsetName) throws MaltChainedException { 054 try { 055 if (os == System.out || os == System.err) { 056 closeStream = false; 057 } 058 open(new OutputStreamWriter(os, charsetName)); 059 } catch (UnsupportedEncodingException e) { 060 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e); 061 } 062 } 063 064 private void open(OutputStreamWriter osw) throws MaltChainedException { 065 setWriter(new BufferedWriter(osw)); 066 setSentenceCount(0); 067 } 068 069 public void writeProlog() throws MaltChainedException { } 070 071 public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException { 072 if (syntaxGraph == null || dataFormatInstance == null || !(syntaxGraph instanceof PhraseStructure) || !syntaxGraph.hasTokens()) { 073 return; 074 } 075 PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph; 076 sentenceCount++; 077 try { 078 writer.write("#BOS "); 079 if (phraseStructure.getSentenceID() != 0) { 080 writer.write(Integer.toString(phraseStructure.getSentenceID())); 081 } else { 082 writer.write(Integer.toString(sentenceCount)); 083 } 084 writer.write('\n'); 085 086 if (phraseStructure.hasNonTerminals()) { 087 calculateIndices(phraseStructure); 088 writeTerminals(phraseStructure); 089 writeNonTerminals(phraseStructure); 090 } else { 091 writeTerminals(phraseStructure); 092 } 093 writer.write("#EOS "); 094 if (phraseStructure.getSentenceID() != 0) { 095 writer.write(Integer.toString(phraseStructure.getSentenceID())); 096 } else { 097 writer.write(Integer.toString(sentenceCount)); 098 } 099 writer.write('\n'); 100 } catch (IOException e) { 101 throw new DataFormatException("Could not write to the output file. ", e); 102 } 103 } 104 public void writeEpilog() throws MaltChainedException { } 105 106 107 private void calculateIndices(PhraseStructure phraseStructure) throws MaltChainedException { 108 final SortedMap<Integer,Integer> heights = new TreeMap<Integer,Integer>(); 109 for (int index : phraseStructure.getNonTerminalIndices()) { 110 heights.put(index, ((NonTerminalNode)phraseStructure.getNonTerminalNode(index)).getHeight()); 111 } 112 113 boolean done = false; 114 int h = 1; 115 int ntid = START_ID_OF_NONTERMINALS; 116 nonTerminalIndexMap.clear(); 117 while (!done) { 118 done = true; 119 for (int index : phraseStructure.getNonTerminalIndices()) { 120 if (heights.get(index) == h) { 121 NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index); 122 nonTerminalIndexMap.put(nt.getIndex(), ntid++); 123 // nonTerminalIndexMap.put(nt.getIndex(), nt.getIndex()+START_ID_OF_NONTERMINALS-1); 124 done = false; 125 } 126 } 127 h++; 128 } 129 130 // boolean done = false; 131 // int h = 1; 132 //// int ntid = START_ID_OF_NONTERMINALS; 133 //// nonTerminalIndexMap.clear(); 134 // while (!done) { 135 // done = true; 136 // for (int index : phraseStructure.getNonTerminalIndices()) { 137 // if (heights.get(index) == h) { 138 // NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index); 139 //// nonTerminalIndexMap.put(nt.getIndex(), ntid++); 140 // nonTerminalIndexMap.put(nt.getIndex(), nt.getIndex()+START_ID_OF_NONTERMINALS-1); 141 // done = false; 142 // } 143 // } 144 // h++; 145 // } 146 } 147 148 private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException { 149 try { 150 for (int index : phraseStructure.getTokenIndices()) { 151 final PhraseStructureNode terminal = phraseStructure.getTokenNode(index); 152 final Iterator<ColumnDescription> columns = dataFormatInstance.iterator(); 153 ColumnDescription column = null; 154 int ti = 1; 155 while (columns.hasNext()) { 156 column = columns.next(); 157 if (column.getCategory() == ColumnDescription.INPUT) { 158 writer.write(terminal.getLabelSymbol(column.getSymbolTable())); 159 int nTabs = 1; 160 if (ti == 1 || ti == 2) { 161 nTabs = 3 - (terminal.getLabelSymbol(column.getSymbolTable()).length() / 8); 162 } else if (ti == 3) { 163 nTabs = 1; 164 } else if (ti == 4) { 165 nTabs = 2 - (terminal.getLabelSymbol(column.getSymbolTable()).length() / 8); 166 } 167 if (nTabs < 1) { 168 nTabs = 1; 169 } 170 for (int j = 0; j < nTabs; j++) { 171 writer.write('\t'); 172 } 173 ti++; 174 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) { 175 if (terminal.getParent() != null && terminal.hasParentEdgeLabel(column.getSymbolTable())) { 176 writer.write(terminal.getParentEdgeLabelSymbol(column.getSymbolTable())); 177 writer.write('\t'); 178 } else { 179 writer.write("--\t"); 180 } 181 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL) { 182 if (terminal.getParent() == null || terminal.getParent() == phraseStructure.getPhraseStructureRoot()) { 183 writer.write('0'); 184 } else { 185 writer.write(Integer.toString(nonTerminalIndexMap.get(terminal.getParent().getIndex()))); 186 // writer.write(Integer.toString(terminal.getParent().getIndex()+START_ID_OF_NONTERMINALS-1)); 187 } 188 } 189 } 190 for (Edge e : terminal.getIncomingSecondaryEdges()) { 191 if (e.hasLabel(column.getSymbolTable())) { 192 writer.write('\t'); 193 writer.write(e.getLabelSymbol(column.getSymbolTable())); 194 writer.write('\t'); 195 if (e.getSource() instanceof NonTerminalNode) { 196 writer.write(Integer.toString(nonTerminalIndexMap.get(e.getSource().getIndex()))); 197 // writer.write(Integer.toString(e.getSource().getIndex()+START_ID_OF_NONTERMINALS-1)); 198 } else { 199 writer.write(Integer.toString(e.getSource().getIndex())); 200 } 201 } 202 } 203 writer.write("\n"); 204 } 205 206 } catch (IOException e) { 207 throw new DataFormatException("The Negra writer is not able to write. ", e); 208 } 209 } 210 211 private void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException { 212 for (int index : nonTerminalIndexMap.keySet()) { 213 // for (int index : phraseStructure.getNonTerminalIndices()) { 214 NonTerminalNode nonTerminal = (NonTerminalNode)phraseStructure.getNonTerminalNode(index); 215 216 if (nonTerminal == null || nonTerminal.isRoot()) { 217 return; 218 } 219 try { 220 writer.write('#'); 221 // writer.write(Integer.toString(index+START_ID_OF_NONTERMINALS-1)); 222 writer.write(Integer.toString(nonTerminalIndexMap.get(index))); 223 writer.write("\t\t\t--\t\t\t"); 224 if (nonTerminal.hasLabel(dataFormatInstance.getColumnDescriptionByName("CAT").getSymbolTable())) { 225 writer.write(nonTerminal.getLabelSymbol(dataFormatInstance.getColumnDescriptionByName("CAT").getSymbolTable())); 226 } else { 227 writer.write("--"); 228 } 229 writer.write("\t--\t\t"); 230 if (nonTerminal.hasParentEdgeLabel(dataFormatInstance.getColumnDescriptionByName("LABEL").getSymbolTable())) { 231 writer.write(nonTerminal.getParentEdgeLabelSymbol(dataFormatInstance.getColumnDescriptionByName("LABEL").getSymbolTable())); 232 } else { 233 writer.write("--"); 234 } 235 writer.write('\t'); 236 if (nonTerminal.getParent() == null || nonTerminal.getParent().isRoot()) { 237 writer.write('0'); 238 } else { 239 // writer.write(Integer.toString(nonTerminal.getParent().getIndex()+START_ID_OF_NONTERMINALS-1)); 240 writer.write(Integer.toString(nonTerminalIndexMap.get(nonTerminal.getParent().getIndex()))); 241 } 242 for (Edge e : nonTerminal.getIncomingSecondaryEdges()) { 243 if (e.hasLabel(dataFormatInstance.getColumnDescriptionByName("SECEDGELABEL").getSymbolTable())) { 244 writer.write('\t'); 245 writer.write(e.getLabelSymbol(dataFormatInstance.getColumnDescriptionByName("SECEDGELABEL").getSymbolTable())); 246 writer.write('\t'); 247 if (e.getSource() instanceof NonTerminalNode) { 248 // writer.write(Integer.toString(e.getSource().getIndex()+START_ID_OF_NONTERMINALS-1)); 249 writer.write(Integer.toString(nonTerminalIndexMap.get(e.getSource().getIndex()))); 250 } else { 251 writer.write(Integer.toString(e.getSource().getIndex())); 252 } 253 } 254 } 255 writer.write("\n"); 256 } catch (IOException e) { 257 throw new DataFormatException("The Negra writer is not able to write the non-terminals. ", e); 258 } 259 } 260 } 261 262 public BufferedWriter getWriter() { 263 return writer; 264 } 265 266 public void setWriter(BufferedWriter writer) { 267 this.writer = writer; 268 } 269 270 public int getSentenceCount() { 271 return sentenceCount; 272 } 273 274 public void setSentenceCount(int sentenceCount) { 275 this.sentenceCount = sentenceCount; 276 } 277 278 public DataFormatInstance getDataFormatInstance() { 279 return dataFormatInstance; 280 } 281 282 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) { 283 this.dataFormatInstance = dataFormatInstance; 284 } 285 286 public String getOptions() { 287 return optionString; 288 } 289 290 public void setOptions(String optionString) throws MaltChainedException { 291 this.optionString = optionString; 292 String[] argv; 293 try { 294 argv = optionString.split("[_\\p{Blank}]"); 295 } catch (PatternSyntaxException e) { 296 throw new DataFormatException("Could not split the penn writer option '"+optionString+"'. ", e); 297 } 298 for (int i=0; i < argv.length-1; i++) { 299 if(argv[i].charAt(0) != '-') { 300 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0)); 301 } 302 if(++i>=argv.length) { 303 throw new DataFormatException("The last argument does not have any value. "); 304 } 305 switch(argv[i-1].charAt(1)) { 306 case 's': 307 try { 308 START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]); 309 } catch (NumberFormatException e){ 310 throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. "); 311 } 312 break; 313 default: 314 throw new DataFormatException("Unknown svm parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. "); 315 } 316 } 317 } 318 319 public void close() throws MaltChainedException { 320 try { 321 if (writer != null) { 322 writer.flush(); 323 if (closeStream) { 324 writer.close(); 325 } 326 writer = null; 327 } 328 } catch (IOException e) { 329 throw new DataFormatException("Could not close the output file. ", e); 330 } 331 } 332 }