package org.maltparser.core.syntaxgraph.writer;

import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.PatternSyntaxException;

import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.io.dataformat.ColumnDescription;
import org.maltparser.core.io.dataformat.DataFormatException;
import org.maltparser.core.io.dataformat.DataFormatInstance;
import org.maltparser.core.syntaxgraph.PhraseStructure;
import org.maltparser.core.syntaxgraph.TokenStructure;
import org.maltparser.core.syntaxgraph.edge.Edge;
import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
import org.maltparser.ml.libsvm.LibsvmException;

/**
 * Writes phrase structure graphs as Negra-style text: every sentence is
 * enclosed by a <code>#BOS id</code> and a <code>#EOS id</code> line, followed
 * by one tab-separated line per terminal and one <code>#id</code> line per
 * non-terminal.
 *
 * <p>Non-terminals are renumbered per sentence, lowest height first, starting
 * at a configurable first id (500 by default, option <code>-s</code>).</p>
 *
 * <p>Typical use: {@link #setDataFormatInstance(DataFormatInstance)}, one of
 * the <code>open</code> methods, any number of
 * {@link #writeSentence(TokenStructure)} calls, then {@link #close()}.</p>
 *
 * @author Johan Hall
 */
public class NegraWriter implements SyntaxGraphWriter {
    /** First id handed out to a non-terminal when option <code>-s</code> is not given. */
    private static final int DEFAULT_START_ID_OF_NONTERMINALS = 500;

    private BufferedWriter writer;
    private DataFormatInstance dataFormatInstance;
    private String optionString;
    private int sentenceCount;
    /** Maps a non-terminal's graph index to the id written to the output, in output order. */
    private final LinkedHashMap<Integer, Integer> nonTerminalIndexMap;
    /** First id handed out to a non-terminal; changed by option <code>-s</code>. */
    private int startIdOfNonTerminals = DEFAULT_START_ID_OF_NONTERMINALS;

    /**
     * Creates a writer that is not yet bound to any output.
     */
    public NegraWriter() {
        nonTerminalIndexMap = new LinkedHashMap<Integer, Integer>();
    }

    /**
     * Opens the named file for writing with the given character encoding.
     *
     * @param fileName the path of the output file
     * @param charsetName the name of the character encoding
     * @throws MaltChainedException if the file cannot be opened or the
     *         encoding is not supported
     */
    public void open(String fileName, String charsetName) throws MaltChainedException {
        FileOutputStream stream = null;
        boolean opened = false;
        try {
            stream = new FileOutputStream(fileName);
            open(new OutputStreamWriter(stream, charsetName));
            opened = true;
        } catch (FileNotFoundException e) {
            throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
        } catch (UnsupportedEncodingException e) {
            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
        } finally {
            // Do not leak the file handle when wrapping the stream failed.
            if (!opened) {
                closeQuietly(stream);
            }
        }
    }

    /**
     * Writes to the given stream using the given character encoding.
     *
     * @param os the output stream to write to
     * @param charsetName the name of the character encoding
     * @throws MaltChainedException if the encoding is not supported
     */
    public void open(OutputStream os, String charsetName) throws MaltChainedException {
        try {
            open(new OutputStreamWriter(os, charsetName));
        } catch (UnsupportedEncodingException e) {
            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
        }
    }

    /**
     * Wraps the given writer in a buffer and resets the sentence counter.
     *
     * @param osw the writer to send output to
     * @throws MaltChainedException declared for interface compatibility
     */
    public void open(OutputStreamWriter osw) throws MaltChainedException {
        setWriter(new BufferedWriter(osw));
        setSentenceCount(0);
    }

    /**
     * Writes nothing; this writer emits no prolog.
     */
    public void writeProlog() throws MaltChainedException { }

    /**
     * Writes one sentence. The call is silently ignored when the graph is
     * null, is not a {@link PhraseStructure}, has no tokens, or when no data
     * format instance has been set.
     *
     * <p>The sentence id written after <code>#BOS</code>/<code>#EOS</code> is
     * the graph's own sentence id when it is non-zero, otherwise the running
     * count of sentences written since the writer was opened.</p>
     *
     * @param syntaxGraph the sentence to write
     * @throws MaltChainedException if writing fails
     */
    public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
        if (syntaxGraph == null || dataFormatInstance == null || !(syntaxGraph instanceof PhraseStructure) || !syntaxGraph.hasTokens()) {
            return;
        }
        final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
        sentenceCount++;
        final int graphSentenceId = phraseStructure.getSentenceID();
        final String sentenceId = Integer.toString(graphSentenceId != 0 ? graphSentenceId : sentenceCount);
        try {
            writer.write("#BOS ");
            writer.write(sentenceId);
            writer.write('\n');

            if (phraseStructure.hasNonTerminals()) {
                calculateIndices(phraseStructure);
                writeTerminals(phraseStructure);
                writeNonTerminals(phraseStructure);
            } else {
                // Ids of a previous sentence must not be reused for this one.
                nonTerminalIndexMap.clear();
                writeTerminals(phraseStructure);
            }
            writer.write("#EOS ");
            writer.write(sentenceId);
            writer.write('\n');
        } catch (IOException e) {
            throw new DataFormatException("Could not write to the output file. ", e);
        }
    }

    /**
     * Writes nothing; this writer emits no epilog.
     */
    public void writeEpilog() throws MaltChainedException { }

    /**
     * Assigns output ids to the non-terminals of the sentence: nodes are
     * numbered consecutively from the configured start id, ordered by
     * ascending height and, within one height, in the order returned by
     * <code>getNonTerminalIndices()</code>.
     *
     * <p>The sweep runs up to the largest observed height instead of stopping
     * at the first height without nodes, so every non-terminal with a height
     * of at least 1 receives an id.</p>
     */
    private void calculateIndices(PhraseStructure phraseStructure) throws MaltChainedException {
        final SortedMap<Integer,Integer> heights = new TreeMap<Integer,Integer>();
        int maxHeight = 0;
        for (int index : phraseStructure.getNonTerminalIndices()) {
            final int height = ((NonTerminalNode)phraseStructure.getNonTerminalNode(index)).getHeight();
            heights.put(index, height);
            if (height > maxHeight) {
                maxHeight = height;
            }
        }

        nonTerminalIndexMap.clear();
        int nextId = startIdOfNonTerminals;
        for (int h = 1; h <= maxHeight; h++) {
            for (int index : phraseStructure.getNonTerminalIndices()) {
                if (heights.get(index) == h) {
                    final NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
                    nonTerminalIndexMap.put(nt.getIndex(), nextId++);
                }
            }
        }
    }

    /**
     * Writes one line per token. Columns are emitted in data format order:
     * INPUT columns as the token's label followed by padding tabs, the phrase
     * structure edge label column as the label of the edge to the parent (or
     * <code>--</code>), and the phrase structure node label column as the id
     * of the parent (or <code>0</code> when the parent is missing or is the
     * root). Labelled incoming secondary edges are appended as
     * label/source-id pairs.
     */
    private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
        try {
            for (int index : phraseStructure.getTokenIndices()) {
                final PhraseStructureNode terminal = phraseStructure.getTokenNode(index);
                final Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
                ColumnDescription column = null;
                int inputColumnNo = 1;
                while (columns.hasNext()) {
                    column = columns.next();
                    if (column.getCategory() == ColumnDescription.INPUT) {
                        final String symbol = terminal.getLabelSymbol(column.getSymbolTable());
                        writer.write(symbol);
                        final int nTabs = tabsAfterInputColumn(inputColumnNo, symbol.length());
                        for (int j = 0; j < nTabs; j++) {
                            writer.write('\t');
                        }
                        inputColumnNo++;
                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) {
                        if (terminal.getParent() != null && terminal.hasParentEdgeLabel(column.getSymbolTable())) {
                            writer.write(terminal.getParentEdgeLabelSymbol(column.getSymbolTable()));
                            writer.write('\t');
                        } else {
                            writer.write("--\t");
                        }
                    } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL) {
                        if (terminal.getParent() == null || terminal.getParent() == phraseStructure.getPhraseStructureRoot()) {
                            writer.write('0');
                        } else {
                            writer.write(nonTerminalId(terminal.getParent().getIndex()));
                        }
                    }
                }
                // NOTE(review): the LAST column of the data format is used as the
                // secondary edge label column here, whereas writeNonTerminals looks
                // the column up by the name "SECEDGELABEL" - confirm they coincide.
                if (column != null) {
                    for (Edge e : terminal.getIncomingSecondaryEdges()) {
                        if (e.hasLabel(column.getSymbolTable())) {
                            writer.write('\t');
                            writer.write(e.getLabelSymbol(column.getSymbolTable()));
                            writer.write('\t');
                            writer.write(secondaryEdgeSourceId(e));
                        }
                    }
                }
                writer.write("\n");
            }
        } catch (IOException e) {
            throw new DataFormatException("The Negra writer is not able to write. ", e);
        }
    }

    /**
     * Returns how many tabs follow an INPUT column value. The first, second
     * and fourth INPUT columns get fewer tabs the longer the value is (one
     * tab less per 8 characters, presumably to keep columns visually aligned
     * with 8-character tab stops); all other columns get one tab. At least
     * one tab is always written.
     */
    private static int tabsAfterInputColumn(int inputColumnNo, int symbolLength) {
        int nTabs = 1;
        if (inputColumnNo == 1 || inputColumnNo == 2) {
            nTabs = 3 - (symbolLength / 8);
        } else if (inputColumnNo == 4) {
            nTabs = 2 - (symbolLength / 8);
        }
        return nTabs < 1 ? 1 : nTabs;
    }

    /**
     * Writes one line per non-terminal that received an id, in id order:
     * <code>#id</code>, the label of column <code>CAT</code>, the parent edge
     * label of column <code>LABEL</code> (each <code>--</code> when absent),
     * the parent's id (<code>0</code> when the parent is missing or is the
     * root) and any incoming secondary edges labelled in column
     * <code>SECEDGELABEL</code>.
     */
    private void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
        try {
            for (int index : nonTerminalIndexMap.keySet()) {
                final NonTerminalNode nonTerminal = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
                if (nonTerminal == null || nonTerminal.isRoot()) {
                    // Skip only this node; the remaining non-terminals are still written.
                    continue;
                }
                writer.write('#');
                writer.write(nonTerminalId(index));
                writer.write("\t\t\t--\t\t\t");
                if (nonTerminal.hasLabel(dataFormatInstance.getColumnDescriptionByName("CAT").getSymbolTable())) {
                    writer.write(nonTerminal.getLabelSymbol(dataFormatInstance.getColumnDescriptionByName("CAT").getSymbolTable()));
                } else {
                    writer.write("--");
                }
                writer.write("\t--\t\t");
                if (nonTerminal.hasParentEdgeLabel(dataFormatInstance.getColumnDescriptionByName("LABEL").getSymbolTable())) {
                    writer.write(nonTerminal.getParentEdgeLabelSymbol(dataFormatInstance.getColumnDescriptionByName("LABEL").getSymbolTable()));
                } else {
                    writer.write("--");
                }
                writer.write('\t');
                if (nonTerminal.getParent() == null || nonTerminal.getParent().isRoot()) {
                    writer.write('0');
                } else {
                    writer.write(nonTerminalId(nonTerminal.getParent().getIndex()));
                }
                for (Edge e : nonTerminal.getIncomingSecondaryEdges()) {
                    // The column is looked up only when a secondary edge exists, so
                    // data formats without this column still work for plain trees.
                    if (e.hasLabel(dataFormatInstance.getColumnDescriptionByName("SECEDGELABEL").getSymbolTable())) {
                        writer.write('\t');
                        writer.write(e.getLabelSymbol(dataFormatInstance.getColumnDescriptionByName("SECEDGELABEL").getSymbolTable()));
                        writer.write('\t');
                        writer.write(secondaryEdgeSourceId(e));
                    }
                }
                writer.write("\n");
            }
        } catch (IOException e) {
            throw new DataFormatException("The Negra writer is not able to write the non-terminals. ", e);
        }
    }

    /**
     * Returns the id to print for the source of a secondary edge: the
     * assigned output id for a non-terminal source, the plain node index
     * otherwise.
     */
    private String secondaryEdgeSourceId(Edge e) throws MaltChainedException {
        if (e.getSource() instanceof NonTerminalNode) {
            return nonTerminalId(e.getSource().getIndex());
        }
        return Integer.toString(e.getSource().getIndex());
    }

    /**
     * Returns the output id assigned to the non-terminal with the given
     * graph index.
     *
     * @throws DataFormatException if no id has been assigned to that node,
     *         instead of failing with an unboxing NullPointerException
     */
    private String nonTerminalId(int nodeIndex) throws MaltChainedException {
        final Integer id = nonTerminalIndexMap.get(nodeIndex);
        if (id == null) {
            throw new DataFormatException("The Negra writer has no identifier for the non-terminal with index "+nodeIndex+". ");
        }
        return id.toString();
    }

    /**
     * Closes the stream, ignoring any failure; used only on error paths where
     * the original failure is the one that gets reported.
     */
    private static void closeQuietly(OutputStream stream) {
        if (stream != null) {
            try {
                stream.close();
            } catch (IOException ignored) {
                // Intentionally ignored: the caller is already reporting a failure.
            }
        }
    }

    /**
     * @return the buffered writer output is sent to, or null when not open
     */
    public BufferedWriter getWriter() {
        return writer;
    }

    /**
     * @param writer the buffered writer output is sent to
     */
    public void setWriter(BufferedWriter writer) {
        this.writer = writer;
    }

    /**
     * @return the number of sentences written since the writer was opened
     */
    public int getSentenceCount() {
        return sentenceCount;
    }

    /**
     * @param sentenceCount the new value of the running sentence counter
     */
    public void setSentenceCount(int sentenceCount) {
        this.sentenceCount = sentenceCount;
    }

    /**
     * @return the data format instance describing the output columns
     */
    public DataFormatInstance getDataFormatInstance() {
        return dataFormatInstance;
    }

    /**
     * @param dataFormatInstance the data format instance describing the output columns
     */
    public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
        this.dataFormatInstance = dataFormatInstance;
    }

    /**
     * @return the option string last passed to {@link #setOptions(String)}
     */
    public String getOptions() {
        return optionString;
    }

    /**
     * Parses the writer options. The string is a sequence of
     * <code>-flag value</code> pairs separated by blanks or underscores; a
     * null or blank string means "no options". The only supported flag is
     * <code>-s</code>, the first id handed out to a non-terminal.
     *
     * @param optionString the option string, may be null or empty
     * @throws MaltChainedException if a flag does not start with
     *         <code>-</code>, has no value, is unknown, or if the value of
     *         <code>-s</code> is not an integer
     */
    public void setOptions(String optionString) throws MaltChainedException {
        this.optionString = optionString;
        if (optionString == null || optionString.trim().length() == 0) {
            return;
        }
        final String[] argv;
        try {
            // '+' collapses repeated separators so they do not yield empty tokens.
            argv = optionString.trim().split("[_\\p{Blank}]+");
        } catch (PatternSyntaxException e) {
            throw new DataFormatException("Could not split the Negra writer option '"+optionString+"'. ", e);
        }
        for (int i = 0; i < argv.length; i += 2) {
            final String flag = argv[i];
            if (flag.length() == 0 || flag.charAt(0) != '-') {
                throw new DataFormatException("The argument flag should start with the following character '-', not with "
                        +(flag.length() == 0 ? "an empty string" : String.valueOf(flag.charAt(0))));
            }
            if (flag.length() < 2) {
                throw new DataFormatException("The argument flag '-' is not followed by an option letter. ");
            }
            if (i + 1 >= argv.length) {
                throw new DataFormatException("The last argument does not have any value. ");
            }
            final String value = argv[i + 1];
            switch (flag.charAt(1)) {
            case 's':
                try {
                    startIdOfNonTerminals = Integer.parseInt(value);
                } catch (NumberFormatException e) {
                    throw new DataFormatException("The Negra writer option -s must be an integer value. ", e);
                }
                break;
            default:
                // NOTE(review): LibsvmException is kept only so the thrown type stays
                // unchanged for existing callers; a writer-specific type would fit better.
                throw new LibsvmException("Unknown Negra writer parameter: '"+flag+"' with value '"+value+"'. ");
            }
        }
    }

    /**
     * Flushes and closes the output. Calling it on a writer that is not open
     * does nothing. The writer reference is dropped even when flushing or
     * closing fails, and the underlying writer is closed even when the flush
     * fails.
     *
     * @throws MaltChainedException if flushing or closing fails
     */
    public void close() throws MaltChainedException {
        if (writer == null) {
            return;
        }
        try {
            try {
                writer.flush();
            } finally {
                writer.close();
            }
        } catch (IOException e) {
            throw new DataFormatException("Could not close the output file. ", e);
        } finally {
            writer = null;
        }
    }
}