001 package org.maltparser.core.syntaxgraph.writer;
002
003 import java.io.BufferedWriter;
004 import java.io.FileNotFoundException;
005 import java.io.FileOutputStream;
006 import java.io.IOException;
007 import java.io.OutputStream;
008 import java.io.OutputStreamWriter;
009 import java.io.UnsupportedEncodingException;
010 import java.util.SortedMap;
011 import java.util.TreeMap;
012 import java.util.regex.PatternSyntaxException;
013
014 import org.maltparser.core.exception.MaltChainedException;
015
016 import org.maltparser.core.helper.Util;
017 import org.maltparser.core.io.dataformat.ColumnDescription;
018 import org.maltparser.core.io.dataformat.DataFormatException;
019 import org.maltparser.core.io.dataformat.DataFormatInstance;
020 import org.maltparser.core.syntaxgraph.PhraseStructure;
021 import org.maltparser.core.syntaxgraph.TokenStructure;
022 import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
023 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
024 import org.maltparser.core.syntaxgraph.node.TokenNode;
025 import org.maltparser.core.syntaxgraph.reader.TigerXMLHeader;
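/**
 * Writes phrase structure graphs as TigerXML: a header, one {@code <s>} element
 * per sentence (terminals followed by non-terminals), and a closing tail.
 *
 * @author Johan Hall
 */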
031 public class TigerXMLWriter implements SyntaxGraphWriter {
032 private enum RootHandling {
033 TALBANKEN, NORMAL
034 };
035
036 private BufferedWriter writer;
037 private DataFormatInstance dataFormatInstance;
038 private String optionString;
039 private int sentenceCount;
040 private TigerXMLHeader header;
041 // private boolean hasWriteTigerXMLHeader = false;
042 private RootHandling rootHandling;
043 private String sentencePrefix = "s";
044 private StringBuilder sentenceID;
045 private StringBuilder tmpID;
046 private StringBuilder rootID;
047 private int START_ID_OF_NONTERMINALS = 500;
048 private boolean labeledTerminalID;
049 private String VROOT_SYMBOL = "VROOT";
050 private boolean useVROOT = false;
051 // private String fileName = null;
052 // private String charsetName = null;
053 private boolean closeStream = true;
054
/**
 * Creates a writer with empty ID buffers. A data format instance and an
 * output target must be supplied before anything can be written.
 */
public TigerXMLWriter() {
    this.labeledTerminalID = false;
    this.sentenceID = new StringBuilder();
    this.tmpID = new StringBuilder();
    this.rootID = new StringBuilder();
}
061
062 public void open(String fileName, String charsetName) throws MaltChainedException {
063 try {
064 // this.fileName = fileName;
065 // this.charsetName = charsetName;
066 open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName));
067 } catch (FileNotFoundException e) {
068 throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
069 } catch (UnsupportedEncodingException e) {
070 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
071 }
072 }
073
074 public void open(OutputStream os, String charsetName) throws MaltChainedException {
075 try {
076 if (os == System.out || os == System.err) {
077 closeStream = false;
078 }
079 open(new OutputStreamWriter(os, charsetName));
080 } catch (UnsupportedEncodingException e) {
081 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
082 }
083 }
084
085 private void open(OutputStreamWriter osw) throws MaltChainedException {
086 setWriter(new BufferedWriter(osw));
087 setSentenceCount(0);
088 }
089
090 public void writeProlog() throws MaltChainedException {
091 // if (fileName == null || charsetName == null) {
092 writeHeader();
093 // }
094 }
095
096 public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
097 if (syntaxGraph == null || dataFormatInstance == null) {
098 return;
099 }
100 if (syntaxGraph.hasTokens()) {
101 sentenceCount++;
102 final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
103 try {
104 sentenceID.setLength(0);
105 sentenceID.append(sentencePrefix);
106 if (phraseStructure.getSentenceID() != 0) {
107 sentenceID.append(Integer.toString(phraseStructure.getSentenceID()));
108 } else {
109 sentenceID.append(Integer.toString(sentenceCount));
110 }
111 writer.write(" <s id=\"");
112 writer.write(sentenceID.toString());
113 writer.write("\">\n");
114
115 setRootID(phraseStructure);
116 writer.write(" <graph root=\"");
117 writer.write(rootID.toString());
118 writer.write("\" ");
119 writer.write("discontinuous=\"");
120 writer.write(Boolean.toString(!phraseStructure.isContinuous()));
121 writer.write("\">\n");
122
123 writeTerminals(phraseStructure);
124 if (phraseStructure.nTokenNode() != 1 || rootHandling.equals(RootHandling.TALBANKEN)) {
125 writeNonTerminals(phraseStructure);
126 } else {
127 writer.write(" <nonterminals/>\n");
128 }
129 writer.write(" </graph>\n");
130 writer.write(" </s>\n");
131 } catch (IOException e) {
132 throw new DataFormatException("The TigerXML writer could not write to file. ", e);
133 }
134 }
135 }
136
137 private void setRootID(PhraseStructure phraseStructure) throws MaltChainedException {
138 useVROOT = false;
139 PhraseStructureNode root = phraseStructure.getPhraseStructureRoot();
140 for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) {
141 if (root.hasLabel(column.getSymbolTable()) && root.getLabelSymbol(column.getSymbolTable()).equals(VROOT_SYMBOL)) {
142 useVROOT = true;
143 break;
144 }
145 }
146 if (useVROOT) {
147 rootID.setLength(0);
148 rootID.append(sentenceID);
149 rootID.append('_');
150 rootID.append(VROOT_SYMBOL);
151 } else if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 && !root.isLabeled()) {
152 rootID.setLength(0);
153 rootID.append(sentenceID);
154 rootID.append("_1");
155 } else {
156 rootID.setLength(0);
157 rootID.append(sentenceID);
158 rootID.append('_');
159 // if (rootHandling.equals(RootHandling.NORMAL)) {
160 rootID.append(Integer.toString(START_ID_OF_NONTERMINALS+phraseStructure.nNonTerminals()));
161 // } else if (rootHandling.equals(RootHandling.TALBANKEN)) {
162 // rootID.append(Integer.toString(START_ID_OF_NONTERMINALS+1));
163 // }
164 }
165
166 }
167
168 public void writeEpilog() throws MaltChainedException {
169 writeTail();
170 }
171
172 public BufferedWriter getWriter() {
173 return writer;
174 }
175
176 public void setWriter(BufferedWriter writer) {
177 this.writer = writer;
178 }
179
180 public void close() throws MaltChainedException {
181 try {
182 if (writer != null) {
183 writer.flush();
184 if (closeStream) {
185 writer.close();
186 }
187 writer = null;
188 }
189 } catch (IOException e) {
190 throw new DataFormatException("Could not close the output file. ", e);
191 }
192 }
193
194 private void writeHeader() throws MaltChainedException {
195 try {
196 if (header == null) {
197 header = new TigerXMLHeader(dataFormatInstance.getSymbolTables());
198 }
199 writer.write(header.toTigerXML());
200 // hasWriteTigerXMLHeader = true;
201 } catch (IOException e) {
202 throw new DataFormatException("The TigerXML writer could not write to file. ", e);
203 }
204 }
205
206
207 private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
208 try {
209 writer.write(" <terminals>\n");
210 for (int index : phraseStructure.getTokenIndices()) {
211 final PhraseStructureNode t = phraseStructure.getTokenNode(index);
212 writer.write(" <t ");
213 if (!labeledTerminalID) {
214 tmpID.setLength(0);
215 tmpID.append(sentenceID);
216 tmpID.append('_');
217 tmpID.append(Integer.toString(t.getIndex()));
218 writer.write("id=\"");writer.write(tmpID.toString());writer.write("\" ");
219 }
220
221 for (ColumnDescription column : dataFormatInstance.getInputColumnDescriptionSet()) {
222 writer.write(column.getName().toLowerCase());
223 writer.write("=\"");
224 writer.write(Util.xmlEscape(t.getLabelSymbol(column.getSymbolTable())));
225 writer.write("\" ");
226 }
227 writer.write("/>\n");
228 }
229 writer.write(" </terminals>\n");
230 } catch (IOException e) {
231 throw new DataFormatException("The TigerXML writer is not able to write. ", e);
232 }
233 }
234
235 public void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
236 try {
237 SortedMap<Integer,Integer> heights = new TreeMap<Integer,Integer>();
238 for (int index : phraseStructure.getNonTerminalIndices()) {
239 heights.put(index, ((NonTerminalNode)phraseStructure.getNonTerminalNode(index)).getHeight());
240 }
241 writer.write(" <nonterminals>\n");
242 boolean done = false;
243 int h = 1;
244 while (!done) {
245 done = true;
246 for (int index : phraseStructure.getNonTerminalIndices()) {
247 if (heights.get(index) == h) {
248 NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
249 tmpID.setLength(0);
250 tmpID.append(sentenceID);
251 tmpID.append('_');
252 tmpID.append(Integer.toString(nt.getIndex()+START_ID_OF_NONTERMINALS-1));
253 writeNonTerminal(nt, tmpID.toString());
254 done = false;
255 }
256 }
257 h++;
258 }
259
260 writeNonTerminal((NonTerminalNode)phraseStructure.getPhraseStructureRoot(),rootID.toString());
261 writer.write(" </nonterminals>\n");
262 } catch (IOException e) {
263 throw new DataFormatException("The TigerXML writer is not able to write. ", e);
264 }
265 }
266
267 public void writeNonTerminal(NonTerminalNode nt, String id) throws MaltChainedException {
268 try {
269 writer.write(" <nt");
270 writer.write(" id=\"");writer.write(id);writer.write("\" ");
271 for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) {
272 if (nt.hasLabel(column.getSymbolTable())) {
273 writer.write(column.getName().toLowerCase());
274 writer.write("=");
275 writer.write("\"");
276 writer.write(Util.xmlEscape(nt.getLabelSymbol(column.getSymbolTable())));
277 writer.write("\" ");
278 }
279 }
280 writer.write(">\n");
281
282 for (int i = 0, n = nt.nChildren(); i < n; i++) {
283 PhraseStructureNode child = nt.getChild(i);
284 writer.write(" <edge ");
285
286 for (ColumnDescription column : dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptionSet()) {
287 if (child.hasParentEdgeLabel(column.getSymbolTable())) {
288 writer.write(column.getName().toLowerCase());
289 writer.write("=\"");
290 writer.write(Util.xmlEscape(child.getParentEdgeLabelSymbol(column.getSymbolTable())));
291 writer.write("\" ");
292 }
293 }
294 if (child instanceof TokenNode) {
295 if (!labeledTerminalID) {
296 tmpID.setLength(0);
297 tmpID.append(sentenceID);
298 tmpID.append('_');
299 tmpID.append(Integer.toString(child.getIndex()));
300 writer.write(" idref=\"");writer.write(tmpID.toString());writer.write("\"");
301 } else {
302 writer.write(" idref=\"");writer.write(child.getLabelSymbol(dataFormatInstance.getInputSymbolTables().get("ID")));writer.write("\"");
303 }
304
305 } else {
306 tmpID.setLength(0);
307 tmpID.append(sentenceID);
308 tmpID.append('_');
309 tmpID.append(Integer.toString(child.getIndex()+START_ID_OF_NONTERMINALS-1));
310 writer.write(" idref=\"");writer.write(tmpID.toString());writer.write("\"");
311 }
312 writer.write(" />\n");
313 }
314 writer.write(" </nt>\n");
315 } catch (IOException e) {
316 throw new DataFormatException("The TigerXML writer is not able to write. ", e);
317 }
318 }
319
320
321 private void writeTail() throws MaltChainedException {
322 try {
323 writer.write(" </body>\n");
324 writer.write("</corpus>\n");
325 writer.flush();
326 // if (fileName != null && charsetName != null) {
327 // writer.close();
328 // writer = null;
329 // BufferedWriter headerWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName+".header"),charsetName));
330 // if (header == null) {
331 // header = new TigerXMLHeader(dataFormatInstance.getSymbolTables());
332 // }
333 //
334 // headerWriter.write(header.toTigerXML());
335 // headerWriter.flush();
336 // headerWriter.close();
337 // }
338 } catch (IOException e) {
339 throw new DataFormatException("The TigerXML writer is not able to write. ", e);
340 }
341 }
342
343 public int getSentenceCount() {
344 return sentenceCount;
345 }
346
347 public void setSentenceCount(int sentenceCount) {
348 this.sentenceCount = sentenceCount;
349 }
350
351 public DataFormatInstance getDataFormatInstance() {
352 return dataFormatInstance;
353 }
354
355 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
356 this.dataFormatInstance = dataFormatInstance;
357 labeledTerminalID = (dataFormatInstance.getInputColumnDescriptions().containsKey("id") || dataFormatInstance.getInputColumnDescriptions().containsKey("ID"));
358 }
359
360 public String getOptions() {
361 return optionString;
362 }
363
364 public void setOptions(String optionString) throws MaltChainedException {
365 this.optionString = optionString;
366 rootHandling = RootHandling.NORMAL;
367
368 String[] argv;
369 try {
370 argv = optionString.split("[_\\p{Blank}]");
371 } catch (PatternSyntaxException e) {
372 throw new DataFormatException("Could not split the TigerXML writer option '"+optionString+"'. ", e);
373 }
374 for (int i=0; i < argv.length-1; i++) {
375 if(argv[i].charAt(0) != '-') {
376 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
377 }
378 if(++i>=argv.length) {
379 throw new DataFormatException("The last argument does not have any value. ");
380 }
381 switch(argv[i-1].charAt(1)) {
382 case 'r':
383 if (argv[i].equals("n")) {
384 rootHandling = RootHandling.NORMAL;
385 } else if (argv[i].equals("tal")) {
386 rootHandling = RootHandling.TALBANKEN;
387 }
388 break;
389 case 's':
390 try {
391 START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
392 } catch (NumberFormatException e){
393 throw new MaltChainedException("The TigerXML writer option -s must be an integer value. ");
394 }
395 break;
396 case 'v':
397 VROOT_SYMBOL = argv[i];
398 break;
399 default:
400 throw new DataFormatException("Unknown TigerXML writer option: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");
401 }
402 }
403 }
404 }