001package org.maltparser.core.syntaxgraph.writer;
002
003import java.io.BufferedWriter;
004import java.io.FileNotFoundException;
005import java.io.FileOutputStream;
006import java.io.IOException;
007import java.io.OutputStream;
008import java.io.OutputStreamWriter;
009import java.io.UnsupportedEncodingException;
010import java.util.SortedMap;
011import java.util.TreeMap;
012import java.util.regex.PatternSyntaxException;
013
014import org.maltparser.core.exception.MaltChainedException;
015
016import org.maltparser.core.helper.Util;
017import org.maltparser.core.io.dataformat.ColumnDescription;
018import org.maltparser.core.io.dataformat.DataFormatException;
019import org.maltparser.core.io.dataformat.DataFormatInstance;
020import org.maltparser.core.symbol.SymbolTableHandler;
021import org.maltparser.core.syntaxgraph.PhraseStructure;
022import org.maltparser.core.syntaxgraph.TokenStructure;
023import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
024import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
025import org.maltparser.core.syntaxgraph.node.TokenNode;
026import org.maltparser.core.syntaxgraph.reader.TigerXMLHeader;
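/**
 * Writes phrase structure graphs as TigerXML: each sentence becomes an
 * <code>s</code> element holding a <code>graph</code> with its
 * <code>terminals</code> and <code>nonterminals</code>.
 *
 * @author Johan Hall
 */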
032public class TigerXMLWriter implements SyntaxGraphWriter {
033        private enum RootHandling {
034                TALBANKEN, NORMAL  
035        };
036
037        private BufferedWriter writer;
038        private DataFormatInstance dataFormatInstance;
039        private String optionString;
040        private int sentenceCount;
041        private TigerXMLHeader header;
042//      private boolean hasWriteTigerXMLHeader = false;
043        private RootHandling rootHandling;
044        private String sentencePrefix = "s";
045        private StringBuilder sentenceID;
046        private StringBuilder tmpID;
047        private StringBuilder rootID;
048        private int START_ID_OF_NONTERMINALS = 500;
049        private boolean labeledTerminalID;
050        private String VROOT_SYMBOL = "VROOT";
051        private boolean useVROOT = false;
052//      private String fileName = null;
053//      private String charsetName = null;
054        private boolean closeStream = true;
055        
056        public TigerXMLWriter() { 
057                sentenceID = new StringBuilder();
058                tmpID = new StringBuilder();
059                rootID = new StringBuilder();
060                labeledTerminalID = false;
061        }
062        
063        public void open(String fileName, String charsetName) throws MaltChainedException {
064                try {
065//                      this.fileName = fileName;
066//                      this.charsetName = charsetName;
067                        open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName));
068                } catch (FileNotFoundException e) {
069                        throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
070                } catch (UnsupportedEncodingException e) {
071                        throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
072                }       
073        }
074        
075        public void open(OutputStream os, String charsetName) throws MaltChainedException {
076                try {
077                        if (os == System.out || os == System.err) {
078                                closeStream = false;
079                        }
080                        open(new OutputStreamWriter(os, charsetName));
081                } catch (UnsupportedEncodingException e) {
082                        throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
083                }
084        }
085        
086        private void open(OutputStreamWriter osw) throws MaltChainedException {
087                setWriter(new BufferedWriter(osw));
088                setSentenceCount(0);
089        }
090        
091        public void writeProlog() throws MaltChainedException { 
092//              if (fileName == null || charsetName == null) {
093                        writeHeader();
094//              }
095        }
096        
097        public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
098                if (syntaxGraph == null || dataFormatInstance == null) {
099                        return;
100                }
101                if (syntaxGraph.hasTokens()) {
102                        sentenceCount++;
103                        final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
104                        try {
105                                sentenceID.setLength(0);
106                                sentenceID.append(sentencePrefix);
107                                if (phraseStructure.getSentenceID() != 0) {
108                                        sentenceID.append(Integer.toString(phraseStructure.getSentenceID()));
109                                } else {
110                                        sentenceID.append(Integer.toString(sentenceCount));
111                                }
112                                writer.write("    <s id=\"");
113                                writer.write(sentenceID.toString());  
114                                writer.write("\">\n");
115                                
116                                setRootID(phraseStructure);
117                                writer.write("      <graph root=\"");
118                                writer.write(rootID.toString());
119                                writer.write("\" ");
120                                writer.write("discontinuous=\"");
121                                writer.write(Boolean.toString(!phraseStructure.isContinuous()));
122                                writer.write("\">\n");
123                                
124                                writeTerminals(phraseStructure);
125                                if (phraseStructure.nTokenNode() != 1 || rootHandling.equals(RootHandling.TALBANKEN)) {
126                                        writeNonTerminals(phraseStructure);
127                                } else {
128                                        writer.write("        <nonterminals/>\n");
129                                }
130                                writer.write("      </graph>\n");
131                                writer.write("    </s>\n");
132                        } catch (IOException e) {
133                                throw new DataFormatException("The TigerXML writer could not write to file. ", e);
134                        }
135                }
136        }
137        
138        private void setRootID(PhraseStructure phraseStructure) throws MaltChainedException {
139                useVROOT = false;
140                PhraseStructureNode root = phraseStructure.getPhraseStructureRoot();
141                final SymbolTableHandler symbolTables = phraseStructure.getSymbolTables();
142                for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) {
143                        if (root.hasLabel(symbolTables.getSymbolTable(column.getName())) && root.getLabelSymbol(symbolTables.getSymbolTable(column.getName())).equals(VROOT_SYMBOL)) {
144                                useVROOT = true;
145                                break;
146                        }
147                }
148                if (useVROOT) {
149                        rootID.setLength(0);
150                        rootID.append(sentenceID);
151                        rootID.append('_');
152                        rootID.append(VROOT_SYMBOL);
153                } else if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 && !root.isLabeled()) {
154                        rootID.setLength(0);
155                        rootID.append(sentenceID);
156                        rootID.append("_1");
157                } else {
158                        rootID.setLength(0);
159                        rootID.append(sentenceID);
160                        rootID.append('_');
161//                      if (rootHandling.equals(RootHandling.NORMAL)) { 
162                                rootID.append(Integer.toString(START_ID_OF_NONTERMINALS+phraseStructure.nNonTerminals()));
163//                      } else if (rootHandling.equals(RootHandling.TALBANKEN)) {
164//                              rootID.append(Integer.toString(START_ID_OF_NONTERMINALS+1));
165//                      }
166                }
167
168        }
169        
170        public void writeEpilog() throws MaltChainedException { 
171                writeTail();
172        }
173        
174        public BufferedWriter getWriter() {
175                return writer;
176        }
177
178        public void setWriter(BufferedWriter writer) {
179                this.writer = writer;
180        }
181        
182        public void close() throws MaltChainedException {
183                try {
184                        if (writer != null) {
185                                writer.flush();
186                                if (closeStream) {
187                                        writer.close();
188                                }
189                                writer = null;
190                        }
191                }   catch (IOException e) {
192                        throw new DataFormatException("Could not close the output file. ", e);
193                } 
194        }
195        
196        private void writeHeader() throws MaltChainedException {
197                try {
198                        if (header == null) {
199                                header = new TigerXMLHeader();
200                        }
201                        writer.write(header.toTigerXML());
202//                      hasWriteTigerXMLHeader = true;
203                } catch (IOException e) {
204                        throw new DataFormatException("The TigerXML writer could not write to file. ", e);
205                }
206        }
207        
208        
209        private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
210                try {
211                        writer.write("        <terminals>\n");
212                        for (int index : phraseStructure.getTokenIndices()) {
213                                final PhraseStructureNode t = phraseStructure.getTokenNode(index);
214                                writer.write("          <t ");
215                                if (!labeledTerminalID) {
216                                        tmpID.setLength(0);
217                                        tmpID.append(sentenceID);
218                                        tmpID.append('_');
219                                        tmpID.append(Integer.toString(t.getIndex()));
220                                        writer.write("id=\"");writer.write(tmpID.toString());writer.write("\" ");
221                                }
222                                
223                                for (ColumnDescription column : dataFormatInstance.getInputColumnDescriptionSet()) {
224                                        writer.write(column.getName().toLowerCase());
225                                        writer.write("=\"");
226                                        writer.write(Util.xmlEscape(t.getLabelSymbol(phraseStructure.getSymbolTables().getSymbolTable(column.getName()))));
227                                        writer.write("\" ");    
228                                }
229                                writer.write("/>\n");
230                        }
231                        writer.write("        </terminals>\n");
232                } catch (IOException e) {
233                        throw new DataFormatException("The TigerXML writer is not able to write. ", e);
234                }
235        }
236        
237        public void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
238                try {
239                        SortedMap<Integer,Integer> heights = new TreeMap<Integer,Integer>();
240                        for (int index : phraseStructure.getNonTerminalIndices()) {
241                                heights.put(index, ((NonTerminalNode)phraseStructure.getNonTerminalNode(index)).getHeight());
242                        }
243                        writer.write("        <nonterminals>\n");
244                        boolean done = false;
245                        int h = 1;
246                        while (!done) {
247                                done = true;
248                                for (int index : phraseStructure.getNonTerminalIndices()) {
249                                        if (heights.get(index) == h) {
250                                                NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
251                                                tmpID.setLength(0);
252                                                tmpID.append(sentenceID);
253                                                tmpID.append('_');
254                                                tmpID.append(Integer.toString(nt.getIndex()+START_ID_OF_NONTERMINALS-1));
255                                                writeNonTerminal(phraseStructure.getSymbolTables(), nt, tmpID.toString());
256                                                done = false;
257                                        }
258                                }
259                                h++;
260                        }
261                        
262                        writeNonTerminal(phraseStructure.getSymbolTables(), (NonTerminalNode)phraseStructure.getPhraseStructureRoot(),rootID.toString());
263                        writer.write("        </nonterminals>\n");
264                } catch (IOException e) {
265                        throw new DataFormatException("The TigerXML writer is not able to write. ", e);
266                }
267        }
268        
269        public void writeNonTerminal(SymbolTableHandler symbolTables, NonTerminalNode nt, String id) throws MaltChainedException {
270                try {
271                        writer.write("          <nt");
272                        writer.write(" id=\"");writer.write(id);writer.write("\" ");
273                        for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) {
274                                if (nt.hasLabel(symbolTables.getSymbolTable(column.getName()))) {
275                                        writer.write(column.getName().toLowerCase());
276                                        writer.write("=");
277                                        writer.write("\"");
278                                        writer.write(Util.xmlEscape(nt.getLabelSymbol(symbolTables.getSymbolTable(column.getName()))));
279                                        writer.write("\" ");
280                                }
281                        }
282                        writer.write(">\n");
283                        
284                        for (int i = 0, n = nt.nChildren(); i < n; i++) {
285                                PhraseStructureNode child = nt.getChild(i); 
286                                writer.write("            <edge ");
287
288                                for (ColumnDescription column : dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptionSet()) {
289                                        if (child.hasParentEdgeLabel(symbolTables.getSymbolTable(column.getName()))) {
290                                                writer.write(column.getName().toLowerCase());
291                                                writer.write("=\"");
292                                                writer.write(Util.xmlEscape(child.getParentEdgeLabelSymbol(symbolTables.getSymbolTable(column.getName()))));
293                                                writer.write("\" ");
294                                        }
295                                }
296                                if (child instanceof TokenNode) {
297                                        if (!labeledTerminalID) {
298                                                tmpID.setLength(0);
299                                                tmpID.append(sentenceID);
300                                                tmpID.append('_');
301                                                tmpID.append(Integer.toString(child.getIndex()));
302                                                writer.write(" idref=\"");writer.write(tmpID.toString());writer.write("\"");
303                                        } else {
304                                                writer.write(" idref=\"");writer.write(child.getLabelSymbol(symbolTables.getSymbolTable("ID")));writer.write("\"");
305                                        }
306                                        
307                                } else {
308                                        tmpID.setLength(0);
309                                        tmpID.append(sentenceID);
310                                        tmpID.append('_');
311                                        tmpID.append(Integer.toString(child.getIndex()+START_ID_OF_NONTERMINALS-1));
312                                        writer.write(" idref=\"");writer.write(tmpID.toString());writer.write("\"");
313                                }
314                                writer.write(" />\n");
315                        }
316                        writer.write("          </nt>\n");
317                } catch (IOException e) {
318                        throw new DataFormatException("The TigerXML writer is not able to write. ", e);
319                }
320        }
321
322        
323        private void writeTail() throws MaltChainedException {
324                try {
325                        writer.write("  </body>\n");
326                        writer.write("</corpus>\n");
327                        writer.flush();
328//                      if (fileName != null && charsetName != null) {
329//                              writer.close();
330//                              writer = null;
331//                              BufferedWriter headerWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName+".header"),charsetName));
332//                              if (header == null) {
333//                                      header = new TigerXMLHeader(dataFormatInstance.getSymbolTables());
334//                              }
335//                              
336//                              headerWriter.write(header.toTigerXML());
337//                              headerWriter.flush();
338//                              headerWriter.close();
339//                      }
340                } catch (IOException e) {
341                        throw new DataFormatException("The TigerXML writer is not able to write. ", e);
342                }
343        }
344        
345        public int getSentenceCount() {
346                return sentenceCount;
347        }
348
349        public void setSentenceCount(int sentenceCount) {
350                this.sentenceCount = sentenceCount;
351        }
352        
353        public DataFormatInstance getDataFormatInstance() {
354                return dataFormatInstance;
355        }
356
357        public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
358                this.dataFormatInstance = dataFormatInstance;
359                labeledTerminalID = (dataFormatInstance.getInputColumnDescriptions().containsKey("id") || dataFormatInstance.getInputColumnDescriptions().containsKey("ID"));
360        }
361
362        public String getOptions() {
363                return optionString;
364        }
365        
366        public void setOptions(String optionString) throws MaltChainedException {
367                this.optionString = optionString;
368                rootHandling = RootHandling.NORMAL;
369
370                String[] argv;
371                try {
372                        argv = optionString.split("[_\\p{Blank}]");
373                } catch (PatternSyntaxException e) {
374                        throw new DataFormatException("Could not split the TigerXML writer option '"+optionString+"'. ", e);
375                }
376                for (int i=0; i < argv.length-1; i++) {
377                        if(argv[i].charAt(0) != '-') {
378                                throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
379                        }
380                        if(++i>=argv.length) {
381                                throw new DataFormatException("The last argument does not have any value. ");
382                        }
383                        switch(argv[i-1].charAt(1)) {
384                        case 'r': 
385                                if (argv[i].equals("n")) {
386                                        rootHandling = RootHandling.NORMAL;
387                                } else if (argv[i].equals("tal")) {
388                                        rootHandling = RootHandling.TALBANKEN;
389                                }
390                                break;
391                        case 's': 
392                                try {
393                                        START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
394                                } catch (NumberFormatException e){
395                                        throw new MaltChainedException("The TigerXML writer option -s must be an integer value. ");
396                                }
397                                break;
398                        case 'v': 
399                                VROOT_SYMBOL = argv[i];
400                                break;  
401                        default:
402                                throw new DataFormatException("Unknown TigerXML writer option: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");            
403                        }
404                }       
405        }
406}