001    package org.maltparser.core.syntaxgraph.writer;
002    
003    import java.io.BufferedWriter;
004    import java.io.FileNotFoundException;
005    import java.io.FileOutputStream;
006    import java.io.IOException;
007    import java.io.OutputStream;
008    import java.io.OutputStreamWriter;
009    import java.io.UnsupportedEncodingException;
010    import java.util.SortedMap;
011    import java.util.TreeMap;
012    import java.util.regex.PatternSyntaxException;
013    
014    import org.maltparser.core.exception.MaltChainedException;
015    
016    import org.maltparser.core.helper.Util;
017    import org.maltparser.core.io.dataformat.ColumnDescription;
018    import org.maltparser.core.io.dataformat.DataFormatException;
019    import org.maltparser.core.io.dataformat.DataFormatInstance;
020    import org.maltparser.core.syntaxgraph.PhraseStructure;
021    import org.maltparser.core.syntaxgraph.TokenStructure;
022    import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
023    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
024    import org.maltparser.core.syntaxgraph.node.TokenNode;
025    import org.maltparser.core.syntaxgraph.reader.TigerXMLHeader;
026    /**
027    *
028    *
029    * @author Johan Hall
030    */
031    public class TigerXMLWriter implements SyntaxGraphWriter {
032            private enum RootHandling {
033                    TALBANKEN, NORMAL  
034            };
035    
036            private BufferedWriter writer;
037            private DataFormatInstance dataFormatInstance;
038            private String optionString;
039            private int sentenceCount;
040            private TigerXMLHeader header;
041    //      private boolean hasWriteTigerXMLHeader = false;
042            private RootHandling rootHandling;
043            private String sentencePrefix = "s";
044            private StringBuilder sentenceID;
045            private StringBuilder tmpID;
046            private StringBuilder rootID;
047            private int START_ID_OF_NONTERMINALS = 500;
048            private boolean labeledTerminalID;
049            private String VROOT_SYMBOL = "VROOT";
050            private boolean useVROOT = false;
051    //      private String fileName = null;
052    //      private String charsetName = null;
053            private boolean closeStream = true;
054            
055            public TigerXMLWriter() { 
056                    sentenceID = new StringBuilder();
057                    tmpID = new StringBuilder();
058                    rootID = new StringBuilder();
059                    labeledTerminalID = false;
060            }
061            
062            public void open(String fileName, String charsetName) throws MaltChainedException {
063                    try {
064    //                      this.fileName = fileName;
065    //                      this.charsetName = charsetName;
066                            open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName));
067                    } catch (FileNotFoundException e) {
068                            throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
069                    } catch (UnsupportedEncodingException e) {
070                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
071                    }       
072            }
073            
074            public void open(OutputStream os, String charsetName) throws MaltChainedException {
075                    try {
076                            if (os == System.out || os == System.err) {
077                                    closeStream = false;
078                            }
079                            open(new OutputStreamWriter(os, charsetName));
080                    } catch (UnsupportedEncodingException e) {
081                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
082                    }
083            }
084            
085            private void open(OutputStreamWriter osw) throws MaltChainedException {
086                    setWriter(new BufferedWriter(osw));
087                    setSentenceCount(0);
088            }
089            
090            public void writeProlog() throws MaltChainedException { 
091    //              if (fileName == null || charsetName == null) {
092                            writeHeader();
093    //              }
094            }
095            
096            public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
097                    if (syntaxGraph == null || dataFormatInstance == null) {
098                            return;
099                    }
100                    if (syntaxGraph.hasTokens()) {
101                            sentenceCount++;
102                            final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
103                            try {
104                                    sentenceID.setLength(0);
105                                    sentenceID.append(sentencePrefix);
106                                    if (phraseStructure.getSentenceID() != 0) {
107                                            sentenceID.append(Integer.toString(phraseStructure.getSentenceID()));
108                                    } else {
109                                            sentenceID.append(Integer.toString(sentenceCount));
110                                    }
111                                    writer.write("    <s id=\"");
112                                    writer.write(sentenceID.toString());  
113                                    writer.write("\">\n");
114                                    
115                                    setRootID(phraseStructure);
116                                    writer.write("      <graph root=\"");
117                                    writer.write(rootID.toString());
118                                    writer.write("\" ");
119                                    writer.write("discontinuous=\"");
120                                    writer.write(Boolean.toString(!phraseStructure.isContinuous()));
121                                    writer.write("\">\n");
122                                    
123                                    writeTerminals(phraseStructure);
124                                    if (phraseStructure.nTokenNode() != 1 || rootHandling.equals(RootHandling.TALBANKEN)) {
125                                            writeNonTerminals(phraseStructure);
126                                    } else {
127                                            writer.write("        <nonterminals/>\n");
128                                    }
129                                    writer.write("      </graph>\n");
130                                    writer.write("    </s>\n");
131                            } catch (IOException e) {
132                                    throw new DataFormatException("The TigerXML writer could not write to file. ", e);
133                            }
134                    }
135            }
136            
137            private void setRootID(PhraseStructure phraseStructure) throws MaltChainedException {
138                    useVROOT = false;
139                    PhraseStructureNode root = phraseStructure.getPhraseStructureRoot();
140                    for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) {
141                            if (root.hasLabel(column.getSymbolTable()) && root.getLabelSymbol(column.getSymbolTable()).equals(VROOT_SYMBOL)) {
142                                    useVROOT = true;
143                                    break;
144                            }
145                    }
146                    if (useVROOT) {
147                            rootID.setLength(0);
148                            rootID.append(sentenceID);
149                            rootID.append('_');
150                            rootID.append(VROOT_SYMBOL);
151                    } else if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 && !root.isLabeled()) {
152                            rootID.setLength(0);
153                            rootID.append(sentenceID);
154                            rootID.append("_1");
155                    } else {
156                            rootID.setLength(0);
157                            rootID.append(sentenceID);
158                            rootID.append('_');
159    //                      if (rootHandling.equals(RootHandling.NORMAL)) { 
160                                    rootID.append(Integer.toString(START_ID_OF_NONTERMINALS+phraseStructure.nNonTerminals()));
161    //                      } else if (rootHandling.equals(RootHandling.TALBANKEN)) {
162    //                              rootID.append(Integer.toString(START_ID_OF_NONTERMINALS+1));
163    //                      }
164                    }
165    
166            }
167            
168            public void writeEpilog() throws MaltChainedException { 
169                    writeTail();
170            }
171            
172            public BufferedWriter getWriter() {
173                    return writer;
174            }
175    
176            public void setWriter(BufferedWriter writer) {
177                    this.writer = writer;
178            }
179            
180            public void close() throws MaltChainedException {
181                    try {
182                            if (writer != null) {
183                                    writer.flush();
184                                    if (closeStream) {
185                                            writer.close();
186                                    }
187                                    writer = null;
188                            }
189                    }   catch (IOException e) {
190                            throw new DataFormatException("Could not close the output file. ", e);
191                    } 
192            }
193            
194            private void writeHeader() throws MaltChainedException {
195                    try {
196                            if (header == null) {
197                                    header = new TigerXMLHeader(dataFormatInstance.getSymbolTables());
198                            }
199                            writer.write(header.toTigerXML());
200    //                      hasWriteTigerXMLHeader = true;
201                    } catch (IOException e) {
202                            throw new DataFormatException("The TigerXML writer could not write to file. ", e);
203                    }
204            }
205            
206            
207            private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
208                    try {
209                            writer.write("        <terminals>\n");
210                            for (int index : phraseStructure.getTokenIndices()) {
211                                    final PhraseStructureNode t = phraseStructure.getTokenNode(index);
212                                    writer.write("          <t ");
213                                    if (!labeledTerminalID) {
214                                            tmpID.setLength(0);
215                                            tmpID.append(sentenceID);
216                                            tmpID.append('_');
217                                            tmpID.append(Integer.toString(t.getIndex()));
218                                            writer.write("id=\"");writer.write(tmpID.toString());writer.write("\" ");
219                                    }
220                                    
221                                    for (ColumnDescription column : dataFormatInstance.getInputColumnDescriptionSet()) {
222                                            writer.write(column.getName().toLowerCase());
223                                            writer.write("=\"");
224                                            writer.write(Util.xmlEscape(t.getLabelSymbol(column.getSymbolTable())));
225                                            writer.write("\" ");    
226                                    }
227                                    writer.write("/>\n");
228                            }
229                            writer.write("        </terminals>\n");
230                    } catch (IOException e) {
231                            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
232                    }
233            }
234            
235            public void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
236                    try {
237                            SortedMap<Integer,Integer> heights = new TreeMap<Integer,Integer>();
238                            for (int index : phraseStructure.getNonTerminalIndices()) {
239                                    heights.put(index, ((NonTerminalNode)phraseStructure.getNonTerminalNode(index)).getHeight());
240                            }
241                            writer.write("        <nonterminals>\n");
242                            boolean done = false;
243                            int h = 1;
244                            while (!done) {
245                                    done = true;
246                                    for (int index : phraseStructure.getNonTerminalIndices()) {
247                                            if (heights.get(index) == h) {
248                                                    NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
249                                                    tmpID.setLength(0);
250                                                    tmpID.append(sentenceID);
251                                                    tmpID.append('_');
252                                                    tmpID.append(Integer.toString(nt.getIndex()+START_ID_OF_NONTERMINALS-1));
253                                                    writeNonTerminal(nt, tmpID.toString());
254                                                    done = false;
255                                            }
256                                    }
257                                    h++;
258                            }
259                            
260                            writeNonTerminal((NonTerminalNode)phraseStructure.getPhraseStructureRoot(),rootID.toString());
261                            writer.write("        </nonterminals>\n");
262                    } catch (IOException e) {
263                            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
264                    }
265            }
266            
267            public void writeNonTerminal(NonTerminalNode nt, String id) throws MaltChainedException {
268                    try {
269                            writer.write("          <nt");
270                            writer.write(" id=\"");writer.write(id);writer.write("\" ");
271                            for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) {
272                                    if (nt.hasLabel(column.getSymbolTable())) {
273                                            writer.write(column.getName().toLowerCase());
274                                            writer.write("=");
275                                            writer.write("\"");
276                                            writer.write(Util.xmlEscape(nt.getLabelSymbol(column.getSymbolTable())));
277                                            writer.write("\" ");
278                                    }
279                            }
280                            writer.write(">\n");
281                            
282                            for (int i = 0, n = nt.nChildren(); i < n; i++) {
283                                    PhraseStructureNode child = nt.getChild(i); 
284                                    writer.write("            <edge ");
285    
286                                    for (ColumnDescription column : dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptionSet()) {
287                                            if (child.hasParentEdgeLabel(column.getSymbolTable())) {
288                                                    writer.write(column.getName().toLowerCase());
289                                                    writer.write("=\"");
290                                                    writer.write(Util.xmlEscape(child.getParentEdgeLabelSymbol(column.getSymbolTable())));
291                                                    writer.write("\" ");
292                                            }
293                                    }
294                                    if (child instanceof TokenNode) {
295                                            if (!labeledTerminalID) {
296                                                    tmpID.setLength(0);
297                                                    tmpID.append(sentenceID);
298                                                    tmpID.append('_');
299                                                    tmpID.append(Integer.toString(child.getIndex()));
300                                                    writer.write(" idref=\"");writer.write(tmpID.toString());writer.write("\"");
301                                            } else {
302                                                    writer.write(" idref=\"");writer.write(child.getLabelSymbol(dataFormatInstance.getInputSymbolTables().get("ID")));writer.write("\"");
303                                            }
304                                            
305                                    } else {
306                                            tmpID.setLength(0);
307                                            tmpID.append(sentenceID);
308                                            tmpID.append('_');
309                                            tmpID.append(Integer.toString(child.getIndex()+START_ID_OF_NONTERMINALS-1));
310                                            writer.write(" idref=\"");writer.write(tmpID.toString());writer.write("\"");
311                                    }
312                                    writer.write(" />\n");
313                            }
314                            writer.write("          </nt>\n");
315                    } catch (IOException e) {
316                            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
317                    }
318            }
319    
320            
321            private void writeTail() throws MaltChainedException {
322                    try {
323                            writer.write("  </body>\n");
324                            writer.write("</corpus>\n");
325                            writer.flush();
326    //                      if (fileName != null && charsetName != null) {
327    //                              writer.close();
328    //                              writer = null;
329    //                              BufferedWriter headerWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName+".header"),charsetName));
330    //                              if (header == null) {
331    //                                      header = new TigerXMLHeader(dataFormatInstance.getSymbolTables());
332    //                              }
333    //                              
334    //                              headerWriter.write(header.toTigerXML());
335    //                              headerWriter.flush();
336    //                              headerWriter.close();
337    //                      }
338                    } catch (IOException e) {
339                            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
340                    }
341            }
342            
343            public int getSentenceCount() {
344                    return sentenceCount;
345            }
346    
347            public void setSentenceCount(int sentenceCount) {
348                    this.sentenceCount = sentenceCount;
349            }
350            
351            public DataFormatInstance getDataFormatInstance() {
352                    return dataFormatInstance;
353            }
354    
355            public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
356                    this.dataFormatInstance = dataFormatInstance;
357                    labeledTerminalID = (dataFormatInstance.getInputColumnDescriptions().containsKey("id") || dataFormatInstance.getInputColumnDescriptions().containsKey("ID"));
358            }
359    
360            public String getOptions() {
361                    return optionString;
362            }
363            
364            public void setOptions(String optionString) throws MaltChainedException {
365                    this.optionString = optionString;
366                    rootHandling = RootHandling.NORMAL;
367    
368                    String[] argv;
369                    try {
370                            argv = optionString.split("[_\\p{Blank}]");
371                    } catch (PatternSyntaxException e) {
372                            throw new DataFormatException("Could not split the TigerXML writer option '"+optionString+"'. ", e);
373                    }
374                    for (int i=0; i < argv.length-1; i++) {
375                            if(argv[i].charAt(0) != '-') {
376                                    throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
377                            }
378                            if(++i>=argv.length) {
379                                    throw new DataFormatException("The last argument does not have any value. ");
380                            }
381                            switch(argv[i-1].charAt(1)) {
382                            case 'r': 
383                                    if (argv[i].equals("n")) {
384                                            rootHandling = RootHandling.NORMAL;
385                                    } else if (argv[i].equals("tal")) {
386                                            rootHandling = RootHandling.TALBANKEN;
387                                    }
388                                    break;
389                            case 's': 
390                                    try {
391                                            START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
392                                    } catch (NumberFormatException e){
393                                            throw new MaltChainedException("The TigerXML writer option -s must be an integer value. ");
394                                    }
395                                    break;
396                            case 'v': 
397                                    VROOT_SYMBOL = argv[i];
398                                    break;  
399                            default:
400                                    throw new DataFormatException("Unknown TigerXML writer option: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");            
401                            }
402                    }       
403            }
404    }