001    package org.maltparser.core.syntaxgraph.writer;
002    
003    import java.io.BufferedWriter;
004    import java.io.FileNotFoundException;
005    import java.io.FileOutputStream;
006    import java.io.IOException;
007    import java.io.OutputStream;
008    import java.io.OutputStreamWriter;
009    import java.io.UnsupportedEncodingException;
010    import java.util.SortedMap;
011    import java.util.TreeMap;
012    import java.util.regex.PatternSyntaxException;
013    
014    import org.maltparser.core.exception.MaltChainedException;
015    
016    import org.maltparser.core.helper.Util;
017    import org.maltparser.core.io.dataformat.ColumnDescription;
018    import org.maltparser.core.io.dataformat.DataFormatException;
019    import org.maltparser.core.io.dataformat.DataFormatInstance;
020    import org.maltparser.core.syntaxgraph.PhraseStructure;
021    import org.maltparser.core.syntaxgraph.TokenStructure;
022    import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
023    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
024    import org.maltparser.core.syntaxgraph.node.TokenNode;
025    import org.maltparser.core.syntaxgraph.reader.TigerXMLHeader;
026    import org.maltparser.ml.libsvm.LibsvmException;
027    /**
028    *
029    *
030    * @author Johan Hall
031    */
032    public class TigerXMLWriter implements SyntaxGraphWriter {
033            private enum RootHandling {
034                    TALBANKEN, NORMAL  
035            };
036    
037            private BufferedWriter writer;
038            private DataFormatInstance dataFormatInstance;
039            private String optionString;
040            private int sentenceCount;
041            private TigerXMLHeader header;
042    //      private boolean hasWriteTigerXMLHeader = false;
043            private RootHandling rootHandling;
044            private String sentencePrefix = "s";
045            private StringBuilder sentenceID;
046            private StringBuilder tmpID;
047            private StringBuilder rootID;
048            private int START_ID_OF_NONTERMINALS = 500;
049            private boolean labeledTerminalID;
050            private String VROOT_SYMBOL = "VROOT";
051            private boolean useVROOT = false;
052    //      private String fileName = null;
053    //      private String charsetName = null;
054            
055            public TigerXMLWriter() { 
056                    sentenceID = new StringBuilder();
057                    tmpID = new StringBuilder();
058                    rootID = new StringBuilder();
059                    labeledTerminalID = false;
060            }
061            
062            public void open(String fileName, String charsetName) throws MaltChainedException {
063                    try {
064    //                      this.fileName = fileName;
065    //                      this.charsetName = charsetName;
066                            open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName));
067                    } catch (FileNotFoundException e) {
068                            throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
069                    } catch (UnsupportedEncodingException e) {
070                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
071                    }       
072            }
073            
074            public void open(OutputStream os, String charsetName) throws MaltChainedException {
075                    try {
076                            open(new OutputStreamWriter(os, charsetName));
077                    } catch (UnsupportedEncodingException e) {
078                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
079                    }
080            }
081            
082            public void open(OutputStreamWriter osw) throws MaltChainedException {
083                    setWriter(new BufferedWriter(osw));
084                    setSentenceCount(0);
085            }
086            
087            public void writeProlog() throws MaltChainedException { 
088    //              if (fileName == null || charsetName == null) {
089                            writeHeader();
090    //              }
091            }
092            
093            public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
094                    if (syntaxGraph == null || dataFormatInstance == null) {
095                            return;
096                    }
097                    if (syntaxGraph.hasTokens()) {
098                            sentenceCount++;
099                            final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
100                            try {
101                                    sentenceID.setLength(0);
102                                    sentenceID.append(sentencePrefix);
103                                    if (phraseStructure.getSentenceID() != 0) {
104                                            sentenceID.append(Integer.toString(phraseStructure.getSentenceID()));
105                                    } else {
106                                            sentenceID.append(Integer.toString(sentenceCount));
107                                    }
108                                    writer.write("    <s id=\"");
109                                    writer.write(sentenceID.toString());  
110                                    writer.write("\">\n");
111                                    
112                                    setRootID(phraseStructure);
113                                    writer.write("      <graph root=\"");
114                                    writer.write(rootID.toString());
115                                    writer.write("\" ");
116                                    writer.write("discontinuous=\"");
117                                    writer.write(Boolean.toString(!phraseStructure.isContinuous()));
118                                    writer.write("\">\n");
119                                    
120                                    writeTerminals(phraseStructure);
121                                    if (phraseStructure.nTokenNode() != 1 || rootHandling.equals(RootHandling.TALBANKEN)) {
122                                            writeNonTerminals(phraseStructure);
123                                    } else {
124                                            writer.write("        <nonterminals/>\n");
125                                    }
126                                    writer.write("      </graph>\n");
127                                    writer.write("    </s>\n");
128                            } catch (IOException e) {
129                                    throw new DataFormatException("The TigerXML writer could not write to file. ", e);
130                            }
131                    }
132            }
133            
134            private void setRootID(PhraseStructure phraseStructure) throws MaltChainedException {
135                    useVROOT = false;
136                    PhraseStructureNode root = phraseStructure.getPhraseStructureRoot();
137                    for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) {
138                            if (root.hasLabel(column.getSymbolTable()) && root.getLabelSymbol(column.getSymbolTable()).equals(VROOT_SYMBOL)) {
139                                    useVROOT = true;
140                                    break;
141                            }
142                    }
143                    if (useVROOT) {
144                            rootID.setLength(0);
145                            rootID.append(sentenceID);
146                            rootID.append('_');
147                            rootID.append(VROOT_SYMBOL);
148                    } else if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 && !root.isLabeled()) {
149                            rootID.setLength(0);
150                            rootID.append(sentenceID);
151                            rootID.append("_1");
152                    } else {
153                            rootID.setLength(0);
154                            rootID.append(sentenceID);
155                            rootID.append('_');
156    //                      if (rootHandling.equals(RootHandling.NORMAL)) { 
157                                    rootID.append(Integer.toString(START_ID_OF_NONTERMINALS+phraseStructure.nNonTerminals()));
158    //                      } else if (rootHandling.equals(RootHandling.TALBANKEN)) {
159    //                              rootID.append(Integer.toString(START_ID_OF_NONTERMINALS+1));
160    //                      }
161                    }
162    
163            }
164            
165            public void writeEpilog() throws MaltChainedException { 
166                    writeTail();
167            }
168            
169            public BufferedWriter getWriter() {
170                    return writer;
171            }
172    
173            public void setWriter(BufferedWriter writer) {
174                    this.writer = writer;
175            }
176            
177            public void close() throws MaltChainedException {
178                    try {
179                            if (writer != null) {
180                                    writer.flush();
181                                    writer.close();
182                                    writer = null;
183                            }
184                    }   catch (IOException e) {
185                            throw new DataFormatException("Could not close the output file. ", e);
186                    } 
187            }
188            
189            private void writeHeader() throws MaltChainedException {
190                    try {
191                            if (header == null) {
192                                    header = new TigerXMLHeader(dataFormatInstance.getSymbolTables());
193                            }
194                            writer.write(header.toTigerXML());
195    //                      hasWriteTigerXMLHeader = true;
196                    } catch (IOException e) {
197                            throw new DataFormatException("The TigerXML writer could not write to file. ", e);
198                    }
199            }
200            
201            
202            private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
203                    try {
204                            writer.write("        <terminals>\n");
205                            for (int index : phraseStructure.getTokenIndices()) {
206                                    final PhraseStructureNode t = phraseStructure.getTokenNode(index);
207                                    writer.write("          <t ");
208                                    if (!labeledTerminalID) {
209                                            tmpID.setLength(0);
210                                            tmpID.append(sentenceID);
211                                            tmpID.append('_');
212                                            tmpID.append(Integer.toString(t.getIndex()));
213                                            writer.write("id=\"");writer.write(tmpID.toString());writer.write("\" ");
214                                    }
215                                    
216                                    for (ColumnDescription column : dataFormatInstance.getInputColumnDescriptionSet()) {
217                                            writer.write(column.getName().toLowerCase());
218                                            writer.write("=\"");
219                                            writer.write(Util.xmlEscape(t.getLabelSymbol(column.getSymbolTable())));
220                                            writer.write("\" ");    
221                                    }
222                                    writer.write("/>\n");
223                            }
224                            writer.write("        </terminals>\n");
225                    } catch (IOException e) {
226                            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
227                    }
228            }
229            
230            public void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
231                    try {
232                            SortedMap<Integer,Integer> heights = new TreeMap<Integer,Integer>();
233                            for (int index : phraseStructure.getNonTerminalIndices()) {
234                                    heights.put(index, ((NonTerminalNode)phraseStructure.getNonTerminalNode(index)).getHeight());
235                            }
236                            writer.write("        <nonterminals>\n");
237                            boolean done = false;
238                            int h = 1;
239                            while (!done) {
240                                    done = true;
241                                    for (int index : phraseStructure.getNonTerminalIndices()) {
242                                            if (heights.get(index) == h) {
243                                                    NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
244                                                    tmpID.setLength(0);
245                                                    tmpID.append(sentenceID);
246                                                    tmpID.append('_');
247                                                    tmpID.append(Integer.toString(nt.getIndex()+START_ID_OF_NONTERMINALS-1));
248                                                    writeNonTerminal(nt, tmpID.toString());
249                                                    done = false;
250                                            }
251                                    }
252                                    h++;
253                            }
254                            
255                            writeNonTerminal((NonTerminalNode)phraseStructure.getPhraseStructureRoot(),rootID.toString());
256                            writer.write("        </nonterminals>\n");
257                    } catch (IOException e) {
258                            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
259                    }
260            }
261            
262            public void writeNonTerminal(NonTerminalNode nt, String id) throws MaltChainedException {
263                    try {
264                            writer.write("          <nt");
265                            writer.write(" id=\"");writer.write(id);writer.write("\" ");
266                            for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) {
267                                    if (nt.hasLabel(column.getSymbolTable())) {
268                                            writer.write(column.getName().toLowerCase());
269                                            writer.write("=");
270                                            writer.write("\"");
271                                            writer.write(Util.xmlEscape(nt.getLabelSymbol(column.getSymbolTable())));
272                                            writer.write("\" ");
273                                    }
274                            }
275                            writer.write(">\n");
276                            
277                            for (int i = 0, n = nt.nChildren(); i < n; i++) {
278                                    PhraseStructureNode child = nt.getChild(i); 
279                                    writer.write("            <edge ");
280    
281                                    for (ColumnDescription column : dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptionSet()) {
282                                            if (child.hasParentEdgeLabel(column.getSymbolTable())) {
283                                                    writer.write(column.getName().toLowerCase());
284                                                    writer.write("=\"");
285                                                    writer.write(Util.xmlEscape(child.getParentEdgeLabelSymbol(column.getSymbolTable())));
286                                                    writer.write("\" ");
287                                            }
288                                    }
289                                    if (child instanceof TokenNode) {
290                                            if (!labeledTerminalID) {
291                                                    tmpID.setLength(0);
292                                                    tmpID.append(sentenceID);
293                                                    tmpID.append('_');
294                                                    tmpID.append(Integer.toString(child.getIndex()));
295                                                    writer.write(" idref=\"");writer.write(tmpID.toString());writer.write("\"");
296                                            } else {
297                                                    writer.write(" idref=\"");writer.write(child.getLabelSymbol(dataFormatInstance.getInputSymbolTables().get("ID")));writer.write("\"");
298                                            }
299                                            
300                                    } else {
301                                            tmpID.setLength(0);
302                                            tmpID.append(sentenceID);
303                                            tmpID.append('_');
304                                            tmpID.append(Integer.toString(child.getIndex()+START_ID_OF_NONTERMINALS-1));
305                                            writer.write(" idref=\"");writer.write(tmpID.toString());writer.write("\"");
306                                    }
307                                    writer.write(" />\n");
308                            }
309                            writer.write("          </nt>\n");
310                    } catch (IOException e) {
311                            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
312                    }
313            }
314    
315            
316            private void writeTail() throws MaltChainedException {
317                    try {
318                            writer.write("  </body>\n");
319                            writer.write("</corpus>\n");
320                            writer.flush();
321    //                      if (fileName != null && charsetName != null) {
322    //                              writer.close();
323    //                              writer = null;
324    //                              BufferedWriter headerWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName+".header"),charsetName));
325    //                              if (header == null) {
326    //                                      header = new TigerXMLHeader(dataFormatInstance.getSymbolTables());
327    //                              }
328    //                              
329    //                              headerWriter.write(header.toTigerXML());
330    //                              headerWriter.flush();
331    //                              headerWriter.close();
332    //                      }
333                    } catch (IOException e) {
334                            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
335                    }
336            }
337            
338            public int getSentenceCount() {
339                    return sentenceCount;
340            }
341    
342            public void setSentenceCount(int sentenceCount) {
343                    this.sentenceCount = sentenceCount;
344            }
345            
346            public DataFormatInstance getDataFormatInstance() {
347                    return dataFormatInstance;
348            }
349    
350            public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
351                    this.dataFormatInstance = dataFormatInstance;
352                    labeledTerminalID = (dataFormatInstance.getInputColumnDescriptions().containsKey("id") || dataFormatInstance.getInputColumnDescriptions().containsKey("ID"));
353            }
354    
355            public String getOptions() {
356                    return optionString;
357            }
358            
359            public void setOptions(String optionString) throws MaltChainedException {
360                    this.optionString = optionString;
361                    rootHandling = RootHandling.NORMAL;
362    
363                    String[] argv;
364                    try {
365                            argv = optionString.split("[_\\p{Blank}]");
366                    } catch (PatternSyntaxException e) {
367                            throw new DataFormatException("Could not split the TigerXML writer option '"+optionString+"'. ", e);
368                    }
369                    for (int i=0; i < argv.length-1; i++) {
370                            if(argv[i].charAt(0) != '-') {
371                                    throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
372                            }
373                            if(++i>=argv.length) {
374                                    throw new DataFormatException("The last argument does not have any value. ");
375                            }
376                            switch(argv[i-1].charAt(1)) {
377                            case 'r': 
378                                    if (argv[i].equals("n")) {
379                                            rootHandling = RootHandling.NORMAL;
380                                    } else if (argv[i].equals("tal")) {
381                                            rootHandling = RootHandling.TALBANKEN;
382                                    }
383                                    break;
384                            case 's': 
385                                    try {
386                                            START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
387                                    } catch (NumberFormatException e){
388                                            throw new MaltChainedException("The TigerXML writer option -s must be an integer value. ");
389                                    }
390                                    break;
391                            case 'v': 
392                                    VROOT_SYMBOL = argv[i];
393                                    break;  
394                            default:
395                                    throw new LibsvmException("Unknown TigerXML writer option: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");                
396                            }
397                    }       
398            }
399    }