001    package org.maltparser.core.syntaxgraph.writer;
002    
003    import java.io.BufferedWriter;
004    import java.io.FileNotFoundException;
005    import java.io.FileOutputStream;
006    import java.io.IOException;
007    import java.io.OutputStream;
008    import java.io.OutputStreamWriter;
009    import java.io.UnsupportedEncodingException;
010    import java.util.Iterator;
011    import java.util.LinkedHashMap;
012    import java.util.SortedMap;
013    import java.util.TreeMap;
014    import java.util.regex.PatternSyntaxException;
015    
016    import org.maltparser.core.exception.MaltChainedException;
017    import org.maltparser.core.io.dataformat.ColumnDescription;
018    import org.maltparser.core.io.dataformat.DataFormatException;
019    import org.maltparser.core.io.dataformat.DataFormatInstance;
020    import org.maltparser.core.syntaxgraph.PhraseStructure;
021    import org.maltparser.core.syntaxgraph.TokenStructure;
022    import org.maltparser.core.syntaxgraph.edge.Edge;
023    import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
024    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
025    import org.maltparser.ml.libsvm.LibsvmException;
026    /**
027    *
028    *
029    * @author Johan Hall
030    */
031    public class NegraWriter implements SyntaxGraphWriter {
032            private BufferedWriter writer; 
033            private DataFormatInstance dataFormatInstance;
034            private String optionString;
035            private int sentenceCount;
036            private LinkedHashMap<Integer, Integer> nonTerminalIndexMap;
037            private int START_ID_OF_NONTERMINALS = 500;
038            
039            public NegraWriter() { 
040                    nonTerminalIndexMap = new LinkedHashMap<Integer, Integer>();
041            }
042            
043            public void open(String fileName, String charsetName) throws MaltChainedException {
044                    try {
045                            open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName));
046                    } catch (FileNotFoundException e) {
047                            throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
048                    } catch (UnsupportedEncodingException e) {
049                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
050                    }       
051            }
052            
053            public void open(OutputStream os, String charsetName) throws MaltChainedException {
054                    try {
055                            open(new OutputStreamWriter(os, charsetName));
056                    } catch (UnsupportedEncodingException e) {
057                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
058                    }
059            }
060            
061            public void open(OutputStreamWriter osw) throws MaltChainedException {
062                    setWriter(new BufferedWriter(osw));
063                    setSentenceCount(0);
064            }
065            
066            public void writeProlog() throws MaltChainedException { }
067            
068            public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
069                    if (syntaxGraph == null || dataFormatInstance == null || !(syntaxGraph instanceof PhraseStructure) || !syntaxGraph.hasTokens()) {
070                            return;
071                    }
072                    PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
073                    sentenceCount++;
074                    try {
075                            writer.write("#BOS ");
076                            if (phraseStructure.getSentenceID() != 0) {
077                                    writer.write(Integer.toString(phraseStructure.getSentenceID()));
078                            } else {
079                                    writer.write(Integer.toString(sentenceCount));
080                            }
081                            writer.write('\n');
082    
083                            if (phraseStructure.hasNonTerminals()) {
084                                    calculateIndices(phraseStructure);
085                                    writeTerminals(phraseStructure);
086                                    writeNonTerminals(phraseStructure);
087                            } else {
088                                    writeTerminals(phraseStructure);
089                            }
090                            writer.write("#EOS ");
091                            if (phraseStructure.getSentenceID() != 0) {
092                                    writer.write(Integer.toString(phraseStructure.getSentenceID()));
093                            } else {
094                                    writer.write(Integer.toString(sentenceCount));
095                            }
096                            writer.write('\n');
097                    } catch (IOException e) {
098                            throw new DataFormatException("Could not write to the output file. ", e);
099                    }
100            }
101            public void writeEpilog() throws MaltChainedException { }
102            
103    
104            private void calculateIndices(PhraseStructure phraseStructure) throws MaltChainedException {
105                    final SortedMap<Integer,Integer> heights = new TreeMap<Integer,Integer>();
106                    for (int index : phraseStructure.getNonTerminalIndices()) {
107                            heights.put(index, ((NonTerminalNode)phraseStructure.getNonTerminalNode(index)).getHeight());
108                    }
109                    
110                    boolean done = false;
111                    int h = 1;
112                    int ntid = START_ID_OF_NONTERMINALS;
113                    nonTerminalIndexMap.clear();
114                    while (!done) {
115                            done = true;
116                            for (int index : phraseStructure.getNonTerminalIndices()) {
117                                    if (heights.get(index) == h) {
118                                            NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
119                                            nonTerminalIndexMap.put(nt.getIndex(), ntid++);
120    //                                      nonTerminalIndexMap.put(nt.getIndex(), nt.getIndex()+START_ID_OF_NONTERMINALS-1);
121                                            done = false;
122                                    }
123                            }
124                            h++;
125                    }
126                    
127    //              boolean done = false;
128    //              int h = 1;
129    ////            int ntid = START_ID_OF_NONTERMINALS;
130    ////            nonTerminalIndexMap.clear();
131    //              while (!done) {
132    //                      done = true;
133    //                      for (int index : phraseStructure.getNonTerminalIndices()) {
134    //                              if (heights.get(index) == h) {
135    //                                      NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
136    ////                                    nonTerminalIndexMap.put(nt.getIndex(), ntid++);
137    //                                      nonTerminalIndexMap.put(nt.getIndex(), nt.getIndex()+START_ID_OF_NONTERMINALS-1);
138    //                                      done = false;
139    //                              }
140    //                      }
141    //                      h++;
142    //              }
143            }
144            
145            private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
146                    try {
147                            for (int index : phraseStructure.getTokenIndices()) {
148                                    final PhraseStructureNode terminal = phraseStructure.getTokenNode(index);
149                                    final Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
150                                    ColumnDescription column = null;
151                                    int ti = 1;
152                                    while (columns.hasNext()) {
153                                            column = columns.next();
154                                            if (column.getCategory() == ColumnDescription.INPUT) {
155                                                    writer.write(terminal.getLabelSymbol(column.getSymbolTable()));
156                                                    int nTabs = 1;
157                                                    if (ti == 1 || ti == 2) {
158                                                            nTabs = 3 - (terminal.getLabelSymbol(column.getSymbolTable()).length() / 8);
159                                                    } else if (ti == 3) {
160                                                            nTabs = 1;
161                                                    } else if (ti == 4) {
162                                                            nTabs = 2 - (terminal.getLabelSymbol(column.getSymbolTable()).length() / 8);
163                                                    }
164                                                    if (nTabs < 1) {
165                                                            nTabs = 1;
166                                                    }
167                                                    for (int j = 0; j < nTabs; j++) {
168                                                            writer.write('\t');
169                                                    }
170                                                    ti++;
171                                            } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) {
172                                                    if (terminal.getParent() != null && terminal.hasParentEdgeLabel(column.getSymbolTable())) {
173                                                            writer.write(terminal.getParentEdgeLabelSymbol(column.getSymbolTable()));
174                                                            writer.write('\t');
175                                                    } else {
176                                                            writer.write("--\t");
177                                                    }
178                                            } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL) { 
179                                                    if (terminal.getParent() == null || terminal.getParent() == phraseStructure.getPhraseStructureRoot()) {
180                                                            writer.write('0');
181                                                    } else {
182                                                            writer.write(Integer.toString(nonTerminalIndexMap.get(terminal.getParent().getIndex())));
183    //                                                      writer.write(Integer.toString(terminal.getParent().getIndex()+START_ID_OF_NONTERMINALS-1));
184                                                    }
185                                            }
186                                    }
187                                    for (Edge e : terminal.getIncomingSecondaryEdges()) {
188                                            if (e.hasLabel(column.getSymbolTable())) {
189                                                    writer.write('\t');
190                                                    writer.write(e.getLabelSymbol(column.getSymbolTable()));
191                                                    writer.write('\t');
192                                                    if (e.getSource() instanceof NonTerminalNode) {
193                                                            writer.write(Integer.toString(nonTerminalIndexMap.get(e.getSource().getIndex())));
194    //                                                      writer.write(Integer.toString(e.getSource().getIndex()+START_ID_OF_NONTERMINALS-1));
195                                                    } else {
196                                                            writer.write(Integer.toString(e.getSource().getIndex()));
197                                                    }
198                                            }
199                                    }
200                                    writer.write("\n");
201                            }
202    
203                    } catch (IOException e) {
204                            throw new DataFormatException("The Negra writer is not able to write. ", e);
205                    }
206            }
207            
208            private void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
209                    for (int index : nonTerminalIndexMap.keySet()) {
210    //              for (int index : phraseStructure.getNonTerminalIndices()) {
211                            NonTerminalNode nonTerminal = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
212            
213                            if (nonTerminal == null || nonTerminal.isRoot()) {
214                                    return;
215                            }
216                            try {
217                                    writer.write('#');
218    //                              writer.write(Integer.toString(index+START_ID_OF_NONTERMINALS-1));
219                                    writer.write(Integer.toString(nonTerminalIndexMap.get(index)));
220                                    writer.write("\t\t\t--\t\t\t");
221                                    if (nonTerminal.hasLabel(dataFormatInstance.getColumnDescriptionByName("CAT").getSymbolTable())) {
222                                            writer.write(nonTerminal.getLabelSymbol(dataFormatInstance.getColumnDescriptionByName("CAT").getSymbolTable()));
223                                    } else {
224                                            writer.write("--");
225                                    }
226                                    writer.write("\t--\t\t");
227                                    if (nonTerminal.hasParentEdgeLabel(dataFormatInstance.getColumnDescriptionByName("LABEL").getSymbolTable())) {
228                                            writer.write(nonTerminal.getParentEdgeLabelSymbol(dataFormatInstance.getColumnDescriptionByName("LABEL").getSymbolTable()));
229                                    } else {
230                                            writer.write("--");
231                                    }
232                                    writer.write('\t');
233                                    if (nonTerminal.getParent() == null || nonTerminal.getParent().isRoot()) {
234                                            writer.write('0');
235                                    } else {
236    //                                      writer.write(Integer.toString(nonTerminal.getParent().getIndex()+START_ID_OF_NONTERMINALS-1));
237                                            writer.write(Integer.toString(nonTerminalIndexMap.get(nonTerminal.getParent().getIndex())));
238                                    }
239                                    for (Edge e : nonTerminal.getIncomingSecondaryEdges()) {
240                                            if (e.hasLabel(dataFormatInstance.getColumnDescriptionByName("SECEDGELABEL").getSymbolTable())) {
241                                                    writer.write('\t');
242                                                    writer.write(e.getLabelSymbol(dataFormatInstance.getColumnDescriptionByName("SECEDGELABEL").getSymbolTable()));
243                                                    writer.write('\t');
244                                                    if (e.getSource() instanceof NonTerminalNode) {
245    //                                                      writer.write(Integer.toString(e.getSource().getIndex()+START_ID_OF_NONTERMINALS-1));
246                                                            writer.write(Integer.toString(nonTerminalIndexMap.get(e.getSource().getIndex())));
247                                                    } else {
248                                                            writer.write(Integer.toString(e.getSource().getIndex()));
249                                                    }
250                                            }
251                                    }
252                                    writer.write("\n");
253                            } catch (IOException e) {
254                                    throw new DataFormatException("The Negra writer is not able to write the non-terminals. ", e);
255                            }
256                    }
257            }
258            
259            public BufferedWriter getWriter() {
260                    return writer;
261            }
262    
263            public void setWriter(BufferedWriter writer) {
264                    this.writer = writer;
265            }
266            
267            public int getSentenceCount() {
268                    return sentenceCount;
269            }
270    
271            public void setSentenceCount(int sentenceCount) {
272                    this.sentenceCount = sentenceCount;
273            }
274            
275            public DataFormatInstance getDataFormatInstance() {
276                    return dataFormatInstance;
277            }
278    
279            public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
280                    this.dataFormatInstance = dataFormatInstance;
281            }
282    
283            public String getOptions() {
284                    return optionString;
285            }
286            
287            public void setOptions(String optionString) throws MaltChainedException {
288                    this.optionString = optionString;
289                    String[] argv;
290                    try {
291                            argv = optionString.split("[_\\p{Blank}]");
292                    } catch (PatternSyntaxException e) {
293                            throw new DataFormatException("Could not split the penn writer option '"+optionString+"'. ", e);
294                    }
295                    for (int i=0; i < argv.length-1; i++) {
296                            if(argv[i].charAt(0) != '-') {
297                                    throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
298                            }
299                            if(++i>=argv.length) {
300                                    throw new DataFormatException("The last argument does not have any value. ");
301                            }
302                            switch(argv[i-1].charAt(1)) {
303                            case 's': 
304                                    try {
305                                            START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
306                                    } catch (NumberFormatException e){
307                                            throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
308                                    }
309                                    break;
310                            default:
311                                    throw new LibsvmException("Unknown svm parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");         
312                            }
313                    }       
314            }
315            
316            public void close() throws MaltChainedException {
317                    try {
318                            if (writer != null) {
319                                    writer.flush();
320                                    writer.close();
321                                    writer = null;
322                            }
323                    }   catch (IOException e) {
324                            throw new DataFormatException("Could not close the output file. ", e);
325                    } 
326            }
327    }