001    package org.maltparser.core.syntaxgraph.writer;
002    
003    import java.io.BufferedWriter;
004    import java.io.FileNotFoundException;
005    import java.io.FileOutputStream;
006    import java.io.IOException;
007    import java.io.OutputStream;
008    import java.io.OutputStreamWriter;
009    import java.io.UnsupportedEncodingException;
010    import java.util.Iterator;
011    import java.util.LinkedHashMap;
012    import java.util.SortedMap;
013    import java.util.TreeMap;
014    import java.util.regex.PatternSyntaxException;
015    
016    import org.maltparser.core.exception.MaltChainedException;
017    import org.maltparser.core.io.dataformat.ColumnDescription;
018    import org.maltparser.core.io.dataformat.DataFormatException;
019    import org.maltparser.core.io.dataformat.DataFormatInstance;
020    import org.maltparser.core.syntaxgraph.PhraseStructure;
021    import org.maltparser.core.syntaxgraph.TokenStructure;
022    import org.maltparser.core.syntaxgraph.edge.Edge;
023    import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
024    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
025    /**
026    *
027    *
028    * @author Johan Hall
029    */
030    public class NegraWriter implements SyntaxGraphWriter {
031            private BufferedWriter writer; 
032            private DataFormatInstance dataFormatInstance;
033            private String optionString;
034            private int sentenceCount;
035            private LinkedHashMap<Integer, Integer> nonTerminalIndexMap;
036            private int START_ID_OF_NONTERMINALS = 500;
037            private boolean closeStream = true;
038            
039            public NegraWriter() { 
040                    nonTerminalIndexMap = new LinkedHashMap<Integer, Integer>();
041            }
042            
043            public void open(String fileName, String charsetName) throws MaltChainedException {
044                    try {
045                            open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName));
046                    } catch (FileNotFoundException e) {
047                            throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
048                    } catch (UnsupportedEncodingException e) {
049                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
050                    }       
051            }
052            
053            public void open(OutputStream os, String charsetName) throws MaltChainedException {
054                    try {
055                            if (os == System.out || os == System.err) {
056                                    closeStream = false;
057                            }
058                            open(new OutputStreamWriter(os, charsetName));
059                    } catch (UnsupportedEncodingException e) {
060                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
061                    }
062            }
063            
064            private void open(OutputStreamWriter osw) throws MaltChainedException {
065                    setWriter(new BufferedWriter(osw));
066                    setSentenceCount(0);
067            }
068            
069            public void writeProlog() throws MaltChainedException { }
070            
071            public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
072                    if (syntaxGraph == null || dataFormatInstance == null || !(syntaxGraph instanceof PhraseStructure) || !syntaxGraph.hasTokens()) {
073                            return;
074                    }
075                    PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
076                    sentenceCount++;
077                    try {
078                            writer.write("#BOS ");
079                            if (phraseStructure.getSentenceID() != 0) {
080                                    writer.write(Integer.toString(phraseStructure.getSentenceID()));
081                            } else {
082                                    writer.write(Integer.toString(sentenceCount));
083                            }
084                            writer.write('\n');
085    
086                            if (phraseStructure.hasNonTerminals()) {
087                                    calculateIndices(phraseStructure);
088                                    writeTerminals(phraseStructure);
089                                    writeNonTerminals(phraseStructure);
090                            } else {
091                                    writeTerminals(phraseStructure);
092                            }
093                            writer.write("#EOS ");
094                            if (phraseStructure.getSentenceID() != 0) {
095                                    writer.write(Integer.toString(phraseStructure.getSentenceID()));
096                            } else {
097                                    writer.write(Integer.toString(sentenceCount));
098                            }
099                            writer.write('\n');
100                    } catch (IOException e) {
101                            throw new DataFormatException("Could not write to the output file. ", e);
102                    }
103            }
104            public void writeEpilog() throws MaltChainedException { }
105            
106    
107            private void calculateIndices(PhraseStructure phraseStructure) throws MaltChainedException {
108                    final SortedMap<Integer,Integer> heights = new TreeMap<Integer,Integer>();
109                    for (int index : phraseStructure.getNonTerminalIndices()) {
110                            heights.put(index, ((NonTerminalNode)phraseStructure.getNonTerminalNode(index)).getHeight());
111                    }
112                    
113                    boolean done = false;
114                    int h = 1;
115                    int ntid = START_ID_OF_NONTERMINALS;
116                    nonTerminalIndexMap.clear();
117                    while (!done) {
118                            done = true;
119                            for (int index : phraseStructure.getNonTerminalIndices()) {
120                                    if (heights.get(index) == h) {
121                                            NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
122                                            nonTerminalIndexMap.put(nt.getIndex(), ntid++);
123    //                                      nonTerminalIndexMap.put(nt.getIndex(), nt.getIndex()+START_ID_OF_NONTERMINALS-1);
124                                            done = false;
125                                    }
126                            }
127                            h++;
128                    }
129                    
130    //              boolean done = false;
131    //              int h = 1;
132    ////            int ntid = START_ID_OF_NONTERMINALS;
133    ////            nonTerminalIndexMap.clear();
134    //              while (!done) {
135    //                      done = true;
136    //                      for (int index : phraseStructure.getNonTerminalIndices()) {
137    //                              if (heights.get(index) == h) {
138    //                                      NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
139    ////                                    nonTerminalIndexMap.put(nt.getIndex(), ntid++);
140    //                                      nonTerminalIndexMap.put(nt.getIndex(), nt.getIndex()+START_ID_OF_NONTERMINALS-1);
141    //                                      done = false;
142    //                              }
143    //                      }
144    //                      h++;
145    //              }
146            }
147            
148            private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
149                    try {
150                            for (int index : phraseStructure.getTokenIndices()) {
151                                    final PhraseStructureNode terminal = phraseStructure.getTokenNode(index);
152                                    final Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
153                                    ColumnDescription column = null;
154                                    int ti = 1;
155                                    while (columns.hasNext()) {
156                                            column = columns.next();
157                                            if (column.getCategory() == ColumnDescription.INPUT) {
158                                                    writer.write(terminal.getLabelSymbol(column.getSymbolTable()));
159                                                    int nTabs = 1;
160                                                    if (ti == 1 || ti == 2) {
161                                                            nTabs = 3 - (terminal.getLabelSymbol(column.getSymbolTable()).length() / 8);
162                                                    } else if (ti == 3) {
163                                                            nTabs = 1;
164                                                    } else if (ti == 4) {
165                                                            nTabs = 2 - (terminal.getLabelSymbol(column.getSymbolTable()).length() / 8);
166                                                    }
167                                                    if (nTabs < 1) {
168                                                            nTabs = 1;
169                                                    }
170                                                    for (int j = 0; j < nTabs; j++) {
171                                                            writer.write('\t');
172                                                    }
173                                                    ti++;
174                                            } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) {
175                                                    if (terminal.getParent() != null && terminal.hasParentEdgeLabel(column.getSymbolTable())) {
176                                                            writer.write(terminal.getParentEdgeLabelSymbol(column.getSymbolTable()));
177                                                            writer.write('\t');
178                                                    } else {
179                                                            writer.write("--\t");
180                                                    }
181                                            } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL) { 
182                                                    if (terminal.getParent() == null || terminal.getParent() == phraseStructure.getPhraseStructureRoot()) {
183                                                            writer.write('0');
184                                                    } else {
185                                                            writer.write(Integer.toString(nonTerminalIndexMap.get(terminal.getParent().getIndex())));
186    //                                                      writer.write(Integer.toString(terminal.getParent().getIndex()+START_ID_OF_NONTERMINALS-1));
187                                                    }
188                                            }
189                                    }
190                                    for (Edge e : terminal.getIncomingSecondaryEdges()) {
191                                            if (e.hasLabel(column.getSymbolTable())) {
192                                                    writer.write('\t');
193                                                    writer.write(e.getLabelSymbol(column.getSymbolTable()));
194                                                    writer.write('\t');
195                                                    if (e.getSource() instanceof NonTerminalNode) {
196                                                            writer.write(Integer.toString(nonTerminalIndexMap.get(e.getSource().getIndex())));
197    //                                                      writer.write(Integer.toString(e.getSource().getIndex()+START_ID_OF_NONTERMINALS-1));
198                                                    } else {
199                                                            writer.write(Integer.toString(e.getSource().getIndex()));
200                                                    }
201                                            }
202                                    }
203                                    writer.write("\n");
204                            }
205    
206                    } catch (IOException e) {
207                            throw new DataFormatException("The Negra writer is not able to write. ", e);
208                    }
209            }
210            
211            private void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
212                    for (int index : nonTerminalIndexMap.keySet()) {
213    //              for (int index : phraseStructure.getNonTerminalIndices()) {
214                            NonTerminalNode nonTerminal = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
215            
216                            if (nonTerminal == null || nonTerminal.isRoot()) {
217                                    return;
218                            }
219                            try {
220                                    writer.write('#');
221    //                              writer.write(Integer.toString(index+START_ID_OF_NONTERMINALS-1));
222                                    writer.write(Integer.toString(nonTerminalIndexMap.get(index)));
223                                    writer.write("\t\t\t--\t\t\t");
224                                    if (nonTerminal.hasLabel(dataFormatInstance.getColumnDescriptionByName("CAT").getSymbolTable())) {
225                                            writer.write(nonTerminal.getLabelSymbol(dataFormatInstance.getColumnDescriptionByName("CAT").getSymbolTable()));
226                                    } else {
227                                            writer.write("--");
228                                    }
229                                    writer.write("\t--\t\t");
230                                    if (nonTerminal.hasParentEdgeLabel(dataFormatInstance.getColumnDescriptionByName("LABEL").getSymbolTable())) {
231                                            writer.write(nonTerminal.getParentEdgeLabelSymbol(dataFormatInstance.getColumnDescriptionByName("LABEL").getSymbolTable()));
232                                    } else {
233                                            writer.write("--");
234                                    }
235                                    writer.write('\t');
236                                    if (nonTerminal.getParent() == null || nonTerminal.getParent().isRoot()) {
237                                            writer.write('0');
238                                    } else {
239    //                                      writer.write(Integer.toString(nonTerminal.getParent().getIndex()+START_ID_OF_NONTERMINALS-1));
240                                            writer.write(Integer.toString(nonTerminalIndexMap.get(nonTerminal.getParent().getIndex())));
241                                    }
242                                    for (Edge e : nonTerminal.getIncomingSecondaryEdges()) {
243                                            if (e.hasLabel(dataFormatInstance.getColumnDescriptionByName("SECEDGELABEL").getSymbolTable())) {
244                                                    writer.write('\t');
245                                                    writer.write(e.getLabelSymbol(dataFormatInstance.getColumnDescriptionByName("SECEDGELABEL").getSymbolTable()));
246                                                    writer.write('\t');
247                                                    if (e.getSource() instanceof NonTerminalNode) {
248    //                                                      writer.write(Integer.toString(e.getSource().getIndex()+START_ID_OF_NONTERMINALS-1));
249                                                            writer.write(Integer.toString(nonTerminalIndexMap.get(e.getSource().getIndex())));
250                                                    } else {
251                                                            writer.write(Integer.toString(e.getSource().getIndex()));
252                                                    }
253                                            }
254                                    }
255                                    writer.write("\n");
256                            } catch (IOException e) {
257                                    throw new DataFormatException("The Negra writer is not able to write the non-terminals. ", e);
258                            }
259                    }
260            }
261            
262            public BufferedWriter getWriter() {
263                    return writer;
264            }
265    
266            public void setWriter(BufferedWriter writer) {
267                    this.writer = writer;
268            }
269            
270            public int getSentenceCount() {
271                    return sentenceCount;
272            }
273    
274            public void setSentenceCount(int sentenceCount) {
275                    this.sentenceCount = sentenceCount;
276            }
277            
278            public DataFormatInstance getDataFormatInstance() {
279                    return dataFormatInstance;
280            }
281    
282            public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
283                    this.dataFormatInstance = dataFormatInstance;
284            }
285    
286            public String getOptions() {
287                    return optionString;
288            }
289            
290            public void setOptions(String optionString) throws MaltChainedException {
291                    this.optionString = optionString;
292                    String[] argv;
293                    try {
294                            argv = optionString.split("[_\\p{Blank}]");
295                    } catch (PatternSyntaxException e) {
296                            throw new DataFormatException("Could not split the penn writer option '"+optionString+"'. ", e);
297                    }
298                    for (int i=0; i < argv.length-1; i++) {
299                            if(argv[i].charAt(0) != '-') {
300                                    throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
301                            }
302                            if(++i>=argv.length) {
303                                    throw new DataFormatException("The last argument does not have any value. ");
304                            }
305                            switch(argv[i-1].charAt(1)) {
306                            case 's': 
307                                    try {
308                                            START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
309                                    } catch (NumberFormatException e){
310                                            throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
311                                    }
312                                    break;
313                            default:
314                                    throw new DataFormatException("Unknown svm parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");             
315                            }
316                    }       
317            }
318            
319            public void close() throws MaltChainedException {
320                    try {
321                            if (writer != null) {
322                                    writer.flush();
323                                    if (closeStream) {
324                                            writer.close();
325                                    }
326                                    writer = null;
327                            }
328                    }   catch (IOException e) {
329                            throw new DataFormatException("Could not close the output file. ", e);
330                    } 
331            }
332    }