001    package org.maltparser.core.syntaxgraph.writer;
002    
003    import java.io.BufferedWriter;
004    import java.io.FileNotFoundException;
005    import java.io.FileOutputStream;
006    import java.io.IOException;
007    import java.io.OutputStream;
008    import java.io.OutputStreamWriter;
009    import java.io.UnsupportedEncodingException;
010    import java.util.SortedMap;
011    import java.util.regex.PatternSyntaxException;
012    
013    import org.maltparser.core.exception.MaltChainedException;
014    import org.maltparser.core.io.dataformat.ColumnDescription;
015    import org.maltparser.core.io.dataformat.DataFormatException;
016    import org.maltparser.core.io.dataformat.DataFormatInstance;
017    import org.maltparser.core.symbol.SymbolTable;
018    import org.maltparser.core.syntaxgraph.PhraseStructure;
019    import org.maltparser.core.syntaxgraph.TokenStructure;
020    import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
021    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
022    import org.maltparser.core.syntaxgraph.node.TokenNode;
023    import org.maltparser.ml.libsvm.LibsvmException;
024    /**
025    *
026    *
027    * @author Johan Hall
028    */
029    public class BracketWriter implements SyntaxGraphWriter {
030            private enum PennWriterFormat {
031                    DEFAULT, PRETTY
032            };
033            private PennWriterFormat format;
034            private BufferedWriter writer;
035            private DataFormatInstance dataFormatInstance;
036            private SortedMap<String,ColumnDescription> inputColumns;
037            private SortedMap<String,ColumnDescription> edgeLabelColumns;
038            private SortedMap<String,ColumnDescription> phraseLabelColumns;
039            private char STARTING_BRACKET = '(';
040            private String EMPTY_EDGELABEL = "??";
041            private char CLOSING_BRACKET = ')';
042            private char INPUT_SEPARATOR = ' ';
043            private char EDGELABEL_SEPARATOR = '-';
044            private char SENTENCE_SEPARATOR = '\n';
045            private String optionString;
046            
047            public BracketWriter() { 
048            }
049    
050            public void open(String fileName, String charsetName) throws MaltChainedException {
051                    try {
052                            open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName));
053                    } catch (FileNotFoundException e) {
054                            throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
055                    } catch (UnsupportedEncodingException e) {
056                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
057                    }       
058            }
059            
060            public void open(OutputStream os, String charsetName) throws MaltChainedException {
061                    try {
062                            open(new OutputStreamWriter(os, charsetName));
063                    } catch (UnsupportedEncodingException e) {
064                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
065                    }
066            }
067            
068            public void open(OutputStreamWriter osw) throws MaltChainedException {
069                    setWriter(new BufferedWriter(osw));
070            }
071    
072            public void writeEpilog() throws MaltChainedException {
073    
074            }
075            
076            public void writeProlog() throws MaltChainedException {
077            
078            }
079            
080            public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
081                    if (syntaxGraph == null || dataFormatInstance == null) {
082                            return;
083                    }
084                    if (syntaxGraph instanceof PhraseStructure && syntaxGraph.hasTokens()) {
085    //                      PhraseStructure phraseStructure = ((PhraseStructure) syntaxGraph);
086                            if (format == PennWriterFormat.PRETTY) {
087                                    writeElement(((PhraseStructure) syntaxGraph).getPhraseStructureRoot(), 0);
088                            } else {
089                                    writeElement(((PhraseStructure) syntaxGraph).getPhraseStructureRoot());
090                            }
091                            try {
092                                    writer.write(SENTENCE_SEPARATOR);
093                                    writer.flush();
094                            } catch (IOException e) {
095                                    close();
096                                    throw new DataFormatException("Could not write to the output file. ", e);
097                            }
098                    }
099            }
100            
101            private void writeElement(PhraseStructureNode element) throws MaltChainedException {
102                    try {
103                            if (element instanceof TokenNode) {
104                                    PhraseStructureNode t = (PhraseStructureNode)element;
105                                    SymbolTable table = null;
106                                    writer.write(STARTING_BRACKET);
107                                    int i = 0;
108                                    for (String inputColumn : inputColumns.keySet()) {
109                                            if (i != 0) {
110                                                    writer.write(INPUT_SEPARATOR);
111                                            }
112                                            table = inputColumns.get(inputColumn).getSymbolTable();
113                                            if (t.hasLabel(table)) {
114                                                    writer.write(t.getLabelSymbol(table));
115                                            }
116                                            if (i == 0) {
117                                                    for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
118                                                            table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable();
119                                                            if (t.hasParentEdgeLabel(table) && !t.getParent().isRoot() && !t.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
120                                                                    writer.write(EDGELABEL_SEPARATOR);
121                                                                    writer.write(t.getParentEdgeLabelSymbol(table));
122                                                            }
123                                                    }
124                                            }
125                                            i++;
126                                    }
127                                    writer.write(CLOSING_BRACKET);
128                            } else {
129                                    NonTerminalNode nt = (NonTerminalNode)element;
130                                    writer.write(STARTING_BRACKET);
131                                    SymbolTable table = null;
132                                    int i = 0;
133                                    for (String phraseLabelColumn : phraseLabelColumns.keySet()) {
134                                            if (i != 0) {
135                                                    writer.write(INPUT_SEPARATOR);
136                                            }
137                                            table = phraseLabelColumns.get(phraseLabelColumn).getSymbolTable();
138                                            if (nt.hasLabel(table)) { 
139                                                    writer.write(nt.getLabelSymbol(table));
140                                            }
141                                            if (i == 0) {
142                                                    for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
143                                                            table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable();
144                                                            if (nt.hasParentEdgeLabel(table) && !nt.getParent().isRoot() && !nt.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
145                                                                    writer.write(EDGELABEL_SEPARATOR);
146                                                                    writer.write(nt.getParentEdgeLabelSymbol(table));
147                                                            }
148                                                    }
149                                            }
150                                            i++;
151                                    }
152                                    for (PhraseStructureNode node : ((NonTerminalNode)element).getChildren()) {
153                                            writeElement(node);
154                                    }
155                                    writer.write(CLOSING_BRACKET);
156                            }
157                    } catch (IOException e) {
158                            throw new DataFormatException("Could not write to the output file. ", e);
159                    }
160            }
161            
162            private String getIndentation(int depth) {
163                    StringBuilder sb = new StringBuilder("");
164                    for (int i = 0; i < depth; i++) {
165                            sb.append("\t");
166                    }
167                    return sb.toString();
168            }
169            
170            private void writeElement(PhraseStructureNode element, int depth) throws MaltChainedException {
171                    try {
172                            if (element instanceof TokenNode) {
173                                    PhraseStructureNode t = (PhraseStructureNode)element;
174                                    SymbolTable table = null;
175                                    writer.write("\n" + getIndentation(depth) + STARTING_BRACKET);
176                                    int i = 0;
177                                    for (String inputColumn : inputColumns.keySet()) {
178                                            if (i != 0) {
179                                                    writer.write(INPUT_SEPARATOR);
180                                            }
181                                            table = inputColumns.get(inputColumn).getSymbolTable();
182                                            if (t.hasLabel(table)) {
183                                                    writer.write(encodeString(t.getLabelSymbol(table)));
184                                            }
185                                            if (i == 0) {
186                                                    for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
187                                                            table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable();
188                                                            if (t.hasParentEdgeLabel(table) && !t.getParent().isRoot() && !t.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
189                                                                    writer.write(EDGELABEL_SEPARATOR);
190                                                                    writer.write(t.getParentEdgeLabelSymbol(table));
191                                                            }
192                                                    }
193                                            }
194                                            i++;
195                                    }
196                                    writer.write(CLOSING_BRACKET);
197                            } else {
198                                    NonTerminalNode nt = (NonTerminalNode)element;
199                                    writer.write("\n" + getIndentation(depth) + STARTING_BRACKET);
200                                    SymbolTable table = null;
201                                    int i = 0;
202                                    for (String phraseLabelColumn : phraseLabelColumns.keySet()) {
203                                            if (i != 0) {
204                                                    writer.write(INPUT_SEPARATOR);
205                                            }
206                                            table = phraseLabelColumns.get(phraseLabelColumn).getSymbolTable();
207                                            if (nt.hasLabel(table)) { 
208                                                    writer.write(nt.getLabelSymbol(table));
209                                            }
210                                            if (i == 0) {
211                                                    for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
212                                                            table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable();
213                                                            if (nt.hasParentEdgeLabel(table) && !nt.getParent().isRoot() && !nt.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
214                                                                    writer.write(EDGELABEL_SEPARATOR);
215                                                                    writer.write(nt.getParentEdgeLabelSymbol(table));
216                                                            }
217                                                    }
218                                            }
219                                            i++;
220                                    }
221                                    for (PhraseStructureNode node : ((NonTerminalNode)element).getChildren()) {
222                                            writeElement(node, depth + 1);
223                                    }
224                                    writer.write("\n" + getIndentation(depth) + CLOSING_BRACKET);
225                            }
226                    } catch (IOException e) {
227                            throw new DataFormatException("Could not write to the output file. ", e);
228                    }
229            }
230            
231            public BufferedWriter getWriter() {
232                    return writer;
233            }
234    
235            public void setWriter(BufferedWriter writer) throws MaltChainedException {
236                    close();
237                    this.writer = writer;
238            }
239            
240            public DataFormatInstance getDataFormatInstance() {
241                    return dataFormatInstance;
242            }
243    
244            public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
245                    this.dataFormatInstance = dataFormatInstance;
246                    inputColumns = dataFormatInstance.getInputColumnDescriptions();
247                    edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions();
248                    phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions();
249            }
250    
251            public String getOptions() {
252                    return optionString;
253            }
254            
255            public void setOptions(String optionString) throws MaltChainedException {
256                    this.optionString = optionString;
257                    format = PennWriterFormat.DEFAULT;
258    
259                    String[] argv;
260                    try {
261                            argv = optionString.split("[_\\p{Blank}]");
262                    } catch (PatternSyntaxException e) {
263                            throw new DataFormatException("Could not split the bracket writer option '"+optionString+"'. ", e);
264                    }
265                    for (int i=0; i < argv.length-1; i++) {
266                            if(argv[i].charAt(0) != '-') {
267                                    throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
268                            }
269                            if(++i>=argv.length) {
270                                    throw new DataFormatException("The last argument does not have any value. ");
271                            }
272                            switch(argv[i-1].charAt(1)) {
273                            case 'f': 
274                                    if (argv[i].equals("p")) {
275                                            format = PennWriterFormat.PRETTY;
276                                    } else if (argv[i].equals("p")) {
277                                            format = PennWriterFormat.DEFAULT;
278                                    }
279                                    break;
280                            default:
281                                    throw new LibsvmException("Unknown bracket writer option: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");         
282                            }
283                    }       
284            }
285            
286            public void close() throws MaltChainedException {
287                    try {
288                            if (writer != null) {
289                                    writer.flush();
290                                    writer.close();
291                                    writer = null;
292                            }
293                    }   catch (IOException e) {
294                            throw new DataFormatException("Could not close the output file. ", e);
295                    } 
296            }
297            
298            private String encodeString(String string) {
299                    return string.replace("(", "-LRB-").replace(")", "-RRB-").replace("[", "-LSB-").replace("]", "-RSB-").replace("{", "-LCB-").replace("}", "-RCB-");
300            }
301    }