001package org.maltparser.core.syntaxgraph.writer;
002
003import java.io.BufferedWriter;
004import java.io.FileNotFoundException;
005import java.io.FileOutputStream;
006import java.io.IOException;
007import java.io.OutputStream;
008import java.io.OutputStreamWriter;
009import java.io.UnsupportedEncodingException;
010import java.util.SortedMap;
011import java.util.regex.PatternSyntaxException;
012
013import org.maltparser.core.exception.MaltChainedException;
014import org.maltparser.core.io.dataformat.ColumnDescription;
015import org.maltparser.core.io.dataformat.DataFormatException;
016import org.maltparser.core.io.dataformat.DataFormatInstance;
017import org.maltparser.core.symbol.SymbolTable;
018import org.maltparser.core.symbol.SymbolTableHandler;
019import org.maltparser.core.syntaxgraph.PhraseStructure;
020import org.maltparser.core.syntaxgraph.TokenStructure;
021import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
022import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
023import org.maltparser.core.syntaxgraph.node.TokenNode;
024/**
025*
026*
027* @author Johan Hall
028*/
029public class BracketWriter implements SyntaxGraphWriter {
030        private enum PennWriterFormat {
031                DEFAULT, PRETTY
032        };
033        private PennWriterFormat format;
034        private BufferedWriter writer;
035        private DataFormatInstance dataFormatInstance;
036        private SortedMap<String,ColumnDescription> inputColumns;
037        private SortedMap<String,ColumnDescription> edgeLabelColumns;
038        private SortedMap<String,ColumnDescription> phraseLabelColumns;
039        private char STARTING_BRACKET = '(';
040        private String EMPTY_EDGELABEL = "??";
041        private char CLOSING_BRACKET = ')';
042        private char INPUT_SEPARATOR = ' ';
043        private char EDGELABEL_SEPARATOR = '-';
044        private char SENTENCE_SEPARATOR = '\n';
045        private String optionString;
046        private boolean closeStream = true;
047        
048        public BracketWriter() { 
049        }
050
051        public void open(String fileName, String charsetName) throws MaltChainedException {
052                try {
053                        open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName));
054                } catch (FileNotFoundException e) {
055                        throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
056                } catch (UnsupportedEncodingException e) {
057                        throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
058                }       
059        }
060        
061        public void open(OutputStream os, String charsetName) throws MaltChainedException {
062                try {
063                        if (os == System.out || os == System.err) {
064                                closeStream = false;
065                        }
066                        open(new OutputStreamWriter(os, charsetName));
067                } catch (UnsupportedEncodingException e) {
068                        throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
069                }
070        }
071        
072        private void open(OutputStreamWriter osw) throws MaltChainedException {
073                setWriter(new BufferedWriter(osw));
074        }
075
076        public void writeEpilog() throws MaltChainedException {
077
078        }
079        
080        public void writeProlog() throws MaltChainedException {
081        
082        }
083        
084        public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
085                if (syntaxGraph == null || dataFormatInstance == null) {
086                        return;
087                }
088                if (syntaxGraph instanceof PhraseStructure && syntaxGraph.hasTokens()) {
089//                      PhraseStructure phraseStructure = ((PhraseStructure) syntaxGraph);
090                        if (format == PennWriterFormat.PRETTY) {
091                                writeElement(syntaxGraph.getSymbolTables(), ((PhraseStructure) syntaxGraph).getPhraseStructureRoot(), 0);
092                        } else {
093                                writeElement(syntaxGraph.getSymbolTables(), ((PhraseStructure) syntaxGraph).getPhraseStructureRoot());
094                        }
095                        try {
096                                writer.write(SENTENCE_SEPARATOR);
097                                writer.flush();
098                        } catch (IOException e) {
099                                close();
100                                throw new DataFormatException("Could not write to the output file. ", e);
101                        }
102                }
103        }
104        
105        private void writeElement(SymbolTableHandler symbolTables, PhraseStructureNode element) throws MaltChainedException {
106                try {
107                        if (element instanceof TokenNode) {
108                                PhraseStructureNode t = (PhraseStructureNode)element;
109                                SymbolTable table = null;
110                                writer.write(STARTING_BRACKET);
111                                int i = 0;
112                                for (String inputColumn : inputColumns.keySet()) {
113                                        if (i != 0) {
114                                                writer.write(INPUT_SEPARATOR);
115                                        }
116                                        table = symbolTables.getSymbolTable(inputColumns.get(inputColumn).getName());
117                                        if (t.hasLabel(table)) {
118                                                writer.write(t.getLabelSymbol(table));
119                                        }
120                                        if (i == 0) {
121                                                for (String edgeLabelColumn : edgeLabelColumns.keySet()) { 
122                                                        table = symbolTables.getSymbolTable(edgeLabelColumns.get(edgeLabelColumn).getName());
123                                                        if (t.hasParentEdgeLabel(table) && !t.getParent().isRoot() && !t.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
124                                                                writer.write(EDGELABEL_SEPARATOR);
125                                                                writer.write(t.getParentEdgeLabelSymbol(table));
126                                                        }
127                                                }
128                                        }
129                                        i++;
130                                }
131                                writer.write(CLOSING_BRACKET);
132                        } else {
133                                NonTerminalNode nt = (NonTerminalNode)element;
134                                writer.write(STARTING_BRACKET);
135                                SymbolTable table = null;
136                                int i = 0;
137                                for (String phraseLabelColumn : phraseLabelColumns.keySet()) {
138                                        if (i != 0) {
139                                                writer.write(INPUT_SEPARATOR);
140                                        }
141                                        table = symbolTables.getSymbolTable(phraseLabelColumns.get(phraseLabelColumn).getName());
142                                        if (nt.hasLabel(table)) { 
143                                                writer.write(nt.getLabelSymbol(table));
144                                        }
145                                        if (i == 0) {
146                                                for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
147                                                        table = symbolTables.getSymbolTable(edgeLabelColumns.get(edgeLabelColumn).getName());
148                                                        if (nt.hasParentEdgeLabel(table) && !nt.getParent().isRoot() && !nt.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
149                                                                writer.write(EDGELABEL_SEPARATOR);
150                                                                writer.write(nt.getParentEdgeLabelSymbol(table));
151                                                        }
152                                                }
153                                        }
154                                        i++;
155                                }
156                                for (PhraseStructureNode node : ((NonTerminalNode)element).getChildren()) {
157                                        writeElement(symbolTables, node);
158                                }
159                                writer.write(CLOSING_BRACKET);
160                        }
161                } catch (IOException e) {
162                        throw new DataFormatException("Could not write to the output file. ", e);
163                }
164        }
165        
166        private String getIndentation(int depth) {
167                StringBuilder sb = new StringBuilder("");
168                for (int i = 0; i < depth; i++) {
169                        sb.append("\t");
170                }
171                return sb.toString();
172        }
173        
174        private void writeElement(SymbolTableHandler symbolTables, PhraseStructureNode element, int depth) throws MaltChainedException {
175                try {
176                        if (element instanceof TokenNode) {
177                                PhraseStructureNode t = (PhraseStructureNode)element;
178                                SymbolTable table = null;
179                                writer.write("\n" + getIndentation(depth) + STARTING_BRACKET);
180                                int i = 0;
181                                for (String inputColumn : inputColumns.keySet()) {
182                                        if (i != 0) {
183                                                writer.write(INPUT_SEPARATOR);
184                                        }
185                                        table = symbolTables.getSymbolTable(inputColumns.get(inputColumn).getName());
186                                        if (t.hasLabel(table)) {
187                                                writer.write(encodeString(t.getLabelSymbol(table)));
188                                        }
189                                        if (i == 0) {
190                                                for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
191                                                        table = symbolTables.getSymbolTable(edgeLabelColumns.get(edgeLabelColumn).getName());
192                                                        if (t.hasParentEdgeLabel(table) && !t.getParent().isRoot() && !t.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
193                                                                writer.write(EDGELABEL_SEPARATOR);
194                                                                writer.write(t.getParentEdgeLabelSymbol(table));
195                                                        }
196                                                }
197                                        }
198                                        i++;
199                                }
200                                writer.write(CLOSING_BRACKET);
201                        } else {
202                                NonTerminalNode nt = (NonTerminalNode)element;
203                                writer.write("\n" + getIndentation(depth) + STARTING_BRACKET);
204                                SymbolTable table = null;
205                                int i = 0;
206                                for (String phraseLabelColumn : phraseLabelColumns.keySet()) {
207                                        if (i != 0) {
208                                                writer.write(INPUT_SEPARATOR);
209                                        }
210                                        table = symbolTables.getSymbolTable(phraseLabelColumns.get(phraseLabelColumn).getName());
211                                        if (nt.hasLabel(table)) { 
212                                                writer.write(nt.getLabelSymbol(table));
213                                        }
214                                        if (i == 0) {
215                                                for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
216                                                        table = symbolTables.getSymbolTable(edgeLabelColumns.get(edgeLabelColumn).getName());
217                                                        if (nt.hasParentEdgeLabel(table) && !nt.getParent().isRoot() && !nt.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
218                                                                writer.write(EDGELABEL_SEPARATOR);
219                                                                writer.write(nt.getParentEdgeLabelSymbol(table));
220                                                        }
221                                                }
222                                        }
223                                        i++;
224                                }
225                                for (PhraseStructureNode node : ((NonTerminalNode)element).getChildren()) {
226                                        writeElement(symbolTables, node, depth + 1);
227                                }
228                                writer.write("\n" + getIndentation(depth) + CLOSING_BRACKET);
229                        }
230                } catch (IOException e) {
231                        throw new DataFormatException("Could not write to the output file. ", e);
232                }
233        }
234        
235        public BufferedWriter getWriter() {
236                return writer;
237        }
238
239        public void setWriter(BufferedWriter writer) throws MaltChainedException {
240                close();
241                this.writer = writer;
242        }
243        
244        public DataFormatInstance getDataFormatInstance() {
245                return dataFormatInstance;
246        }
247
248        public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
249                this.dataFormatInstance = dataFormatInstance;
250                inputColumns = dataFormatInstance.getInputColumnDescriptions();
251                edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions();
252                phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions();
253        }
254
255        public String getOptions() {
256                return optionString;
257        }
258        
259        public void setOptions(String optionString) throws MaltChainedException {
260                this.optionString = optionString;
261                format = PennWriterFormat.DEFAULT;
262
263                String[] argv;
264                try {
265                        argv = optionString.split("[_\\p{Blank}]");
266                } catch (PatternSyntaxException e) {
267                        throw new DataFormatException("Could not split the bracket writer option '"+optionString+"'. ", e);
268                }
269                for (int i=0; i < argv.length-1; i++) {
270                        if(argv[i].charAt(0) != '-') {
271                                throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
272                        }
273                        if(++i>=argv.length) {
274                                throw new DataFormatException("The last argument does not have any value. ");
275                        }
276                        switch(argv[i-1].charAt(1)) {
277                        case 'f': 
278                                if (argv[i].equals("p")) {
279                                        format = PennWriterFormat.PRETTY;
280                                } else if (argv[i].equals("p")) {
281                                        format = PennWriterFormat.DEFAULT;
282                                }
283                                break;
284                        default:
285                                throw new DataFormatException("Unknown bracket writer option: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");             
286                        }
287                }       
288        }
289        
290        public void close() throws MaltChainedException {
291                try {
292                        if (writer != null) {
293                                writer.flush();
294                                if (closeStream) {
295                                        writer.close();
296                                }
297                                writer = null;
298                        }
299                }   catch (IOException e) {
300                        throw new DataFormatException("Could not close the output file. ", e);
301                } 
302        }
303        
304        private String encodeString(String string) {
305                return string.replace("(", "-LRB-").replace(")", "-RRB-").replace("[", "-LSB-").replace("]", "-RSB-").replace("{", "-LCB-").replace("}", "-RCB-");
306        }
307}