001package org.maltparser.core.feature.spec.reader;
002
003import java.io.BufferedReader;
004import java.io.IOException;
005import java.io.InputStreamReader;
006import java.net.URL;
007import java.util.ArrayList;
008import java.util.EnumMap;
009import java.util.regex.Pattern;
010
011import org.maltparser.core.exception.MaltChainedException;
012import org.maltparser.core.feature.FeatureException;
013import org.maltparser.core.feature.spec.SpecificationModels;
014/**
015*
016*
017* @author Johan Hall
018*/
019public class ParReader implements FeatureSpecReader {
020        public enum DataStructures {
021                STACK, INPUT, LEFTCONTEXT, RIGHTCONTEXT
022        };
023        public enum ColumnNames {
024                POS, DEP, LEX, LEMMA, CPOS, FEATS
025        };
026        private EnumMap<ColumnNames, String> columnNameMap;
027        private EnumMap<DataStructures, String> dataStructuresMap;
028        private boolean useSplitFeats = true;
029        private boolean covington = false;
030        private boolean pppath;
031        private boolean pplifted;
032        private boolean ppcoveredRoot;
033        
034        public ParReader() throws MaltChainedException {
035                initializeColumnNameMap();
036                initializeDataStructuresMap();
037                setPppath(false);
038                setPplifted(false);
039                setPpcoveredRoot(false);
040        }
041        
042        public void load(URL specModelURL, SpecificationModels featureSpecModels) throws MaltChainedException {
043                BufferedReader br = null;
044                Pattern tabPattern = Pattern.compile("\t");
045                if (specModelURL == null) {
046                        throw new FeatureException("The feature specification file cannot be found. ");
047                }
048                try {
049                        br = new BufferedReader(new InputStreamReader(specModelURL.openStream()));
050                } catch (IOException e) {
051                        throw new FeatureException("Could not read the feature specification file '"+specModelURL.toString()+"'. ", e);
052                }               
053                
054                if (br != null) {
055                        int specModelIndex = featureSpecModels.getNextIndex();
056                        String fileLine;
057                        String items[];
058                        StringBuilder featureText = new StringBuilder();
059                        String splitfeats = "";
060                        ArrayList<String> fileLines = new ArrayList<String>();
061                        ArrayList<String> orderFileLines = new ArrayList<String>();
062                        while (true) {
063                                try {
064                                        fileLine = br.readLine();
065                                } catch (IOException e) {
066                                        throw new FeatureException("Could not read the feature specification file '"+specModelURL.toString()+"'. ", e);
067                                }
068                                if (fileLine == null) {
069                                        break;
070                                }
071                                if (fileLine.length() <= 1 && fileLine.trim().substring(0, 2).trim().equals("--")) {
072                                        continue;
073                                }
074                                fileLines.add(fileLine);
075                        }
076                        try {
077                                br.close();
078                        } catch (IOException e) {
079                                throw new FeatureException("Could not close the feature specification file '"+specModelURL.toString()+"'. ", e);
080                        }
081
082                        for (int j = 0; j < fileLines.size(); j++) {
083                                orderFileLines.add(fileLines.get(j));
084                        }
085
086                        boolean deprel = false;
087                        for (int j=0; j < orderFileLines.size(); j++) {
088                                deprel = false;
089                                featureText.setLength(0);
090                                splitfeats = "";
091                                items = tabPattern.split(orderFileLines.get(j));
092                                if (items.length < 2) {
093                                        throw new FeatureException("The feature specification file '"+specModelURL.toString()+"' must contain at least two columns.");
094                                }
095                                if (!(columnNameMap.containsKey(ColumnNames.valueOf(items[0].trim())) || columnNameMap.containsValue(items[0].trim()))) {
096                                        throw new FeatureException("Column one in the feature specification file '"+specModelURL.toString()+"' contains an unknown value '"+items[0].trim()+"'. ");
097                                }
098                                if (items[0].trim().equalsIgnoreCase("DEP") || items[0].trim().equalsIgnoreCase("DEPREL")) {
099                                        featureText.append("OutputColumn(DEPREL, ");
100                                        deprel = true;
101                                } else {
102                                        if (columnNameMap.containsKey(ColumnNames.valueOf(items[0].trim()))) {
103                                                featureText.append("InputColumn("+columnNameMap.get(ColumnNames.valueOf(items[0].trim()))+", ");
104                                        } else if (columnNameMap.containsValue(items[0].trim())) {
105                                                featureText.append("InputColumn("+items[0].trim()+", ");
106                                        }
107                                        if (items[0].trim().equalsIgnoreCase("FEATS") && isUseSplitFeats()) {
108                                                splitfeats = "Split(";
109                                        }
110                                }
111                                if (!(items[1].trim().equalsIgnoreCase("STACK") || items[1].trim().equalsIgnoreCase("INPUT") || items[1].trim().equalsIgnoreCase("CONTEXT"))) {
112                                        throw new FeatureException("Column two in the feature specification file '"+specModelURL.toString()+"' should be either 'STACK', 'INPUT' or 'CONTEXT' (Covington), not '"+items[1].trim()+"'. ");
113                                }
114                                int offset = 0;
115                                if (items.length >= 3) {
116                                        try {
117                                                offset = new Integer(Integer.parseInt(items[2]));
118                                        } catch (NumberFormatException e) {
119                                                throw new FeatureException("The feature specification file '"+specModelURL.toString()+"' contains a illegal integer value. ", e);
120                                        }
121                                }
122                                String functionArg = "";
123                                
124                                if (items[1].trim().equalsIgnoreCase("CONTEXT")) {
125                                        if (offset >= 0) {
126                                                functionArg = dataStructuresMap.get(DataStructures.valueOf("LEFTCONTEXT"))+"["+offset+"]";
127                                        } else {
128                                                functionArg = dataStructuresMap.get(DataStructures.valueOf("RIGHTCONTEXT"))+"["+Math.abs(offset + 1)+"]";
129                                        }
130                                } else if (dataStructuresMap.containsKey(DataStructures.valueOf(items[1].trim()))) {
131                                        if (covington == true) {
132                                                if (dataStructuresMap.get(DataStructures.valueOf(items[1].trim())).equalsIgnoreCase("Stack")) {
133                                                        functionArg = "Left["+offset+"]";
134                                                } else {
135                                                        functionArg = "Right["+offset+"]";
136                                                }
137                                        } else {
138                                                functionArg = dataStructuresMap.get(DataStructures.valueOf(items[1].trim()))+"["+offset+"]";
139                                        }
140                                } else if (dataStructuresMap.containsValue(items[1].trim())) {
141                                        if (covington == true) {
142                                                if (items[1].trim().equalsIgnoreCase("Stack")) {
143                                                        functionArg = "Left["+offset+"]";
144                                                } else {
145                                                        functionArg = "Right["+offset+"]";
146                                                }
147                                        } else {
148                                                functionArg = items[1].trim()+"["+offset+"]";
149                                        }
150                                        
151                                } else {
152                                        throw new FeatureException("Column two in the feature specification file '"+specModelURL.toString()+"' should not contain the value '"+items[1].trim());
153                                }
154        
155                                int linearOffset = 0;
156                                int headOffset = 0;
157                                int depOffset = 0;
158                                int sibOffset = 0;
159                                int suffixLength = 0;
160                                if (items.length >= 4) { linearOffset = new Integer(Integer.parseInt(items[3])); }
161                                if (items.length >= 5) { headOffset = new Integer(Integer.parseInt(items[4])); }
162                                if (items.length >= 6) { depOffset = new Integer(Integer.parseInt(items[5])); }
163                                if (items.length >= 7) { sibOffset = new Integer(Integer.parseInt(items[6])); }
164                                if (items.length >= 8) { suffixLength = new Integer(Integer.parseInt(items[7])); }
165                                if (linearOffset < 0) {
166                                        linearOffset = Math.abs(linearOffset);
167                                        for (int i = 0; i < linearOffset; i++) {
168                                                functionArg = "pred("+functionArg+")"; 
169                                        }
170                                } else if (linearOffset > 0) {
171                                        for (int i = 0; i < linearOffset; i++) {
172                                                functionArg = "succ("+functionArg+")"; 
173                                        }
174                                } 
175                                if (headOffset >= 0) {
176                                        for (int i = 0; i < headOffset; i++) {
177                                                functionArg = "head("+functionArg+")"; 
178                                        }
179                                } else {
180                                        throw new FeatureException("The feature specification file '"+specModelURL.toString()+"' should not contain a negative head function value. ");
181                                }
182                                if (depOffset < 0) {
183                                        depOffset = Math.abs(depOffset);
184                                        for (int i = 0; i < depOffset; i++) {
185                                                functionArg = "ldep("+functionArg+")"; 
186                                        }
187                                } else if (depOffset > 0) {
188                                        for (int i = 0; i < depOffset; i++) {
189                                                functionArg = "rdep("+functionArg+")";
190                                        }                                                       
191                                }
192                                if (sibOffset < 0) {
193                                        sibOffset = Math.abs(sibOffset);
194                                        for (int i = 0; i < sibOffset; i++) {
195                                                functionArg = "lsib("+functionArg+")"; 
196                                        }
197                                } else if (sibOffset > 0) {
198                                        for (int i = 0; i < sibOffset; i++) {
199                                                functionArg = "rsib("+functionArg+")"; 
200                                        }                                                       
201                                }
202                                
203                                if (deprel == true && (pppath == true || pplifted == true || ppcoveredRoot == true)) {
204                                        featureSpecModels.add(specModelIndex, mergePseudoProjColumns(functionArg));
205                                } else {
206                                        if (suffixLength != 0) {
207                                                featureSpecModels.add(specModelIndex, "Suffix("+featureText.toString()+functionArg+"),"+suffixLength+")");
208                                        } else if (splitfeats.equals("Split(")) {
209                                                featureSpecModels.add(specModelIndex, splitfeats+featureText.toString()+functionArg+"),\\|)");
210                                        } else {
211                                                featureSpecModels.add(specModelIndex, featureText.toString()+functionArg+")");
212                                        }
213                                }
214
215                        }
216                }
217        }
218
219        private String mergePseudoProjColumns(String functionArg) {
220                StringBuilder newFeatureText = new StringBuilder();
221                int c = 1; 
222                
223                if (pplifted == true) { c++; };
224                if (pppath == true) { c++; };
225                if (ppcoveredRoot == true) { c++; };
226                
227                if (c == 1) { // no merge
228                        newFeatureText.append("OutputColumn(DEPREL, ");
229                        newFeatureText.append(functionArg);
230                        newFeatureText.append(')');
231                        return newFeatureText.toString();
232                }
233                if (c == 2) {
234                        newFeatureText.append("Merge(");
235                        newFeatureText.append("OutputColumn(DEPREL, ");
236                        newFeatureText.append(functionArg);
237                        newFeatureText.append("), ");
238                        if (pplifted == true) {
239                                newFeatureText.append("OutputTable(PPLIFTED, ");
240                                newFeatureText.append(functionArg);
241                                newFeatureText.append(")");
242                        }
243                        if (pppath == true) {
244                                newFeatureText.append("OutputTable(PPPATH, ");
245                                newFeatureText.append(functionArg);
246                                newFeatureText.append(")");
247                        }
248                        if (ppcoveredRoot == true) {
249                                newFeatureText.append("OutputTable(PPCOVERED, ");
250                                newFeatureText.append(functionArg);
251                                newFeatureText.append(")");
252                        }
253                        newFeatureText.append(")");
254                } else if (c == 3) { // use Merge3 
255                        int i = 0;
256                        newFeatureText.append("Merge3(");
257                        newFeatureText.append("OutputColumn(DEPREL, ");
258                        newFeatureText.append(functionArg);
259                        newFeatureText.append("), ");
260                        i++;
261                        if (pplifted == true) {
262                                newFeatureText.append("OutputTable(PPLIFTED, ");
263                                newFeatureText.append(functionArg);
264                                i++;
265                                if (i<3) { 
266                                        newFeatureText.append("), ");
267                                } else {
268                                        newFeatureText.append(")");
269                                }
270                        }
271                        if (pppath == true) {
272                                newFeatureText.append("OutputTable(PPPATH, ");
273                                newFeatureText.append(functionArg);
274                                i++;
275                                if (i<3) { 
276                                        newFeatureText.append("), ");
277                                } else {
278                                        newFeatureText.append(")");
279                                }
280                        }
281                        if (ppcoveredRoot == true) {
282                                newFeatureText.append("OutputTable(PPCOVERED, ");
283                                newFeatureText.append(functionArg);
284                                i++;
285                                if (i<3) { 
286                                        newFeatureText.append("), ");
287                                } else {
288                                        newFeatureText.append(")");
289                                }
290                        }
291                        newFeatureText.append(")");
292                } else { // c == 4
293                        newFeatureText.append("Merge(Merge(");
294                        newFeatureText.append("OutputColumn(DEPREL, ");
295                        newFeatureText.append(functionArg);
296                        newFeatureText.append("), ");
297                        newFeatureText.append("OutputTable(PPLIFTED, ");
298                        newFeatureText.append(functionArg);
299                        newFeatureText.append(")), Merge(");
300                        newFeatureText.append("OutputTable(PPPATH, ");
301                        newFeatureText.append(functionArg);
302                        newFeatureText.append("), ");
303                        newFeatureText.append("OutputTable(PPCOVERED, ");
304                        newFeatureText.append(functionArg);
305                        newFeatureText.append(")))");
306                }
307                return newFeatureText.toString();
308        }
309        
310        public EnumMap<ColumnNames, String> getColumnNameMap() {
311                return columnNameMap;
312        }
313
314        public void initializeColumnNameMap() {
315                columnNameMap = new EnumMap<ColumnNames, String>(ColumnNames.class);
316                columnNameMap.put(ColumnNames.POS, "POSTAG");
317                columnNameMap.put(ColumnNames.CPOS, "CPOSTAG");
318                columnNameMap.put(ColumnNames.DEP, "DEPREL");
319                columnNameMap.put(ColumnNames.LEX, "FORM");
320                columnNameMap.put(ColumnNames.LEMMA, "LEMMA");
321                columnNameMap.put(ColumnNames.FEATS, "FEATS");
322        }
323
324        public void setColumnNameMap(EnumMap<ColumnNames, String> columnNameMap) {
325                this.columnNameMap = columnNameMap;
326        }
327        
328        public EnumMap<DataStructures, String> getDataStructuresMap() {
329                return dataStructuresMap;
330        }
331
332        //TODO Fix covington
333        public void initializeDataStructuresMap() {
334                dataStructuresMap = new EnumMap<DataStructures, String>(DataStructures.class);
335                dataStructuresMap.put(DataStructures.STACK, "Stack");
336                dataStructuresMap.put(DataStructures.INPUT, "Input");
337        }
338
339        public void setDataStructuresMap(EnumMap<DataStructures, String> dataStructuresMap) {
340                this.dataStructuresMap = dataStructuresMap;
341        }
342        
343        public boolean isUseSplitFeats() {
344                return useSplitFeats;
345        }
346
347        public void setUseSplitFeats(boolean useSplitFeats) {
348                this.useSplitFeats = useSplitFeats;
349        }
350
351        public boolean isCovington() {
352                return covington;
353        }
354
355        public void setCovington(boolean covington) {
356                this.covington = covington;
357        }
358
359        public boolean isPppath() {
360                return pppath;
361        }
362
363        public void setPppath(boolean pppath) {
364                this.pppath = pppath;
365        }
366
367        public boolean isPplifted() {
368                return pplifted;
369        }
370
371        public void setPplifted(boolean pplifted) {
372                this.pplifted = pplifted;
373        }
374
375        public boolean isPpcoveredRoot() {
376                return ppcoveredRoot;
377        }
378
379        public void setPpcoveredRoot(boolean ppcoveredRoot) {
380                this.ppcoveredRoot = ppcoveredRoot;
381        }
382
383        public String toString() {
384                StringBuilder sb = new StringBuilder();
385                sb.append("Mapping of column names:\n");
386                for (ColumnNames columnName : ColumnNames.values()) {
387                        sb.append(columnName.toString()+"\t"+columnNameMap.get(columnName)+"\n");
388                }
389                sb.append("Mapping of data structures:\n");
390                for (DataStructures dataStruct : DataStructures.values()) {
391                        sb.append(dataStruct.toString()+"\t"+dataStructuresMap.get(dataStruct)+"\n");
392                }
393                sb.append("Split FEATS column: "+useSplitFeats+"\n");
394                return sb.toString();
395        }
396}