001    package org.maltparser.core.feature.spec.reader;
002    
003    import java.io.BufferedReader;
004    import java.io.IOException;
005    import java.io.InputStreamReader;
006    import java.net.URL;
007    import java.util.ArrayList;
008    import java.util.EnumMap;
009    import java.util.regex.Pattern;
010    
011    import org.maltparser.core.exception.MaltChainedException;
012    import org.maltparser.core.feature.FeatureException;
013    import org.maltparser.core.feature.spec.SpecificationModels;
014    /**
015    *
016    *
017    * @author Johan Hall
018    */
019    public class ParReader implements FeatureSpecReader {
020            public enum DataStructures {
021                    STACK, INPUT, LEFTCONTEXT, RIGHTCONTEXT
022            };
023            public enum ColumnNames {
024                    POS, DEP, LEX, LEMMA, CPOS, FEATS
025            };
026            private EnumMap<ColumnNames, String> columnNameMap;
027            private EnumMap<DataStructures, String> dataStructuresMap;
028            private boolean useSplitFeats = true;
029            private boolean malt04Emulation = false;
030            private boolean covington = false;
031            private boolean pppath;
032            private boolean pplifted;
033            private boolean ppcoveredRoot;
034            
035            public ParReader() throws MaltChainedException {
036                    initializeColumnNameMap();
037                    initializeDataStructuresMap();
038                    setPppath(false);
039                    setPplifted(false);
040                    setPpcoveredRoot(false);
041            }
042            
043            public void load(URL specModelURL, SpecificationModels featureSpecModels) throws MaltChainedException {
044                    BufferedReader br = null;
045                    Pattern tabPattern = Pattern.compile("\t");
046                    if (specModelURL == null) {
047                            throw new FeatureException("The feature specification file cannot be found. ");
048                    }
049                    try {
050                            br = new BufferedReader(new InputStreamReader(specModelURL.openStream()));
051                    } catch (IOException e) {
052                            throw new FeatureException("Could not read the feature specification file '"+specModelURL.toString()+"'. ", e);
053                    }               
054                    
055                    if (br != null) {
056                            int specModelIndex = featureSpecModels.getNextIndex();
057                            String fileLine;
058                            String items[];
059                            StringBuilder featureText = new StringBuilder();
060                            String splitfeats = "";
061                            ArrayList<String> fileLines = new ArrayList<String>();
062                            ArrayList<String> orderFileLines = new ArrayList<String>();
063                            while (true) {
064                                    try {
065                                            fileLine = br.readLine();
066                                    } catch (IOException e) {
067                                            throw new FeatureException("Could not read the feature specification file '"+specModelURL.toString()+"'. ", e);
068                                    }
069                                    if (fileLine == null) {
070                                            break;
071                                    }
072                                    if (fileLine.length() <= 1 && fileLine.trim().substring(0, 2).trim().equals("--")) {
073                                            continue;
074                                    }
075                                    fileLines.add(fileLine);
076                            }
077                            try {
078                                    br.close();
079                            } catch (IOException e) {
080                                    throw new FeatureException("Could not close the feature specification file '"+specModelURL.toString()+"'. ", e);
081                            }
082                            if (malt04Emulation == true) {
083                                    for (int i = 0; i < ColumnNames.values().length; i++) {
084                                            for (int j = 0; j < fileLines.size(); j++) {
085                                                    if (fileLines.get(j).startsWith(ColumnNames.values()[i].toString())) {
086                                                            orderFileLines.add(fileLines.get(j));
087                                                    }
088                                            }
089                                    }
090                            } else {
091                                    for (int j = 0; j < fileLines.size(); j++) {
092                                            orderFileLines.add(fileLines.get(j));
093                                    }
094                            }
095                            boolean deprel = false;
096                            for (int j=0; j < orderFileLines.size(); j++) {
097                                    deprel = false;
098                                    featureText.setLength(0);
099                                    splitfeats = "";
100                                    items = tabPattern.split(orderFileLines.get(j));
101                                    if (items.length < 2) {
102                                            throw new FeatureException("The feature specification file '"+specModelURL.toString()+"' must contain at least two columns.");
103                                    }
104                                    if (!(columnNameMap.containsKey(ColumnNames.valueOf(items[0].trim())) || columnNameMap.containsValue(items[0].trim()))) {
105                                            throw new FeatureException("Column one in the feature specification file '"+specModelURL.toString()+"' contains an unknown value '"+items[0].trim()+"'. ");
106                                    }
107                                    if (items[0].trim().equalsIgnoreCase("DEP") || items[0].trim().equalsIgnoreCase("DEPREL")) {
108                                            featureText.append("OutputColumn(DEPREL, ");
109                                            deprel = true;
110                                    } else {
111                                            if (columnNameMap.containsKey(ColumnNames.valueOf(items[0].trim()))) {
112                                                    featureText.append("InputColumn("+columnNameMap.get(ColumnNames.valueOf(items[0].trim()))+", ");
113                                            } else if (columnNameMap.containsValue(items[0].trim())) {
114                                                    featureText.append("InputColumn("+items[0].trim()+", ");
115                                            }
116                                            if (items[0].trim().equalsIgnoreCase("FEATS") && isUseSplitFeats()) {
117                                                    splitfeats = "Split(";
118                                            }
119                                    }
120                                    if (!(items[1].trim().equalsIgnoreCase("STACK") || items[1].trim().equalsIgnoreCase("INPUT") || items[1].trim().equalsIgnoreCase("CONTEXT"))) {
121                                            throw new FeatureException("Column two in the feature specification file '"+specModelURL.toString()+"' should be either 'STACK', 'INPUT' or 'CONTEXT' (Covington), not '"+items[1].trim()+"'. ");
122                                    }
123                                    int offset = 0;
124                                    if (items.length >= 3) {
125                                            try {
126                                                    offset = new Integer(Integer.parseInt(items[2]));
127                                            } catch (NumberFormatException e) {
128                                                    throw new FeatureException("The feature specification file '"+specModelURL.toString()+"' contains a illegal integer value. ", e);
129                                            }
130                                    }
131                                    String functionArg = "";
132                                    
133                                    if (items[1].trim().equalsIgnoreCase("CONTEXT")) {
134                                            if (offset >= 0) {
135                                                    functionArg = dataStructuresMap.get(DataStructures.valueOf("LEFTCONTEXT"))+"["+offset+"]";
136                                            } else {
137                                                    functionArg = dataStructuresMap.get(DataStructures.valueOf("RIGHTCONTEXT"))+"["+Math.abs(offset + 1)+"]";
138                                            }
139                                    } else if (dataStructuresMap.containsKey(DataStructures.valueOf(items[1].trim()))) {
140                                            if (covington == true) {
141                                                    if (dataStructuresMap.get(DataStructures.valueOf(items[1].trim())).equalsIgnoreCase("Stack")) {
142                                                            functionArg = "Left["+offset+"]";
143                                                    } else {
144                                                            functionArg = "Right["+offset+"]";
145                                                    }
146                                            } else {
147                                                    functionArg = dataStructuresMap.get(DataStructures.valueOf(items[1].trim()))+"["+offset+"]";
148                                            }
149                                    } else if (dataStructuresMap.containsValue(items[1].trim())) {
150                                            if (covington == true) {
151                                                    if (items[1].trim().equalsIgnoreCase("Stack")) {
152                                                            functionArg = "Left["+offset+"]";
153                                                    } else {
154                                                            functionArg = "Right["+offset+"]";
155                                                    }
156                                            } else {
157                                                    functionArg = items[1].trim()+"["+offset+"]";
158                                            }
159                                            
160                                    } else {
161                                            throw new FeatureException("Column two in the feature specification file '"+specModelURL.toString()+"' should not contain the value '"+items[1].trim());
162                                    }
163            
164                                    int linearOffset = 0;
165                                    int headOffset = 0;
166                                    int depOffset = 0;
167                                    int sibOffset = 0;
168                                    int suffixLength = 0;
169                                    if (items.length >= 4) { linearOffset = new Integer(Integer.parseInt(items[3])); }
170                                    if (items.length >= 5) { headOffset = new Integer(Integer.parseInt(items[4])); }
171                                    if (items.length >= 6) { depOffset = new Integer(Integer.parseInt(items[5])); }
172                                    if (items.length >= 7) { sibOffset = new Integer(Integer.parseInt(items[6])); }
173                                    if (items.length >= 8) { suffixLength = new Integer(Integer.parseInt(items[7])); }
174                                    if (linearOffset < 0) {
175                                            linearOffset = Math.abs(linearOffset);
176                                            for (int i = 0; i < linearOffset; i++) {
177                                                    functionArg = "pred("+functionArg+")"; 
178                                            }
179                                    } else if (linearOffset > 0) {
180                                            for (int i = 0; i < linearOffset; i++) {
181                                                    functionArg = "succ("+functionArg+")"; 
182                                            }
183                                    } 
184                                    if (headOffset >= 0) {
185                                            for (int i = 0; i < headOffset; i++) {
186                                                    functionArg = "head("+functionArg+")"; 
187                                            }
188                                    } else {
189                                            throw new FeatureException("The feature specification file '"+specModelURL.toString()+"' should not contain a negative head function value. ");
190                                    }
191                                    if (depOffset < 0) {
192                                            depOffset = Math.abs(depOffset);
193                                            for (int i = 0; i < depOffset; i++) {
194                                                    functionArg = "ldep("+functionArg+")"; 
195                                            }
196                                    } else if (depOffset > 0) {
197                                            for (int i = 0; i < depOffset; i++) {
198                                                    if (malt04Emulation == true) {
199                                                            functionArg = "rdep2("+functionArg+")";
200                                                    } else {
201                                                            functionArg = "rdep("+functionArg+")";
202                                                    }
203                                            }                                                       
204                                    }
205                                    if (sibOffset < 0) {
206                                            sibOffset = Math.abs(sibOffset);
207                                            for (int i = 0; i < sibOffset; i++) {
208                                                    functionArg = "lsib("+functionArg+")"; 
209                                            }
210                                    } else if (sibOffset > 0) {
211                                            for (int i = 0; i < sibOffset; i++) {
212                                                    functionArg = "rsib("+functionArg+")"; 
213                                            }                                                       
214                                    }
215                                    
216                                    if (deprel == true && (pppath == true || pplifted == true || ppcoveredRoot == true)) {
217                                            featureSpecModels.add(specModelIndex, mergePseudoProjColumns(functionArg));
218                                    } else {
219                                            if (suffixLength != 0) {
220                                                    featureSpecModels.add(specModelIndex, "Suffix("+featureText.toString()+functionArg+"),"+suffixLength+")");
221                                            } else if (splitfeats.equals("Split(")) {
222                                                    featureSpecModels.add(specModelIndex, splitfeats+featureText.toString()+functionArg+"),\\|)");
223                                            } else {
224                                                    featureSpecModels.add(specModelIndex, featureText.toString()+functionArg+")");
225                                            }
226                                    }
227    
228                            }
229                    }
230            }
231    
232            private String mergePseudoProjColumns(String functionArg) {
233                    StringBuilder newFeatureText = new StringBuilder();
234                    int c = 1; 
235                    
236                    if (pplifted == true) { c++; };
237                    if (pppath == true) { c++; };
238                    if (ppcoveredRoot == true) { c++; };
239                    
240                    if (c == 1) { // no merge
241                            newFeatureText.append("OutputColumn(DEPREL, ");
242                            newFeatureText.append(functionArg);
243                            newFeatureText.append(')');
244                            return newFeatureText.toString();
245                    }
246                    if (c == 2) {
247                            newFeatureText.append("Merge(");
248                            newFeatureText.append("OutputColumn(DEPREL, ");
249                            newFeatureText.append(functionArg);
250                            newFeatureText.append("), ");
251                            if (pplifted == true) {
252                                    newFeatureText.append("OutputTable(PPLIFTED, ");
253                                    newFeatureText.append(functionArg);
254                                    newFeatureText.append(")");
255                            }
256                            if (pppath == true) {
257                                    newFeatureText.append("OutputTable(PPPATH, ");
258                                    newFeatureText.append(functionArg);
259                                    newFeatureText.append(")");
260                            }
261                            if (ppcoveredRoot == true) {
262                                    newFeatureText.append("OutputTable(PPCOVERED, ");
263                                    newFeatureText.append(functionArg);
264                                    newFeatureText.append(")");
265                            }
266                            newFeatureText.append(")");
267                    } else if (c == 3) { // use Merge3 
268                            int i = 0;
269                            newFeatureText.append("Merge3(");
270                            newFeatureText.append("OutputColumn(DEPREL, ");
271                            newFeatureText.append(functionArg);
272                            newFeatureText.append("), ");
273                            i++;
274                            if (pplifted == true) {
275                                    newFeatureText.append("OutputTable(PPLIFTED, ");
276                                    newFeatureText.append(functionArg);
277                                    i++;
278                                    if (i<3) { 
279                                            newFeatureText.append("), ");
280                                    } else {
281                                            newFeatureText.append(")");
282                                    }
283                            }
284                            if (pppath == true) {
285                                    newFeatureText.append("OutputTable(PPPATH, ");
286                                    newFeatureText.append(functionArg);
287                                    i++;
288                                    if (i<3) { 
289                                            newFeatureText.append("), ");
290                                    } else {
291                                            newFeatureText.append(")");
292                                    }
293                            }
294                            if (ppcoveredRoot == true) {
295                                    newFeatureText.append("OutputTable(PPCOVERED, ");
296                                    newFeatureText.append(functionArg);
297                                    i++;
298                                    if (i<3) { 
299                                            newFeatureText.append("), ");
300                                    } else {
301                                            newFeatureText.append(")");
302                                    }
303                            }
304                            newFeatureText.append(")");
305                    } else { // c == 4
306                            newFeatureText.append("Merge(Merge(");
307                            newFeatureText.append("OutputColumn(DEPREL, ");
308                            newFeatureText.append(functionArg);
309                            newFeatureText.append("), ");
310                            newFeatureText.append("OutputTable(PPLIFTED, ");
311                            newFeatureText.append(functionArg);
312                            newFeatureText.append(")), Merge(");
313                            newFeatureText.append("OutputTable(PPPATH, ");
314                            newFeatureText.append(functionArg);
315                            newFeatureText.append("), ");
316                            newFeatureText.append("OutputTable(PPCOVERED, ");
317                            newFeatureText.append(functionArg);
318                            newFeatureText.append(")))");
319                    }
320                    return newFeatureText.toString();
321            }
322            
323            public EnumMap<ColumnNames, String> getColumnNameMap() {
324                    return columnNameMap;
325            }
326    
327            public void initializeColumnNameMap() {
328                    columnNameMap = new EnumMap<ColumnNames, String>(ColumnNames.class);
329                    columnNameMap.put(ColumnNames.POS, "POSTAG");
330                    columnNameMap.put(ColumnNames.CPOS, "CPOSTAG");
331                    columnNameMap.put(ColumnNames.DEP, "DEPREL");
332                    columnNameMap.put(ColumnNames.LEX, "FORM");
333                    columnNameMap.put(ColumnNames.LEMMA, "LEMMA");
334                    columnNameMap.put(ColumnNames.FEATS, "FEATS");
335            }
336    
337            public void setColumnNameMap(EnumMap<ColumnNames, String> columnNameMap) {
338                    this.columnNameMap = columnNameMap;
339            }
340            
341            public EnumMap<DataStructures, String> getDataStructuresMap() {
342                    return dataStructuresMap;
343            }
344    
345            //TODO Fix covington
346            public void initializeDataStructuresMap() {
347                    dataStructuresMap = new EnumMap<DataStructures, String>(DataStructures.class);
348                    dataStructuresMap.put(DataStructures.STACK, "Stack");
349                    dataStructuresMap.put(DataStructures.INPUT, "Input");
350            }
351    
352            public void setDataStructuresMap(EnumMap<DataStructures, String> dataStructuresMap) {
353                    this.dataStructuresMap = dataStructuresMap;
354            }
355            
356            public boolean isUseSplitFeats() {
357                    return useSplitFeats;
358            }
359    
360            public void setUseSplitFeats(boolean useSplitFeats) {
361                    this.useSplitFeats = useSplitFeats;
362            }
363    
364            
365            public boolean isMalt04Emulation() {
366                    return malt04Emulation;
367            }
368    
369            public void setMalt04Emulation(boolean malt04Emulation) {
370                    this.malt04Emulation = malt04Emulation;
371            }
372    
373            public boolean isCovington() {
374                    return covington;
375            }
376    
377            public void setCovington(boolean covington) {
378                    this.covington = covington;
379            }
380    
381            public boolean isPppath() {
382                    return pppath;
383            }
384    
385            public void setPppath(boolean pppath) {
386                    this.pppath = pppath;
387            }
388    
389            public boolean isPplifted() {
390                    return pplifted;
391            }
392    
393            public void setPplifted(boolean pplifted) {
394                    this.pplifted = pplifted;
395            }
396    
397            public boolean isPpcoveredRoot() {
398                    return ppcoveredRoot;
399            }
400    
401            public void setPpcoveredRoot(boolean ppcoveredRoot) {
402                    this.ppcoveredRoot = ppcoveredRoot;
403            }
404    
405            public String toString() {
406                    StringBuilder sb = new StringBuilder();
407                    sb.append("Mapping of column names:\n");
408                    for (ColumnNames columnName : ColumnNames.values()) {
409                            sb.append(columnName.toString()+"\t"+columnNameMap.get(columnName)+"\n");
410                    }
411                    sb.append("Mapping of data structures:\n");
412                    for (DataStructures dataStruct : DataStructures.values()) {
413                            sb.append(dataStruct.toString()+"\t"+dataStructuresMap.get(dataStruct)+"\n");
414                    }
415                    sb.append("Split FEATS column: "+useSplitFeats+"\n");
416                    return sb.toString();
417            }
418    }