001package org.maltparser.core.feature.spec.reader; 002 003import java.io.BufferedReader; 004import java.io.IOException; 005import java.io.InputStreamReader; 006import java.net.URL; 007import java.util.ArrayList; 008import java.util.EnumMap; 009import java.util.regex.Pattern; 010 011import org.maltparser.core.exception.MaltChainedException; 012import org.maltparser.core.feature.FeatureException; 013import org.maltparser.core.feature.spec.SpecificationModels; 014/** 015* 016* 017* @author Johan Hall 018*/ 019public class ParReader implements FeatureSpecReader { 020 public enum DataStructures { 021 STACK, INPUT, LEFTCONTEXT, RIGHTCONTEXT 022 }; 023 public enum ColumnNames { 024 POS, DEP, LEX, LEMMA, CPOS, FEATS 025 }; 026 private EnumMap<ColumnNames, String> columnNameMap; 027 private EnumMap<DataStructures, String> dataStructuresMap; 028 private boolean useSplitFeats = true; 029 private boolean covington = false; 030 private boolean pppath; 031 private boolean pplifted; 032 private boolean ppcoveredRoot; 033 034 public ParReader() throws MaltChainedException { 035 initializeColumnNameMap(); 036 initializeDataStructuresMap(); 037 setPppath(false); 038 setPplifted(false); 039 setPpcoveredRoot(false); 040 } 041 042 public void load(URL specModelURL, SpecificationModels featureSpecModels) throws MaltChainedException { 043 BufferedReader br = null; 044 Pattern tabPattern = Pattern.compile("\t"); 045 if (specModelURL == null) { 046 throw new FeatureException("The feature specification file cannot be found. "); 047 } 048 try { 049 br = new BufferedReader(new InputStreamReader(specModelURL.openStream())); 050 } catch (IOException e) { 051 throw new FeatureException("Could not read the feature specification file '"+specModelURL.toString()+"'. ", e); 052 } 053 054 if (br != null) { 055 int specModelIndex = featureSpecModels.getNextIndex(); 056 String fileLine; 057 String items[]; 058 StringBuilder featureText = new StringBuilder(); 059 String splitfeats = ""; 060 ArrayList<String> fileLines = new ArrayList<String>(); 061 ArrayList<String> orderFileLines = new ArrayList<String>(); 062 while (true) { 063 try { 064 fileLine = br.readLine(); 065 } catch (IOException e) { 066 throw new FeatureException("Could not read the feature specification file '"+specModelURL.toString()+"'. ", e); 067 } 068 if (fileLine == null) { 069 break; 070 } 071 if (fileLine.length() <= 1 && fileLine.trim().substring(0, 2).trim().equals("--")) { 072 continue; 073 } 074 fileLines.add(fileLine); 075 } 076 try { 077 br.close(); 078 } catch (IOException e) { 079 throw new FeatureException("Could not close the feature specification file '"+specModelURL.toString()+"'. ", e); 080 } 081 082 for (int j = 0; j < fileLines.size(); j++) { 083 orderFileLines.add(fileLines.get(j)); 084 } 085 086 boolean deprel = false; 087 for (int j=0; j < orderFileLines.size(); j++) { 088 deprel = false; 089 featureText.setLength(0); 090 splitfeats = ""; 091 items = tabPattern.split(orderFileLines.get(j)); 092 if (items.length < 2) { 093 throw new FeatureException("The feature specification file '"+specModelURL.toString()+"' must contain at least two columns."); 094 } 095 if (!(columnNameMap.containsKey(ColumnNames.valueOf(items[0].trim())) || columnNameMap.containsValue(items[0].trim()))) { 096 throw new FeatureException("Column one in the feature specification file '"+specModelURL.toString()+"' contains an unknown value '"+items[0].trim()+"'. "); 097 } 098 if (items[0].trim().equalsIgnoreCase("DEP") || items[0].trim().equalsIgnoreCase("DEPREL")) { 099 featureText.append("OutputColumn(DEPREL, "); 100 deprel = true; 101 } else { 102 if (columnNameMap.containsKey(ColumnNames.valueOf(items[0].trim()))) { 103 featureText.append("InputColumn("+columnNameMap.get(ColumnNames.valueOf(items[0].trim()))+", "); 104 } else if (columnNameMap.containsValue(items[0].trim())) { 105 featureText.append("InputColumn("+items[0].trim()+", "); 106 } 107 if (items[0].trim().equalsIgnoreCase("FEATS") && isUseSplitFeats()) { 108 splitfeats = "Split("; 109 } 110 } 111 if (!(items[1].trim().equalsIgnoreCase("STACK") || items[1].trim().equalsIgnoreCase("INPUT") || items[1].trim().equalsIgnoreCase("CONTEXT"))) { 112 throw new FeatureException("Column two in the feature specification file '"+specModelURL.toString()+"' should be either 'STACK', 'INPUT' or 'CONTEXT' (Covington), not '"+items[1].trim()+"'. "); 113 } 114 int offset = 0; 115 if (items.length >= 3) { 116 try { 117 offset = new Integer(Integer.parseInt(items[2])); 118 } catch (NumberFormatException e) { 119 throw new FeatureException("The feature specification file '"+specModelURL.toString()+"' contains a illegal integer value. ", e); 120 } 121 } 122 String functionArg = ""; 123 124 if (items[1].trim().equalsIgnoreCase("CONTEXT")) { 125 if (offset >= 0) { 126 functionArg = dataStructuresMap.get(DataStructures.valueOf("LEFTCONTEXT"))+"["+offset+"]"; 127 } else { 128 functionArg = dataStructuresMap.get(DataStructures.valueOf("RIGHTCONTEXT"))+"["+Math.abs(offset + 1)+"]"; 129 } 130 } else if (dataStructuresMap.containsKey(DataStructures.valueOf(items[1].trim()))) { 131 if (covington == true) { 132 if (dataStructuresMap.get(DataStructures.valueOf(items[1].trim())).equalsIgnoreCase("Stack")) { 133 functionArg = "Left["+offset+"]"; 134 } else { 135 functionArg = "Right["+offset+"]"; 136 } 137 } else { 138 functionArg = dataStructuresMap.get(DataStructures.valueOf(items[1].trim()))+"["+offset+"]"; 139 } 140 } else if (dataStructuresMap.containsValue(items[1].trim())) { 141 if (covington == true) { 142 if (items[1].trim().equalsIgnoreCase("Stack")) { 143 functionArg = "Left["+offset+"]"; 144 } else { 145 functionArg = "Right["+offset+"]"; 146 } 147 } else { 148 functionArg = items[1].trim()+"["+offset+"]"; 149 } 150 151 } else { 152 throw new FeatureException("Column two in the feature specification file '"+specModelURL.toString()+"' should not contain the value '"+items[1].trim()); 153 } 154 155 int linearOffset = 0; 156 int headOffset = 0; 157 int depOffset = 0; 158 int sibOffset = 0; 159 int suffixLength = 0; 160 if (items.length >= 4) { linearOffset = new Integer(Integer.parseInt(items[3])); } 161 if (items.length >= 5) { headOffset = new Integer(Integer.parseInt(items[4])); } 162 if (items.length >= 6) { depOffset = new Integer(Integer.parseInt(items[5])); } 163 if (items.length >= 7) { sibOffset = new Integer(Integer.parseInt(items[6])); } 164 if (items.length >= 8) { suffixLength = new Integer(Integer.parseInt(items[7])); } 165 if (linearOffset < 0) { 166 linearOffset = Math.abs(linearOffset); 167 for (int i = 0; i < linearOffset; i++) { 168 functionArg = "pred("+functionArg+")"; 169 } 170 } else if (linearOffset > 0) { 171 for (int i = 0; i < linearOffset; i++) { 172 functionArg = "succ("+functionArg+")"; 173 } 174 } 175 if (headOffset >= 0) { 176 for (int i = 0; i < headOffset; i++) { 177 functionArg = "head("+functionArg+")"; 178 } 179 } else { 180 throw new FeatureException("The feature specification file '"+specModelURL.toString()+"' should not contain a negative head function value. "); 181 } 182 if (depOffset < 0) { 183 depOffset = Math.abs(depOffset); 184 for (int i = 0; i < depOffset; i++) { 185 functionArg = "ldep("+functionArg+")"; 186 } 187 } else if (depOffset > 0) { 188 for (int i = 0; i < depOffset; i++) { 189 functionArg = "rdep("+functionArg+")"; 190 } 191 } 192 if (sibOffset < 0) { 193 sibOffset = Math.abs(sibOffset); 194 for (int i = 0; i < sibOffset; i++) { 195 functionArg = "lsib("+functionArg+")"; 196 } 197 } else if (sibOffset > 0) { 198 for (int i = 0; i < sibOffset; i++) { 199 functionArg = "rsib("+functionArg+")"; 200 } 201 } 202 203 if (deprel == true && (pppath == true || pplifted == true || ppcoveredRoot == true)) { 204 featureSpecModels.add(specModelIndex, mergePseudoProjColumns(functionArg)); 205 } else { 206 if (suffixLength != 0) { 207 featureSpecModels.add(specModelIndex, "Suffix("+featureText.toString()+functionArg+"),"+suffixLength+")"); 208 } else if (splitfeats.equals("Split(")) { 209 featureSpecModels.add(specModelIndex, splitfeats+featureText.toString()+functionArg+"),\\|)"); 210 } else { 211 featureSpecModels.add(specModelIndex, featureText.toString()+functionArg+")"); 212 } 213 } 214 215 } 216 } 217 } 218 219 private String mergePseudoProjColumns(String functionArg) { 220 StringBuilder newFeatureText = new StringBuilder(); 221 int c = 1; 222 223 if (pplifted == true) { c++; }; 224 if (pppath == true) { c++; }; 225 if (ppcoveredRoot == true) { c++; }; 226 227 if (c == 1) { // no merge 228 newFeatureText.append("OutputColumn(DEPREL, "); 229 newFeatureText.append(functionArg); 230 newFeatureText.append(')'); 231 return newFeatureText.toString(); 232 } 233 if (c == 2) { 234 newFeatureText.append("Merge("); 235 newFeatureText.append("OutputColumn(DEPREL, "); 236 newFeatureText.append(functionArg); 237 newFeatureText.append("), "); 238 if (pplifted == true) { 239 newFeatureText.append("OutputTable(PPLIFTED, "); 240 newFeatureText.append(functionArg); 241 newFeatureText.append(")"); 242 } 243 if (pppath == true) { 244 newFeatureText.append("OutputTable(PPPATH, "); 245 newFeatureText.append(functionArg); 246 newFeatureText.append(")"); 247 } 248 if (ppcoveredRoot == true) { 249 newFeatureText.append("OutputTable(PPCOVERED, "); 250 newFeatureText.append(functionArg); 251 newFeatureText.append(")"); 252 } 253 newFeatureText.append(")"); 254 } else if (c == 3) { // use Merge3 255 int i = 0; 256 newFeatureText.append("Merge3("); 257 newFeatureText.append("OutputColumn(DEPREL, "); 258 newFeatureText.append(functionArg); 259 newFeatureText.append("), "); 260 i++; 261 if (pplifted == true) { 262 newFeatureText.append("OutputTable(PPLIFTED, "); 263 newFeatureText.append(functionArg); 264 i++; 265 if (i<3) { 266 newFeatureText.append("), "); 267 } else { 268 newFeatureText.append(")"); 269 } 270 } 271 if (pppath == true) { 272 newFeatureText.append("OutputTable(PPPATH, "); 273 newFeatureText.append(functionArg); 274 i++; 275 if (i<3) { 276 newFeatureText.append("), "); 277 } else { 278 newFeatureText.append(")"); 279 } 280 } 281 if (ppcoveredRoot == true) { 282 newFeatureText.append("OutputTable(PPCOVERED, "); 283 newFeatureText.append(functionArg); 284 i++; 285 if (i<3) { 286 newFeatureText.append("), "); 287 } else { 288 newFeatureText.append(")"); 289 } 290 } 291 newFeatureText.append(")"); 292 } else { // c == 4 293 newFeatureText.append("Merge(Merge("); 294 newFeatureText.append("OutputColumn(DEPREL, "); 295 newFeatureText.append(functionArg); 296 newFeatureText.append("), "); 297 newFeatureText.append("OutputTable(PPLIFTED, "); 298 newFeatureText.append(functionArg); 299 newFeatureText.append(")), Merge("); 300 newFeatureText.append("OutputTable(PPPATH, "); 301 newFeatureText.append(functionArg); 302 newFeatureText.append("), "); 303 newFeatureText.append("OutputTable(PPCOVERED, "); 304 newFeatureText.append(functionArg); 305 newFeatureText.append(")))"); 306 } 307 return newFeatureText.toString(); 308 } 309 310 public EnumMap<ColumnNames, String> getColumnNameMap() { 311 return columnNameMap; 312 } 313 314 public void initializeColumnNameMap() { 315 columnNameMap = new EnumMap<ColumnNames, String>(ColumnNames.class); 316 columnNameMap.put(ColumnNames.POS, "POSTAG"); 317 columnNameMap.put(ColumnNames.CPOS, "CPOSTAG"); 318 columnNameMap.put(ColumnNames.DEP, "DEPREL"); 319 columnNameMap.put(ColumnNames.LEX, "FORM"); 320 columnNameMap.put(ColumnNames.LEMMA, "LEMMA"); 321 columnNameMap.put(ColumnNames.FEATS, "FEATS"); 322 } 323 324 public void setColumnNameMap(EnumMap<ColumnNames, String> columnNameMap) { 325 this.columnNameMap = columnNameMap; 326 } 327 328 public EnumMap<DataStructures, String> getDataStructuresMap() { 329 return dataStructuresMap; 330 } 331 332 //TODO Fix covington 333 public void initializeDataStructuresMap() { 334 dataStructuresMap = new EnumMap<DataStructures, String>(DataStructures.class); 335 dataStructuresMap.put(DataStructures.STACK, "Stack"); 336 dataStructuresMap.put(DataStructures.INPUT, "Input"); 337 } 338 339 public void setDataStructuresMap(EnumMap<DataStructures, String> dataStructuresMap) { 340 this.dataStructuresMap = dataStructuresMap; 341 } 342 343 public boolean isUseSplitFeats() { 344 return useSplitFeats; 345 } 346 347 public void setUseSplitFeats(boolean useSplitFeats) { 348 this.useSplitFeats = useSplitFeats; 349 } 350 351 public boolean isCovington() { 352 return covington; 353 } 354 355 public void setCovington(boolean covington) { 356 this.covington = covington; 357 } 358 359 public boolean isPppath() { 360 return pppath; 361 } 362 363 public void setPppath(boolean pppath) { 364 this.pppath = pppath; 365 } 366 367 public boolean isPplifted() { 368 return pplifted; 369 } 370 371 public void setPplifted(boolean pplifted) { 372 this.pplifted = pplifted; 373 } 374 375 public boolean isPpcoveredRoot() { 376 return ppcoveredRoot; 377 } 378 379 public void setPpcoveredRoot(boolean ppcoveredRoot) { 380 this.ppcoveredRoot = ppcoveredRoot; 381 } 382 383 public String toString() { 384 StringBuilder sb = new StringBuilder(); 385 sb.append("Mapping of column names:\n"); 386 for (ColumnNames columnName : ColumnNames.values()) { 387 sb.append(columnName.toString()+"\t"+columnNameMap.get(columnName)+"\n"); 388 } 389 sb.append("Mapping of data structures:\n"); 390 for (DataStructures dataStruct : DataStructures.values()) { 391 sb.append(dataStruct.toString()+"\t"+dataStructuresMap.get(dataStruct)+"\n"); 392 } 393 sb.append("Split FEATS column: "+useSplitFeats+"\n"); 394 return sb.toString(); 395 } 396}