001 package org.maltparser.core.feature.spec.reader; 002 003 import java.io.BufferedReader; 004 import java.io.IOException; 005 import java.io.InputStreamReader; 006 import java.net.URL; 007 import java.util.ArrayList; 008 import java.util.EnumMap; 009 import java.util.regex.Pattern; 010 011 import org.maltparser.core.exception.MaltChainedException; 012 import org.maltparser.core.feature.FeatureException; 013 import org.maltparser.core.feature.spec.SpecificationModels; 014 /** 015 * 016 * 017 * @author Johan Hall 018 */ 019 public class ParReader implements FeatureSpecReader { 020 public enum DataStructures { 021 STACK, INPUT, LEFTCONTEXT, RIGHTCONTEXT 022 }; 023 public enum ColumnNames { 024 POS, DEP, LEX, LEMMA, CPOS, FEATS 025 }; 026 private EnumMap<ColumnNames, String> columnNameMap; 027 private EnumMap<DataStructures, String> dataStructuresMap; 028 private boolean useSplitFeats = true; 029 private boolean malt04Emulation = false; 030 private boolean covington = false; 031 private boolean pppath; 032 private boolean pplifted; 033 private boolean ppcoveredRoot; 034 035 public ParReader() throws MaltChainedException { 036 initializeColumnNameMap(); 037 initializeDataStructuresMap(); 038 setPppath(false); 039 setPplifted(false); 040 setPpcoveredRoot(false); 041 } 042 043 public void load(URL specModelURL, SpecificationModels featureSpecModels) throws MaltChainedException { 044 BufferedReader br = null; 045 Pattern tabPattern = Pattern.compile("\t"); 046 if (specModelURL == null) { 047 throw new FeatureException("The feature specification file cannot be found. "); 048 } 049 try { 050 br = new BufferedReader(new InputStreamReader(specModelURL.openStream())); 051 } catch (IOException e) { 052 throw new FeatureException("Could not read the feature specification file '"+specModelURL.toString()+"'. ", e); 053 } 054 055 if (br != null) { 056 int specModelIndex = featureSpecModels.getNextIndex(); 057 String fileLine; 058 String items[]; 059 StringBuilder featureText = new StringBuilder(); 060 String splitfeats = ""; 061 ArrayList<String> fileLines = new ArrayList<String>(); 062 ArrayList<String> orderFileLines = new ArrayList<String>(); 063 while (true) { 064 try { 065 fileLine = br.readLine(); 066 } catch (IOException e) { 067 throw new FeatureException("Could not read the feature specification file '"+specModelURL.toString()+"'. ", e); 068 } 069 if (fileLine == null) { 070 break; 071 } 072 if (fileLine.length() <= 1 && fileLine.trim().substring(0, 2).trim().equals("--")) { 073 continue; 074 } 075 fileLines.add(fileLine); 076 } 077 try { 078 br.close(); 079 } catch (IOException e) { 080 throw new FeatureException("Could not close the feature specification file '"+specModelURL.toString()+"'. ", e); 081 } 082 if (malt04Emulation == true) { 083 for (int i = 0; i < ColumnNames.values().length; i++) { 084 for (int j = 0; j < fileLines.size(); j++) { 085 if (fileLines.get(j).startsWith(ColumnNames.values()[i].toString())) { 086 orderFileLines.add(fileLines.get(j)); 087 } 088 } 089 } 090 } else { 091 for (int j = 0; j < fileLines.size(); j++) { 092 orderFileLines.add(fileLines.get(j)); 093 } 094 } 095 boolean deprel = false; 096 for (int j=0; j < orderFileLines.size(); j++) { 097 deprel = false; 098 featureText.setLength(0); 099 splitfeats = ""; 100 items = tabPattern.split(orderFileLines.get(j)); 101 if (items.length < 2) { 102 throw new FeatureException("The feature specification file '"+specModelURL.toString()+"' must contain at least two columns."); 103 } 104 if (!(columnNameMap.containsKey(ColumnNames.valueOf(items[0].trim())) || columnNameMap.containsValue(items[0].trim()))) { 105 throw new FeatureException("Column one in the feature specification file '"+specModelURL.toString()+"' contains an unknown value '"+items[0].trim()+"'. "); 106 } 107 if (items[0].trim().equalsIgnoreCase("DEP") || items[0].trim().equalsIgnoreCase("DEPREL")) { 108 featureText.append("OutputColumn(DEPREL, "); 109 deprel = true; 110 } else { 111 if (columnNameMap.containsKey(ColumnNames.valueOf(items[0].trim()))) { 112 featureText.append("InputColumn("+columnNameMap.get(ColumnNames.valueOf(items[0].trim()))+", "); 113 } else if (columnNameMap.containsValue(items[0].trim())) { 114 featureText.append("InputColumn("+items[0].trim()+", "); 115 } 116 if (items[0].trim().equalsIgnoreCase("FEATS") && isUseSplitFeats()) { 117 splitfeats = "Split("; 118 } 119 } 120 if (!(items[1].trim().equalsIgnoreCase("STACK") || items[1].trim().equalsIgnoreCase("INPUT") || items[1].trim().equalsIgnoreCase("CONTEXT"))) { 121 throw new FeatureException("Column two in the feature specification file '"+specModelURL.toString()+"' should be either 'STACK', 'INPUT' or 'CONTEXT' (Covington), not '"+items[1].trim()+"'. "); 122 } 123 int offset = 0; 124 if (items.length >= 3) { 125 try { 126 offset = new Integer(Integer.parseInt(items[2])); 127 } catch (NumberFormatException e) { 128 throw new FeatureException("The feature specification file '"+specModelURL.toString()+"' contains a illegal integer value. ", e); 129 } 130 } 131 String functionArg = ""; 132 133 if (items[1].trim().equalsIgnoreCase("CONTEXT")) { 134 if (offset >= 0) { 135 functionArg = dataStructuresMap.get(DataStructures.valueOf("LEFTCONTEXT"))+"["+offset+"]"; 136 } else { 137 functionArg = dataStructuresMap.get(DataStructures.valueOf("RIGHTCONTEXT"))+"["+Math.abs(offset + 1)+"]"; 138 } 139 } else if (dataStructuresMap.containsKey(DataStructures.valueOf(items[1].trim()))) { 140 if (covington == true) { 141 if (dataStructuresMap.get(DataStructures.valueOf(items[1].trim())).equalsIgnoreCase("Stack")) { 142 functionArg = "Left["+offset+"]"; 143 } else { 144 functionArg = "Right["+offset+"]"; 145 } 146 } else { 147 functionArg = dataStructuresMap.get(DataStructures.valueOf(items[1].trim()))+"["+offset+"]"; 148 } 149 } else if (dataStructuresMap.containsValue(items[1].trim())) { 150 if (covington == true) { 151 if (items[1].trim().equalsIgnoreCase("Stack")) { 152 functionArg = "Left["+offset+"]"; 153 } else { 154 functionArg = "Right["+offset+"]"; 155 } 156 } else { 157 functionArg = items[1].trim()+"["+offset+"]"; 158 } 159 160 } else { 161 throw new FeatureException("Column two in the feature specification file '"+specModelURL.toString()+"' should not contain the value '"+items[1].trim()); 162 } 163 164 int linearOffset = 0; 165 int headOffset = 0; 166 int depOffset = 0; 167 int sibOffset = 0; 168 int suffixLength = 0; 169 if (items.length >= 4) { linearOffset = new Integer(Integer.parseInt(items[3])); } 170 if (items.length >= 5) { headOffset = new Integer(Integer.parseInt(items[4])); } 171 if (items.length >= 6) { depOffset = new Integer(Integer.parseInt(items[5])); } 172 if (items.length >= 7) { sibOffset = new Integer(Integer.parseInt(items[6])); } 173 if (items.length >= 8) { suffixLength = new Integer(Integer.parseInt(items[7])); } 174 if (linearOffset < 0) { 175 linearOffset = Math.abs(linearOffset); 176 for (int i = 0; i < linearOffset; i++) { 177 functionArg = "pred("+functionArg+")"; 178 } 179 } else if (linearOffset > 0) { 180 for (int i = 0; i < linearOffset; i++) { 181 functionArg = "succ("+functionArg+")"; 182 } 183 } 184 if (headOffset >= 0) { 185 for (int i = 0; i < headOffset; i++) { 186 functionArg = "head("+functionArg+")"; 187 } 188 } else { 189 throw new FeatureException("The feature specification file '"+specModelURL.toString()+"' should not contain a negative head function value. "); 190 } 191 if (depOffset < 0) { 192 depOffset = Math.abs(depOffset); 193 for (int i = 0; i < depOffset; i++) { 194 functionArg = "ldep("+functionArg+")"; 195 } 196 } else if (depOffset > 0) { 197 for (int i = 0; i < depOffset; i++) { 198 if (malt04Emulation == true) { 199 functionArg = "rdep2("+functionArg+")"; 200 } else { 201 functionArg = "rdep("+functionArg+")"; 202 } 203 } 204 } 205 if (sibOffset < 0) { 206 sibOffset = Math.abs(sibOffset); 207 for (int i = 0; i < sibOffset; i++) { 208 functionArg = "lsib("+functionArg+")"; 209 } 210 } else if (sibOffset > 0) { 211 for (int i = 0; i < sibOffset; i++) { 212 functionArg = "rsib("+functionArg+")"; 213 } 214 } 215 216 if (deprel == true && (pppath == true || pplifted == true || ppcoveredRoot == true)) { 217 featureSpecModels.add(specModelIndex, mergePseudoProjColumns(functionArg)); 218 } else { 219 if (suffixLength != 0) { 220 featureSpecModels.add(specModelIndex, "Suffix("+featureText.toString()+functionArg+"),"+suffixLength+")"); 221 } else if (splitfeats.equals("Split(")) { 222 featureSpecModels.add(specModelIndex, splitfeats+featureText.toString()+functionArg+"),\\|)"); 223 } else { 224 featureSpecModels.add(specModelIndex, featureText.toString()+functionArg+")"); 225 } 226 } 227 228 } 229 } 230 } 231 232 private String mergePseudoProjColumns(String functionArg) { 233 StringBuilder newFeatureText = new StringBuilder(); 234 int c = 1; 235 236 if (pplifted == true) { c++; }; 237 if (pppath == true) { c++; }; 238 if (ppcoveredRoot == true) { c++; }; 239 240 if (c == 1) { // no merge 241 newFeatureText.append("OutputColumn(DEPREL, "); 242 newFeatureText.append(functionArg); 243 newFeatureText.append(')'); 244 return newFeatureText.toString(); 245 } 246 if (c == 2) { 247 newFeatureText.append("Merge("); 248 newFeatureText.append("OutputColumn(DEPREL, "); 249 newFeatureText.append(functionArg); 250 newFeatureText.append("), "); 251 if (pplifted == true) { 252 newFeatureText.append("OutputTable(PPLIFTED, "); 253 newFeatureText.append(functionArg); 254 newFeatureText.append(")"); 255 } 256 if (pppath == true) { 257 newFeatureText.append("OutputTable(PPPATH, "); 258 newFeatureText.append(functionArg); 259 newFeatureText.append(")"); 260 } 261 if (ppcoveredRoot == true) { 262 newFeatureText.append("OutputTable(PPCOVERED, "); 263 newFeatureText.append(functionArg); 264 newFeatureText.append(")"); 265 } 266 newFeatureText.append(")"); 267 } else if (c == 3) { // use Merge3 268 int i = 0; 269 newFeatureText.append("Merge3("); 270 newFeatureText.append("OutputColumn(DEPREL, "); 271 newFeatureText.append(functionArg); 272 newFeatureText.append("), "); 273 i++; 274 if (pplifted == true) { 275 newFeatureText.append("OutputTable(PPLIFTED, "); 276 newFeatureText.append(functionArg); 277 i++; 278 if (i<3) { 279 newFeatureText.append("), "); 280 } else { 281 newFeatureText.append(")"); 282 } 283 } 284 if (pppath == true) { 285 newFeatureText.append("OutputTable(PPPATH, "); 286 newFeatureText.append(functionArg); 287 i++; 288 if (i<3) { 289 newFeatureText.append("), "); 290 } else { 291 newFeatureText.append(")"); 292 } 293 } 294 if (ppcoveredRoot == true) { 295 newFeatureText.append("OutputTable(PPCOVERED, "); 296 newFeatureText.append(functionArg); 297 i++; 298 if (i<3) { 299 newFeatureText.append("), "); 300 } else { 301 newFeatureText.append(")"); 302 } 303 } 304 newFeatureText.append(")"); 305 } else { // c == 4 306 newFeatureText.append("Merge(Merge("); 307 newFeatureText.append("OutputColumn(DEPREL, "); 308 newFeatureText.append(functionArg); 309 newFeatureText.append("), "); 310 newFeatureText.append("OutputTable(PPLIFTED, "); 311 newFeatureText.append(functionArg); 312 newFeatureText.append(")), Merge("); 313 newFeatureText.append("OutputTable(PPPATH, "); 314 newFeatureText.append(functionArg); 315 newFeatureText.append("), "); 316 newFeatureText.append("OutputTable(PPCOVERED, "); 317 newFeatureText.append(functionArg); 318 newFeatureText.append(")))"); 319 } 320 return newFeatureText.toString(); 321 } 322 323 public EnumMap<ColumnNames, String> getColumnNameMap() { 324 return columnNameMap; 325 } 326 327 public void initializeColumnNameMap() { 328 columnNameMap = new EnumMap<ColumnNames, String>(ColumnNames.class); 329 columnNameMap.put(ColumnNames.POS, "POSTAG"); 330 columnNameMap.put(ColumnNames.CPOS, "CPOSTAG"); 331 columnNameMap.put(ColumnNames.DEP, "DEPREL"); 332 columnNameMap.put(ColumnNames.LEX, "FORM"); 333 columnNameMap.put(ColumnNames.LEMMA, "LEMMA"); 334 columnNameMap.put(ColumnNames.FEATS, "FEATS"); 335 } 336 337 public void setColumnNameMap(EnumMap<ColumnNames, String> columnNameMap) { 338 this.columnNameMap = columnNameMap; 339 } 340 341 public EnumMap<DataStructures, String> getDataStructuresMap() { 342 return dataStructuresMap; 343 } 344 345 //TODO Fix covington 346 public void initializeDataStructuresMap() { 347 dataStructuresMap = new EnumMap<DataStructures, String>(DataStructures.class); 348 dataStructuresMap.put(DataStructures.STACK, "Stack"); 349 dataStructuresMap.put(DataStructures.INPUT, "Input"); 350 } 351 352 public void setDataStructuresMap(EnumMap<DataStructures, String> dataStructuresMap) { 353 this.dataStructuresMap = dataStructuresMap; 354 } 355 356 public boolean isUseSplitFeats() { 357 return useSplitFeats; 358 } 359 360 public void setUseSplitFeats(boolean useSplitFeats) { 361 this.useSplitFeats = useSplitFeats; 362 } 363 364 365 public boolean isMalt04Emulation() { 366 return malt04Emulation; 367 } 368 369 public void setMalt04Emulation(boolean malt04Emulation) { 370 this.malt04Emulation = malt04Emulation; 371 } 372 373 public boolean isCovington() { 374 return covington; 375 } 376 377 public void setCovington(boolean covington) { 378 this.covington = covington; 379 } 380 381 public boolean isPppath() { 382 return pppath; 383 } 384 385 public void setPppath(boolean pppath) { 386 this.pppath = pppath; 387 } 388 389 public boolean isPplifted() { 390 return pplifted; 391 } 392 393 public void setPplifted(boolean pplifted) { 394 this.pplifted = pplifted; 395 } 396 397 public boolean isPpcoveredRoot() { 398 return ppcoveredRoot; 399 } 400 401 public void setPpcoveredRoot(boolean ppcoveredRoot) { 402 this.ppcoveredRoot = ppcoveredRoot; 403 } 404 405 public String toString() { 406 StringBuilder sb = new StringBuilder(); 407 sb.append("Mapping of column names:\n"); 408 for (ColumnNames columnName : ColumnNames.values()) { 409 sb.append(columnName.toString()+"\t"+columnNameMap.get(columnName)+"\n"); 410 } 411 sb.append("Mapping of data structures:\n"); 412 for (DataStructures dataStruct : DataStructures.values()) { 413 sb.append(dataStruct.toString()+"\t"+dataStructuresMap.get(dataStruct)+"\n"); 414 } 415 sb.append("Split FEATS column: "+useSplitFeats+"\n"); 416 return sb.toString(); 417 } 418 }