001 package org.maltparser.parser.guide.instance; 002 003 import java.io.BufferedReader; 004 import java.io.BufferedWriter; 005 import java.io.IOException; 006 import java.util.SortedMap; 007 008 import java.util.ArrayList; 009 import java.util.TreeMap; 010 import java.util.TreeSet; 011 import java.util.regex.Pattern; 012 013 import org.maltparser.core.exception.MaltChainedException; 014 import org.maltparser.core.feature.FeatureException; 015 import org.maltparser.core.feature.FeatureVector; 016 import org.maltparser.core.feature.function.FeatureFunction; 017 import org.maltparser.core.feature.function.Modifiable; 018 import org.maltparser.core.feature.value.SingleFeatureValue; 019 import org.maltparser.core.syntaxgraph.DependencyStructure; 020 import org.maltparser.parser.guide.Guide; 021 import org.maltparser.parser.guide.GuideException; 022 import org.maltparser.parser.guide.Model; 023 import org.maltparser.parser.history.action.SingleDecision; 024 025 /** 026 The feature divide model is used for divide the training instances into several models according to 027 a divide feature. Usually this strategy decrease the training and classification time, but can also decrease 028 the accuracy of the parser. 029 030 @author Johan Hall 031 @since 1.0 032 */ 033 public class FeatureDivideModel implements InstanceModel { 034 private Model parent; 035 private final SortedMap<Integer,AtomicModel> divideModels; 036 private FeatureVector masterFeatureVector; 037 private FeatureVector divideFeatureVector; 038 private int frequency = 0; 039 private FeatureFunction divideFeature; 040 private int divideThreshold; 041 private AtomicModel masterModel; 042 private ArrayList<Integer> divideFeatureIndexVector; 043 044 /** 045 * Constructs a feature divide model. 046 * 047 * @param features the feature vector used by the atomic model. 048 * @param parent the parent guide model. 049 * @throws MaltChainedException 050 */ 051 public FeatureDivideModel(FeatureVector features, Model parent) throws MaltChainedException { 052 setParent(parent); 053 setFrequency(0); 054 initSplitParam(features); 055 divideModels = new TreeMap<Integer,AtomicModel>(); 056 if (getGuide().getGuideMode() == Guide.GuideMode.TRAIN) { 057 masterModel = new AtomicModel(-1, masterFeatureVector, this); 058 } else if (getGuide().getGuideMode() == Guide.GuideMode.CLASSIFY) { 059 load(); 060 } 061 } 062 063 public void addInstance(SingleDecision decision) throws MaltChainedException { 064 if (getGuide().getGuideMode() != Guide.GuideMode.TRAIN) { 065 throw new GuideException("Can only add instance during learning. "); 066 } else if (!(divideFeature.getFeatureValue() instanceof SingleFeatureValue)) { 067 throw new GuideException("The divide feature does not have a single value. "); 068 } 069 070 divideFeature.update(); 071 if (divideModels != null) { 072 if (!divideModels.containsKey(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode())) { 073 divideModels.put(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode(), new AtomicModel(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode(), divideFeatureVector, this)); 074 } 075 divideModels.get(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()).addInstance(decision); 076 } else { 077 throw new GuideException("The feature divide models cannot be found. "); 078 } 079 } 080 081 public void noMoreInstances() throws MaltChainedException { 082 if (getGuide().getGuideMode() != Guide.GuideMode.TRAIN) { 083 throw new GuideException("Can only finish all data during learning. "); 084 } 085 086 if (divideModels != null) { 087 divideFeature.updateCardinality(); 088 for (Integer index : divideModels.keySet()) { 089 divideModels.get(index).noMoreInstances(); 090 } 091 final TreeSet<Integer> removeSet = new TreeSet<Integer>(); 092 for (Integer index : divideModels.keySet()) { 093 if (divideModels.get(index).getFrequency() <= divideThreshold) { 094 divideModels.get(index).moveAllInstances(masterModel, divideFeature, divideFeatureIndexVector); 095 removeSet.add(index); 096 } 097 } 098 for (Integer index : removeSet) { 099 divideModels.remove(index); 100 } 101 masterModel.noMoreInstances(); 102 103 } else { 104 throw new GuideException("The feature divide models cannot be found. "); 105 } 106 } 107 108 public void finalizeSentence(DependencyStructure dependencyGraph) throws MaltChainedException { 109 if (getGuide().getGuideMode() != Guide.GuideMode.TRAIN) { 110 throw new GuideException("Can only finish sentence during learning. "); 111 } 112 113 if (divideModels != null) { 114 for (AtomicModel divideModel : divideModels.values()) { 115 divideModel.finalizeSentence(dependencyGraph); 116 } 117 } else { 118 throw new GuideException("The feature divide models cannot be found. "); 119 } 120 } 121 122 public boolean predict(SingleDecision decision) throws MaltChainedException { 123 if (getGuide().getGuideMode() != Guide.GuideMode.CLASSIFY) { 124 throw new GuideException("Can only predict during parsing. "); 125 } else if (!(divideFeature.getFeatureValue() instanceof SingleFeatureValue)) { 126 throw new GuideException("The divide feature does not have a single value. "); 127 } 128 129 //divideFeature.update(); 130 if (divideModels != null && divideModels.containsKey(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode())) { 131 return divideModels.get(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()).predict(decision); 132 } else if (masterModel != null && masterModel.getFrequency() > 0) { 133 return masterModel.predict(decision); 134 } else { 135 getGuide().getConfiguration().getConfigLogger().info("Could not predict the next parser decision because there is " + 136 "no divide or master model that covers the divide value '"+((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()+"', as default" + 137 " class code '1' is used. "); 138 139 decision.addDecision(1); // default prediction 140 //classCodeTable.getEmptyKBestList().addKBestItem(1); 141 } 142 return true; 143 } 144 145 public void terminate() throws MaltChainedException { 146 if (divideModels != null) { 147 for (AtomicModel divideModel : divideModels.values()) { 148 divideModel.terminate(); 149 } 150 } 151 if (masterModel != null) { 152 masterModel.terminate(); 153 } 154 } 155 156 public void train() throws MaltChainedException { 157 for (AtomicModel divideModel : divideModels.values()) { 158 divideModel.train(); 159 } 160 masterModel.train(); 161 save(); 162 for (AtomicModel divideModel : divideModels.values()) { 163 divideModel.terminate(); 164 } 165 masterModel.terminate(); 166 } 167 168 /** 169 * Initialize the feature split parameters and the split feature vector and master feature vector 170 * according to the behavior strategy. 171 * 172 * @param featureVector the parent guide model's feature vector. 173 * @throws MaltChainedException 174 */ 175 protected void initSplitParam(FeatureVector featureVector) throws MaltChainedException { 176 if (getGuide().getConfiguration().getOptionValue("guide", "data_split_column") == null 177 || getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString().length() == 0) { 178 throw new GuideException("The option '--guide-data_split_column' cannot be found, when initializing the data split. "); 179 } 180 if (getGuide().getConfiguration().getOptionValue("guide", "data_split_structure") == null 181 || getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString().length() == 0) { 182 throw new GuideException("The option '--guide-data_split_structure' cannot be found, when initializing the data split. "); 183 } 184 try { 185 final String spec = "InputColumn(" + getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString().trim()+ 186 ", "+getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString().trim() +")"; 187 divideFeature = featureVector.getFeatureModel().identifyFeature(spec); 188 } catch (FeatureException e) { 189 throw new GuideException("The data split feature 'InputColumn("+getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString()+", "+getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString()+") cannot be initialized. ", e); 190 } 191 if (!(divideFeature instanceof Modifiable)) { 192 throw new GuideException("The data split feature 'InputColumn("+getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString()+", "+getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString()+") does not implement Modifiable interface. "); 193 } 194 divideFeatureIndexVector = new ArrayList<Integer>(); 195 for (int i = 0; i < featureVector.size(); i++) { 196 if (featureVector.get(i).equals(divideFeature)) { 197 divideFeatureIndexVector.add(i); 198 } 199 } 200 201 if ((Boolean)getGuide().getConfiguration().getOptionValue("malt0.4", "behavior") == true) { 202 /* MaltParser 0.4 removes the divide feature for all divide models. For the "Sum-up" model or 203 * master model adds the divide feature in the end of the feature vector. 204 */ 205 masterFeatureVector = (FeatureVector)featureVector.clone(); 206 for (Integer i : divideFeatureIndexVector) { 207 masterFeatureVector.remove(masterFeatureVector.get(i)); 208 } 209 for (Integer i : divideFeatureIndexVector) { 210 masterFeatureVector.add(featureVector.get(i)); 211 } 212 213 divideFeatureVector = (FeatureVector)featureVector.clone(); 214 for (Integer i : divideFeatureIndexVector) { 215 divideFeatureVector.remove(divideFeatureVector.get(i)); 216 } 217 } else { 218 masterFeatureVector = featureVector; 219 divideFeatureVector = (FeatureVector)featureVector.clone(); 220 for (Integer i : divideFeatureIndexVector) { 221 divideFeatureVector.remove(divideFeatureVector.get(i)); 222 } 223 } 224 try { 225 if (getGuide().getConfiguration().getOptionValue("guide", "data_split_threshold").toString() != null) { 226 divideThreshold = Integer.parseInt(getGuide().getConfiguration().getOptionValue("guide", "data_split_threshold").toString()); 227 } else { 228 divideThreshold = 0; 229 } 230 } catch (NumberFormatException e) { 231 throw new GuideException("The --guide-data_split_threshold option is not an integer value. ", e); 232 } 233 } 234 235 /** 236 * Saves the feature divide model settings .fsm file. 237 * 238 * @throws MaltChainedException 239 */ 240 protected void save() throws MaltChainedException { 241 try { 242 final BufferedWriter out = new BufferedWriter(getGuide().getConfiguration().getConfigurationDir().getOutputStreamWriter(getModelName()+".dsm")); 243 out.write(masterModel.getIndex() + "\t" + masterModel.getFrequency() + "\n"); 244 245 if (divideModels != null) { 246 for (AtomicModel divideModel : divideModels.values()) { 247 out.write(divideModel.getIndex() + "\t" + divideModel.getFrequency() + "\n"); 248 } 249 } 250 out.close(); 251 } catch (IOException e) { 252 throw new GuideException("Could not write to the guide model settings file '"+getModelName()+".dsm"+"', when " + 253 "saving the guide model settings to file. ", e); 254 } 255 } 256 257 /** 258 * Loads the feature divide model settings .fsm file. 259 * 260 * @throws MaltChainedException 261 */ 262 protected void load() throws MaltChainedException { 263 try { 264 final BufferedReader in = new BufferedReader(getGuide().getConfiguration().getConfigurationDir().getInputStreamReader(getModelName()+".dsm")); 265 final Pattern tabPattern = Pattern.compile("\t"); 266 while(true) { 267 String line = in.readLine(); 268 if(line == null) break; 269 String[] cols = tabPattern.split(line); 270 if (cols.length != 2) { 271 throw new GuideException(""); 272 } 273 int code = -1; 274 int freq = 0; 275 try { 276 code = Integer.parseInt(cols[0]); 277 freq = Integer.parseInt(cols[1]); 278 } catch (NumberFormatException e) { 279 throw new GuideException("Could not convert a string value into an integer value when loading the feature divide model settings (.fsm). ", e); 280 } 281 if (code == -1) { 282 masterModel = new AtomicModel(-1, masterFeatureVector, this); 283 masterModel.setFrequency(freq); 284 } else if (divideModels != null) { 285 divideModels.put(code, new AtomicModel(code, divideFeatureVector, this)); 286 divideModels.get(code).setFrequency(freq); 287 } 288 setFrequency(getFrequency()+freq); 289 } 290 in.close(); 291 } catch (IOException e) { 292 throw new GuideException("Could not read from the guide model settings file '"+getModelName()+".dsm"+"', when " + 293 "loading the guide model settings. ", e); 294 } 295 } 296 297 /** 298 * Returns the parent model 299 * 300 * @return the parent model 301 */ 302 public Model getParent() { 303 return parent; 304 } 305 306 public Guide getGuide() { 307 return parent.getGuide(); 308 } 309 310 /** 311 * Sets the parent model 312 * 313 * @param parent the parent model 314 */ 315 protected void setParent(Model parent) throws MaltChainedException { 316 this.parent = parent; 317 } 318 319 320 public String getModelName() throws MaltChainedException { 321 try { 322 return parent.getModelName(); 323 } catch (NullPointerException e) { 324 throw new GuideException("The parent guide model cannot be found. ", e); 325 } 326 } 327 328 /** 329 * Returns the "sum-up" or master feature vector 330 * 331 * @return a feature vector object 332 */ 333 public FeatureVector getMasterFeatureVector() { 334 return masterFeatureVector; 335 } 336 337 /** 338 * Returns the divide feature vector 339 * 340 * @return a feature vector object 341 */ 342 public FeatureVector getDivideFeatureVector() { 343 return divideFeatureVector; 344 } 345 346 /** 347 * Returns the frequency (number of instances) 348 * 349 * @return the frequency (number of instances) 350 */ 351 public int getFrequency() { 352 return frequency; 353 } 354 355 /** 356 * Increase the frequency by 1 357 */ 358 public void increaseFrequency() { 359 if (parent instanceof InstanceModel) { 360 ((InstanceModel)parent).increaseFrequency(); 361 } 362 frequency++; 363 } 364 365 public void decreaseFrequency() { 366 if (parent instanceof InstanceModel) { 367 ((InstanceModel)parent).decreaseFrequency(); 368 } 369 frequency--; 370 } 371 372 /** 373 * Sets the frequency (number of instances) 374 * 375 * @param frequency (number of instances) 376 */ 377 protected void setFrequency(int frequency) { 378 this.frequency = frequency; 379 } 380 381 382 /* (non-Javadoc) 383 * @see java.lang.Object#toString() 384 */ 385 public String toString() { 386 final StringBuilder sb = new StringBuilder(); 387 //TODO 388 return sb.toString(); 389 } 390 }