001    package org.maltparser.parser.guide.instance;
002    
003    import java.io.BufferedReader;
004    import java.io.BufferedWriter;
005    import java.io.IOException;
006    import java.util.SortedMap;
007    
008    import java.util.ArrayList;
009    import java.util.TreeMap;
010    import java.util.TreeSet;
011    import java.util.regex.Pattern;
012    
013    import org.maltparser.core.exception.MaltChainedException;
014    import org.maltparser.core.feature.FeatureException;
015    import org.maltparser.core.feature.FeatureVector;
016    import org.maltparser.core.feature.function.FeatureFunction;
017    import org.maltparser.core.feature.function.Modifiable;
018    import org.maltparser.core.feature.value.SingleFeatureValue;
019    import org.maltparser.core.syntaxgraph.DependencyStructure;
020    import org.maltparser.parser.guide.Guide;
021    import org.maltparser.parser.guide.GuideException;
022    import org.maltparser.parser.guide.Model;
023    import org.maltparser.parser.history.action.SingleDecision;
024    
025    /**
026    The feature divide model is used for divide the training instances into several models according to
027    a divide feature. Usually this strategy decrease the training and classification time, but can also decrease 
028    the accuracy of the parser.  
029    
030    @author Johan Hall
031    @since 1.0
032    */
033    public class FeatureDivideModel implements InstanceModel {
034            private Model parent;
035            private final SortedMap<Integer,AtomicModel> divideModels;
036            private FeatureVector masterFeatureVector;
037            private FeatureVector divideFeatureVector;
038            private int frequency = 0;
039            private FeatureFunction divideFeature;
040            private int divideThreshold;
041            private AtomicModel masterModel;
042            private ArrayList<Integer> divideFeatureIndexVector;
043            
044            /**
045             * Constructs a feature divide model.
046             * 
047             * @param features the feature vector used by the atomic model.
048             * @param parent the parent guide model.
049             * @throws MaltChainedException
050             */
051            public FeatureDivideModel(FeatureVector features, Model parent) throws MaltChainedException {
052                    setParent(parent);
053                    setFrequency(0);
054                    initSplitParam(features);
055                    divideModels = new TreeMap<Integer,AtomicModel>();
056                    if (getGuide().getGuideMode() == Guide.GuideMode.TRAIN) {
057                            masterModel = new AtomicModel(-1, masterFeatureVector, this);
058                    } else if (getGuide().getGuideMode() == Guide.GuideMode.CLASSIFY) {
059                            load();
060                    }
061            }
062            
063            public void addInstance(SingleDecision decision) throws MaltChainedException {
064                    if (getGuide().getGuideMode() != Guide.GuideMode.TRAIN) {
065                            throw new GuideException("Can only add instance during learning. ");
066                    } else if (!(divideFeature.getFeatureValue() instanceof SingleFeatureValue)) {
067                            throw new GuideException("The divide feature does not have a single value. ");
068                    }
069                    
070                    divideFeature.update();
071                    if (divideModels != null) { 
072                            if (!divideModels.containsKey(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode())) {
073                                    divideModels.put(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode(), new AtomicModel(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode(), divideFeatureVector, this));
074                            }
075                            divideModels.get(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()).addInstance(decision);
076                    } else {
077                            throw new GuideException("The feature divide models cannot be found. ");
078                    }
079            }
080    
081            public void noMoreInstances() throws MaltChainedException {
082                    if (getGuide().getGuideMode() != Guide.GuideMode.TRAIN) {
083                            throw new GuideException("Can only finish all data during learning. ");
084                    }
085                    
086                    if (divideModels != null) {
087                            divideFeature.updateCardinality();
088                            for (Integer index : divideModels.keySet()) {
089                                    divideModels.get(index).noMoreInstances();
090                            }
091                            final TreeSet<Integer> removeSet = new TreeSet<Integer>();
092                            for (Integer index : divideModels.keySet()) {
093                                    if (divideModels.get(index).getFrequency() <= divideThreshold) {
094                                            divideModels.get(index).moveAllInstances(masterModel, divideFeature, divideFeatureIndexVector);
095                                            removeSet.add(index);
096                                    }
097                            }
098                            for (Integer index : removeSet) {
099                                    divideModels.remove(index);
100                            }
101                            masterModel.noMoreInstances();
102    
103                    } else {
104                            throw new GuideException("The feature divide models cannot be found. ");
105                    }
106            }
107    
108            public void finalizeSentence(DependencyStructure dependencyGraph) throws MaltChainedException {
109                    if (getGuide().getGuideMode() != Guide.GuideMode.TRAIN) {
110                            throw new GuideException("Can only finish sentence during learning. ");
111                    }
112    
113                    if (divideModels != null) { 
114                            for (AtomicModel divideModel : divideModels.values()) {
115                                    divideModel.finalizeSentence(dependencyGraph);
116                            }
117                    } else {
118                            throw new GuideException("The feature divide models cannot be found. ");
119                    }
120            }
121    
122            public boolean predict(SingleDecision decision) throws MaltChainedException {
123                    if (getGuide().getGuideMode() != Guide.GuideMode.CLASSIFY) {
124                            throw new GuideException("Can only predict during parsing. ");
125                    } else if (!(divideFeature.getFeatureValue() instanceof SingleFeatureValue)) {
126                            throw new GuideException("The divide feature does not have a single value. ");
127                    }
128                    
129                    //divideFeature.update();
130                    if (divideModels != null && divideModels.containsKey(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode())) {
131                            return divideModels.get(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()).predict(decision);
132                    } else if (masterModel != null && masterModel.getFrequency() > 0) {
133                            return masterModel.predict(decision);
134                    } else {
135                            getGuide().getConfiguration().getConfigLogger().info("Could not predict the next parser decision because there is " +
136                                            "no divide or master model that covers the divide value '"+((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()+"', as default" +
137                                                            " class code '1' is used. ");
138                            
139                            decision.addDecision(1); // default prediction
140                            //classCodeTable.getEmptyKBestList().addKBestItem(1); 
141                    }
142                    return true;
143            }
144    
145            public void terminate() throws MaltChainedException {
146                    if (divideModels != null) {
147                            for (AtomicModel divideModel : divideModels.values()) { 
148                                    divideModel.terminate();
149                            }
150                    }
151                    if (masterModel != null) {
152                            masterModel.terminate();
153                    }
154            }
155            
156            public void train() throws MaltChainedException {
157                    for (AtomicModel divideModel : divideModels.values()) {
158                            divideModel.train();
159                    }
160                    masterModel.train();
161                    save();
162                    for (AtomicModel divideModel : divideModels.values()) {
163                            divideModel.terminate();
164                    }
165                    masterModel.terminate();
166            }
167            
168            /**
169             * Initialize the feature split parameters and the split feature vector and master feature vector
170             * according to the behavior strategy.
171             * 
172             * @param featureVector the parent guide model's feature vector.
173             * @throws MaltChainedException
174             */
175            protected void initSplitParam(FeatureVector featureVector) throws MaltChainedException {
176                    if (getGuide().getConfiguration().getOptionValue("guide", "data_split_column") == null 
177                                    || getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString().length() == 0) {
178                            throw new GuideException("The option '--guide-data_split_column' cannot be found, when initializing the data split. ");
179                    }
180                    if (getGuide().getConfiguration().getOptionValue("guide", "data_split_structure") == null 
181                                    || getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString().length() == 0) {
182                            throw new GuideException("The option '--guide-data_split_structure' cannot be found, when initializing the data split. ");
183                    }
184                    try {
185                            final String spec = "InputColumn(" + getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString().trim()+
186                                                            ", "+getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString().trim() +")";
187                            divideFeature = featureVector.getFeatureModel().identifyFeature(spec);
188                    } catch (FeatureException e) {
189                            throw new GuideException("The data split feature 'InputColumn("+getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString()+", "+getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString()+") cannot be initialized. ", e);
190                    }
191                    if (!(divideFeature instanceof Modifiable)) {
192                            throw new GuideException("The data split feature 'InputColumn("+getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString()+", "+getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString()+") does not implement Modifiable interface. ");
193                    }
194                    divideFeatureIndexVector = new ArrayList<Integer>();
195                    for (int i = 0; i < featureVector.size(); i++) {
196                            if (featureVector.get(i).equals(divideFeature)) {
197                                    divideFeatureIndexVector.add(i);
198                            }
199                    }
200                    
201                    if ((Boolean)getGuide().getConfiguration().getOptionValue("malt0.4", "behavior") == true) {
202                            /* MaltParser 0.4 removes the divide feature for all divide models. For the "Sum-up" model or
203                             * master model adds the divide feature in the end of the feature vector.
204                             */
205                            masterFeatureVector = (FeatureVector)featureVector.clone();
206                            for (Integer i : divideFeatureIndexVector) {
207                                    masterFeatureVector.remove(masterFeatureVector.get(i));
208                            }
209                            for (Integer i : divideFeatureIndexVector) {
210                                    masterFeatureVector.add(featureVector.get(i));
211                            }
212                    
213                            divideFeatureVector = (FeatureVector)featureVector.clone();
214                            for (Integer i : divideFeatureIndexVector) {
215                                    divideFeatureVector.remove(divideFeatureVector.get(i));
216                            }
217                    } else {
218                            masterFeatureVector = featureVector;
219                            divideFeatureVector = (FeatureVector)featureVector.clone();
220                            for (Integer i : divideFeatureIndexVector) {
221                                    divideFeatureVector.remove(divideFeatureVector.get(i));
222                            }
223                    }
224                    try {
225                            if (getGuide().getConfiguration().getOptionValue("guide", "data_split_threshold").toString() != null) {
226                                    divideThreshold = Integer.parseInt(getGuide().getConfiguration().getOptionValue("guide", "data_split_threshold").toString());
227                            } else {
228                                    divideThreshold = 0;
229                            }
230                    } catch (NumberFormatException e) {
231                            throw new GuideException("The --guide-data_split_threshold option is not an integer value. ", e);
232                    }
233            }
234            
235            /**
236             * Saves the feature divide model settings .fsm file.
237             * 
238             * @throws MaltChainedException
239             */
240            protected void save() throws MaltChainedException {
241                    try {
242                            final BufferedWriter out = new BufferedWriter(getGuide().getConfiguration().getConfigurationDir().getOutputStreamWriter(getModelName()+".dsm"));
243                            out.write(masterModel.getIndex() + "\t" + masterModel.getFrequency() + "\n");
244    
245                            if (divideModels != null) {
246                                    for (AtomicModel divideModel : divideModels.values()) {
247                                            out.write(divideModel.getIndex() + "\t" + divideModel.getFrequency() + "\n");
248                            }
249                            }
250                            out.close();
251                    } catch (IOException e) {
252                            throw new GuideException("Could not write to the guide model settings file '"+getModelName()+".dsm"+"', when " +
253                                            "saving the guide model settings to file. ", e);
254                    }
255            }
256            
257            /**
258             * Loads the feature divide model settings .fsm file.
259             * 
260             * @throws MaltChainedException
261             */
262            protected void load() throws MaltChainedException {
263                    try {
264                            final BufferedReader in = new BufferedReader(getGuide().getConfiguration().getConfigurationDir().getInputStreamReader(getModelName()+".dsm"));
265                            final Pattern tabPattern = Pattern.compile("\t");
266                            while(true) {
267                                    String line = in.readLine();
268                                    if(line == null) break;
269                                    String[] cols = tabPattern.split(line);
270                                    if (cols.length != 2) { 
271                                            throw new GuideException("");
272                                    }
273                                    int code = -1;
274                                    int freq = 0;
275                                    try {
276                                            code = Integer.parseInt(cols[0]);
277                                            freq = Integer.parseInt(cols[1]);
278                                    } catch (NumberFormatException e) {
279                                            throw new GuideException("Could not convert a string value into an integer value when loading the feature divide model settings (.fsm). ", e);
280                                    }
281                                    if (code == -1) { 
282                                            masterModel = new AtomicModel(-1, masterFeatureVector, this);
283                                            masterModel.setFrequency(freq);
284                                    } else if (divideModels != null) {
285                                            divideModels.put(code, new AtomicModel(code, divideFeatureVector, this));
286                                            divideModels.get(code).setFrequency(freq);
287                                    }
288                                    setFrequency(getFrequency()+freq);
289                            }
290                            in.close();
291                    } catch (IOException e) {
292                            throw new GuideException("Could not read from the guide model settings file '"+getModelName()+".dsm"+"', when " +
293                                            "loading the guide model settings. ", e);
294                    }       
295            }
296            
297            /**
298             * Returns the parent model
299             * 
300             * @return the parent model
301             */
302            public Model getParent() {
303                    return parent;
304            }
305    
306            public Guide getGuide() {
307                    return parent.getGuide();
308            }
309            
310            /**
311             * Sets the parent model
312             * 
313             * @param parent the parent model
314             */
315            protected void setParent(Model parent) throws MaltChainedException {
316                    this.parent = parent;
317            }
318    
319    
320            public String getModelName() throws MaltChainedException {
321                    try {
322                            return parent.getModelName();
323                    } catch (NullPointerException e) {
324                            throw new GuideException("The parent guide model cannot be found. ", e);
325                    }
326            }
327    
328            /**
329             * Returns the "sum-up" or master feature vector
330             * 
331             * @return a feature vector object
332             */
333            public FeatureVector getMasterFeatureVector() {
334                    return masterFeatureVector;
335            }
336    
337            /**
338             * Returns the divide feature vector
339             * 
340             * @return a feature vector object
341             */
342            public FeatureVector getDivideFeatureVector() {
343                    return divideFeatureVector;
344            }
345            
346            /**
347             * Returns the frequency (number of instances)
348             * 
349             * @return the frequency (number of instances)
350             */
351            public int getFrequency() {
352                    return frequency;
353            }
354    
355            /**
356             * Increase the frequency by 1
357             */
358            public void increaseFrequency() {
359                    if (parent instanceof InstanceModel) {
360                            ((InstanceModel)parent).increaseFrequency();
361                    }
362                    frequency++;
363            }
364            
365            public void decreaseFrequency() {
366                    if (parent instanceof InstanceModel) {
367                            ((InstanceModel)parent).decreaseFrequency();
368                    }
369                    frequency--;
370            }
371            
372            /**
373             * Sets the frequency (number of instances)
374             * 
375             * @param frequency (number of instances)
376             */
377            protected void setFrequency(int frequency) {
378                    this.frequency = frequency;
379            }
380    
381    
382            /* (non-Javadoc)
383             * @see java.lang.Object#toString()
384             */
385            public String toString() {
386                    final StringBuilder sb = new StringBuilder();
387                    //TODO
388                    return sb.toString();
389            }
390    }