001    package org.maltparser.ml.libsvm;
002    
003    import java.io.BufferedReader;
004    import java.io.BufferedWriter;
005    import java.io.File;
006    import java.io.FileNotFoundException;
007    import java.io.IOException;
008    import java.io.InputStream;
009    import java.io.InputStreamReader;
010    import java.io.OutputStreamWriter;
011    import java.io.PrintStream;
012    import java.text.DecimalFormat;
013    import java.text.DecimalFormatSymbols;
014    import java.util.ArrayList;
015    import java.util.Set;
016    import java.util.regex.Pattern;
017    import java.util.regex.PatternSyntaxException;
018    
019    import libsvm.svm;
020    import libsvm.svm_model;
021    import libsvm.svm_node;
022    import libsvm.svm_parameter;
023    import libsvm.svm_problem;
024    
025    import org.maltparser.core.exception.MaltChainedException;
026    import org.maltparser.core.feature.FeatureVector;
027    import org.maltparser.core.feature.function.FeatureFunction;
028    import org.maltparser.core.feature.value.FeatureValue;
029    import org.maltparser.core.feature.value.MultipleFeatureValue;
030    import org.maltparser.core.feature.value.SingleFeatureValue;
031    import org.maltparser.core.helper.NoPrintStream;
032    import org.maltparser.core.syntaxgraph.DependencyStructure;
033    import org.maltparser.ml.LearningMethod;
034    import org.maltparser.ml.libsvm.LibsvmException;
035    import org.maltparser.parser.DependencyParserConfig;
036    import org.maltparser.parser.guide.instance.InstanceModel;
037    import org.maltparser.parser.history.action.SingleDecision;
038    import org.maltparser.parser.history.kbest.KBestList;
039    import org.maltparser.parser.history.kbest.ScoredKBestList;
040    
041    /**
042    Implements an interface to the LIBSVM learner (currently LIBSVM 2.89 is used, see LIBSVM_VERSION). More information
043    about LIBSVM can be found at 
044    <a href="http://www.csie.ntu.edu.tw/~cjlin/libsvm/" target="_blank">LIBSVM -- A Library for Support Vector Machines</a>.
045    
046    @author Johan Hall
047    @since 1.0
048    */
049    public class Libsvm implements LearningMethod {
050            public final static String LIBSVM_VERSION = "2.89";
051            public enum Verbostity {
052                    SILENT, ERROR, ALL
053            }
054            protected InstanceModel owner;
055            protected int learnerMode;
056            protected String name;
057            protected int numberOfInstances;
058            protected boolean saveInstanceFiles;
059            protected boolean excludeNullValues;
060            protected String pathExternalSVMTrain = null;
061            private int[] cardinalities;
062    
063            /**
064             * Instance output stream writer 
065             */
066            private BufferedWriter instanceOutput = null; 
067            /**
068             * LIBSVM svm_model object, only used during classification.
069             */
070            private svm_model model = null;
071            
072            /**
073             * LIBSVM svm_parameter object
074             */
075            private svm_parameter svmParam;
076            /**
077             * Parameter string
078             */
079            private String paramString;
080            /**
081             * An array of LIBSVM svm_node objects, only used during classification.
082             */
083            private ArrayList<svm_node> xlist = null;
084    
085            private Verbostity verbosity;
086            /**
087             * Constructs a LIBSVM learner.
088             * 
089             * @param owner the guide model owner
090             * @param learnerMode the mode of the learner TRAIN or CLASSIFY
091             */
092            public Libsvm(InstanceModel owner, Integer learnerMode) throws MaltChainedException {
093                    setOwner(owner);
094                    setLearningMethodName("libsvm");
095                    setLearnerMode(learnerMode.intValue());
096                    setNumberOfInstances(0);
097                    verbosity = Verbostity.SILENT;
098                    initSvmParam(getConfiguration().getOptionValue("libsvm", "libsvm_options").toString());
099                    initSpecialParameters();
100                    if (learnerMode == BATCH) {
101    //                      if (owner.getGuide().getConfiguration().getConfigLogger().isInfoEnabled()) {
102    //                              if (pathExternalSVMTrain != null) {
103    //                                      owner.getGuide().getConfiguration().getConfigLogger().info("  Learner              : LIBSVM external "+ getParamString() + "\n");
104    //                              } else {
105    //                                      owner.getGuide().getConfiguration().getConfigLogger().info("  Learner              : LIBSVM "+LIBSVM_VERSION+" "+ getParamString() + "\n");
106    //                              }
107    //                      }
108                            instanceOutput = new BufferedWriter(getInstanceOutputStreamWriter(".ins"));
109                    } 
110    //              else {
111    //                      if (owner.getGuide().getConfiguration().getConfigLogger().isInfoEnabled()) {
112    //                              owner.getGuide().getConfiguration().getConfigLogger().info("  Classifier           : LIBSVM "+LIBSVM_VERSION+" "+ getParamString()+ "\n");
113    //                      }
114    //              }
115            }
116            
117            
118            public void addInstance(SingleDecision decision, FeatureVector featureVector) throws MaltChainedException {
119                    if (featureVector == null) {
120                            throw new LibsvmException("The feature vector cannot be found");
121                    } else if (decision == null) {
122                            throw new LibsvmException("The decision cannot be found");
123                    }       
124                    try {
125                            instanceOutput.write(decision.getDecisionCode()+"\t");
126                            for (int i = 0; i < featureVector.size(); i++) {
127                                    FeatureValue featureValue = featureVector.get(i).getFeatureValue();
128                                    if (excludeNullValues == true && featureValue.isNullValue()) {
129                                            instanceOutput.write("-1");
130                                    } else {
131                                            if (featureValue instanceof SingleFeatureValue) {
132                                                    instanceOutput.write(((SingleFeatureValue)featureValue).getCode()+"");
133                                            } else if (featureValue instanceof MultipleFeatureValue) {
134                                                    Set<Integer> values = ((MultipleFeatureValue)featureValue).getCodes();
135                                                    int j=0;
136                                                    for (Integer value : values) {
137                                                            instanceOutput.write(value.toString());
138                                                            if (j != values.size()-1) {
139                                                                    instanceOutput.write("|");
140                                                            }
141                                                            j++;
142                                                    }
143                                            }
144                                    }
145                                    if (i != featureVector.size()) {
146                                            instanceOutput.write('\t');
147                                    }
148                            }
149    
150                            instanceOutput.write('\n');
151                            instanceOutput.flush();
152                            increaseNumberOfInstances();
153                    } catch (IOException e) {
154                            throw new LibsvmException("The LIBSVM learner cannot write to the instance file. ", e);
155                    }
156            }
157            
158            public void finalizeSentence(DependencyStructure dependencyGraph) throws MaltChainedException { }
159            
160            /* (non-Javadoc)
161             * @see org.maltparser.ml.LearningMethod#noMoreInstances()
162             */
163            public void noMoreInstances() throws MaltChainedException {
164                    closeInstanceWriter();
165            }
166    
167    
168            /* (non-Javadoc)
169             * @see org.maltparser.ml.LearningMethod#train(org.maltparser.core.feature.FeatureVector)
170             */
171            public void train(FeatureVector featureVector) throws MaltChainedException {
172                    if (featureVector == null) {
173                            throw new LibsvmException("The feature vector cannot be found. ");
174                    } else if (owner == null) {
175                            throw new LibsvmException("The parent guide model cannot be found. ");
176                    }
177                    cardinalities = getCardinalities(featureVector);
178                    if (pathExternalSVMTrain == null) {
179                            try {
180                                    final svm_problem prob = readProblemMaltSVMFormat(getInstanceInputStreamReader(".ins"), cardinalities, svmParam);
181                                    if(svm.svm_check_parameter(prob, svmParam) != null) {
182                                            throw new LibsvmException(svm.svm_check_parameter(prob, svmParam));
183                                    }
184                                    owner.getGuide().getConfiguration().getConfigLogger().info("Creating LIBSVM model "+getFile(".mod").getName()+"\n");
185                                    final PrintStream out = System.out;
186                                    final PrintStream err = System.err;
187                                    System.setOut(NoPrintStream.NO_PRINTSTREAM);
188                                    System.setErr(NoPrintStream.NO_PRINTSTREAM);
189                                    
190                                    svm.svm_save_model(getFile(".mod").getAbsolutePath(), svm.svm_train(prob, svmParam));
191                                    System.setOut(err);
192                                    System.setOut(out);
193                                    if (!saveInstanceFiles) {
194                                            getFile(".ins").delete();
195                                    }
196                            } catch (OutOfMemoryError e) {
197                                    throw new LibsvmException("Out of memory. Please increase the Java heap size (-Xmx<size>). ", e);
198                            } catch (IllegalArgumentException e) {
199                                    throw new LibsvmException("The LIBSVM learner was not able to redirect Standard Error stream. ", e);
200                            } catch (SecurityException e) {
201                                    throw new LibsvmException("The LIBSVM learner cannot remove the instance file. ", e);
202                            } catch (IOException e) {
203                                    throw new LibsvmException("The LIBSVM learner cannot save the model file '"+getFile(".mod").getAbsolutePath()+"'. ", e);
204                            }
205                    } else {
206                            trainExternal(featureVector);
207                    }
208                    saveCardinalities(getInstanceOutputStreamWriter(".car"), cardinalities);
209            }
210            
211            private void trainExternal(FeatureVector featureVector) throws MaltChainedException {
212                    try {           
213                            maltSVMFormat2OriginalSVMFormat(getInstanceInputStreamReader(".ins"), getInstanceOutputStreamWriter(".ins.tmp"), cardinalities);
214                            owner.getGuide().getConfiguration().getConfigLogger().info("Creating LIBSVM model (svm-train) "+getFile(".mod").getName());
215    
216                            final ArrayList<String> commands = new ArrayList<String>();
217                            commands.add(pathExternalSVMTrain);
218                            final String[] params = getSVMParamStringArray(svmParam);
219                            for (int i=0; i < params.length; i++) {
220                                    commands.add(params[i]);
221                            }
222                            commands.add(getFile(".ins.tmp").getAbsolutePath());
223                            commands.add(getFile(".mod").getAbsolutePath());
224                            String[] arrayCommands =  commands.toArray(new String[commands.size()]);
225                            
226                    if (verbosity == Verbostity.ALL) {
227                            owner.getGuide().getConfiguration().getConfigLogger().info('\n');
228                    }
229                            final Process child = Runtime.getRuntime().exec(arrayCommands);
230                    final InputStream in = child.getInputStream();
231                    final InputStream err = child.getErrorStream();
232                    int c;
233                    while ((c = in.read()) != -1){
234                            if (verbosity == Verbostity.ALL) {
235                                    owner.getGuide().getConfiguration().getConfigLogger().info((char)c);
236                            }
237                    }
238                    while ((c = err.read()) != -1){
239                            if (verbosity == Verbostity.ALL || verbosity == Verbostity.ERROR) {
240                                    owner.getGuide().getConfiguration().getConfigLogger().info((char)c);
241                            }
242                    }
243                if (child.waitFor() != 0) {
244                    owner.getGuide().getConfiguration().getConfigLogger().info(" FAILED ("+child.exitValue()+")");
245                }
246                    in.close();
247                    err.close();
248                    if (!saveInstanceFiles) {
249                                    getFile(".ins").delete();
250                                    getFile(".ins.tmp").delete();
251                    }
252                    owner.getGuide().getConfiguration().getConfigLogger().info('\n');
253                    } catch (InterruptedException e) {
254                             throw new LibsvmException("SVM-trainer is interrupted. ", e);
255                    } catch (IllegalArgumentException e) {
256                            throw new LibsvmException("The LIBSVM learner was not able to redirect Standard Error stream. ", e);
257                    } catch (SecurityException e) {
258                            throw new LibsvmException("The LIBSVM learner cannot remove the instance file. ", e);
259                    } catch (IOException e) {
260                            throw new LibsvmException("The LIBSVM learner cannot save the model file '"+getFile(".mod").getAbsolutePath()+"'. ", e);
261                    } catch (OutOfMemoryError e) {
262                            throw new LibsvmException("Out of memory. Please increase the Java heap size (-Xmx<size>). ", e);
263                    }
264            }
265            
266            private int[] getCardinalities(FeatureVector featureVector) {
267                    int[] cardinalities = new int[featureVector.size()];
268                    int i = 0;
269                    for (FeatureFunction feature : featureVector) {
270                            cardinalities[i++] = feature.getFeatureValue().getCardinality();
271                    }
272                    return cardinalities;
273            }
274            
275            private void saveCardinalities(OutputStreamWriter osw, int[] cardinalities) throws MaltChainedException {
276                    final BufferedWriter out = new BufferedWriter(osw);
277                    try {
278                            for (int i = 0, n = cardinalities.length; i < n; i++) {
279                                    out.write(Integer.toString(cardinalities[i]));
280                                    if (i < n - 1) {
281                                            out.write(',');
282                                    }
283                            }
284                            out.write('\n');
285                            out.close();
286                    } catch (IOException e) {
287                            throw new LibsvmException("", e);
288                    }
289            }
290            
291            private int[] loadCardinalities(InputStreamReader isr) throws MaltChainedException {
292                    int[] cardinalities = null;
293                    try {
294                            final BufferedReader in = new BufferedReader(isr); 
295                            String line;
296                            if ((line = in.readLine()) != null) {
297                                    String[] items = line.split(",");
298                                    cardinalities = new int[items.length];
299                                    for (int i = 0; i < items.length; i++) {
300                                            cardinalities[i] = Integer.parseInt(items[i]);
301                                    }
302                            }
303                            in.close();
304                    } catch (IOException e) {
305                            throw new LibsvmException("", e);
306                    } catch (NumberFormatException e) {
307                            throw new LibsvmException("", e);
308                    }
309                    return cardinalities;
310            }
311            
312            /* (non-Javadoc)
313             * @see org.maltparser.ml.LearningMethod#moveAllInstances(org.maltparser.ml.LearningMethod, org.maltparser.core.feature.function.FeatureFunction, java.util.ArrayList)
314             */
315            public void moveAllInstances(LearningMethod method, FeatureFunction divideFeature, ArrayList<Integer> divideFeatureIndexVector) throws MaltChainedException {
316                    if (method == null) {
317                            throw new LibsvmException("The learning method cannot be found. ");
318                    } else if (divideFeature == null) {
319                            throw new LibsvmException("The divide feature cannot be found. ");
320                    } 
321                    try {
322                            final BufferedReader in = new BufferedReader(getInstanceInputStreamReader(".ins"));
323                            final BufferedWriter out = method.getInstanceWriter();
324                            final StringBuilder sb = new StringBuilder(6);
325                            int l = in.read();
326                            char c;
327                            int j = 0;
328                            while(true) {
329                                    if (l == -1) {
330                                            sb.setLength(0);
331                                            break;
332                                    }
333                                    
334                                    c = (char)l; 
335                                    l = in.read();
336                                    if (c == '\t') {
337                                            if (divideFeatureIndexVector.contains(j-1)) {
338                                                    out.write(Integer.toString(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()));
339                                                    out.write('\t');
340                                            }
341                                            out.write(sb.toString());
342                                            j++;
343                                            out.write('\t');
344                                            sb.setLength(0);
345                                    } else if (c == '\n') {
346                                            out.write(sb.toString());
347                                            if (divideFeatureIndexVector.contains(j-1)) {
348                                                    out.write('\t');
349                                                    out.write(Integer.toString(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()));
350                                            }
351                                            out.write('\n');
352                                            sb.setLength(0);
353                                            method.increaseNumberOfInstances();
354                                            this.decreaseNumberOfInstances();
355                                            j = 0;
356                                    } else {
357                                            sb.append(c);
358                                    }
359                            }       
360                            in.close();
361                            getFile(".ins").delete();
362                    } catch (SecurityException e) {
363                            throw new LibsvmException("The LIBSVM learner cannot remove the instance file. ", e);
364                    } catch (NullPointerException  e) {
365                            throw new LibsvmException("The instance file cannot be found. ", e);
366                    } catch (FileNotFoundException e) {
367                            throw new LibsvmException("The instance file cannot be found. ", e);
368                    } catch (IOException e) {
369                            throw new LibsvmException("The LIBSVM learner read from the instance file. ", e);
370                    }
371            }
372            
373            /* (non-Javadoc)
374             * @see org.maltparser.ml.LearningMethod#predict(org.maltparser.core.feature.FeatureVector, org.maltparser.parser.history.action.SingleDecision)
375             */
376            public boolean predict(FeatureVector featureVector, SingleDecision decision) throws MaltChainedException {
377                    if (model == null) {
378                            File modelFile = getFile(".mod");
379                            try {
380                                    model = svm.svm_load_model(modelFile.getAbsolutePath());        
381                            } catch (IOException e) {
382                                    throw new LibsvmException("The file '"+modelFile.getAbsolutePath()+"' cannot be loaded. ", e);
383                            }
384                    }
385                    if (cardinalities == null) {
386                            if (getFile(".car").exists()) {
387                                    cardinalities = loadCardinalities(getInstanceInputStreamReader(".car"));
388                            } else {
389                                    cardinalities = getCardinalities(featureVector);
390                            }
391                    }
392                    if (xlist == null) {
393                            xlist = new ArrayList<svm_node>(featureVector.size()); 
394                    }
395                    if (model == null) { 
396                            throw new LibsvmException("The LIBSVM learner cannot predict the next class, because the learning model cannot be found. ");
397                    } else if (featureVector == null) {
398                            throw new LibsvmException("The LIBSVM learner cannot predict the next class, because the feature vector cannot be found. ");
399                    }
400                    int j = 0;
401                    int offset = 0;
402                    int i = 0;
403                    for (FeatureFunction feature : featureVector) {
404                            final FeatureValue featureValue = feature.getFeatureValue();
405                            if (!(excludeNullValues == true && featureValue.isNullValue())) {
406                                    if (featureValue instanceof SingleFeatureValue) {
407                                            if (((SingleFeatureValue)featureValue).getCode() < cardinalities[i]) {
408                                                    if (j >= xlist.size()) {
409                                                            svm_node x =  new svm_node();
410                                                            x.value = 1;
411                                                            xlist.add(j,x);
412                                                    }
413                                                    xlist.get(j++).index = ((SingleFeatureValue)featureValue).getCode() + offset;
414                                            }
415                                    } else if (featureValue instanceof MultipleFeatureValue) {
416                                            for (Integer value : ((MultipleFeatureValue)featureValue).getCodes()) {
417                                                    if (value < cardinalities[i]) {
418    //                                              if (((MultipleFeatureValue)featureValue).isKnown(value)) {
419                                                            if (j >= xlist.size()) {
420                                                                    svm_node x =  new svm_node();
421                                                                    x.value = 1;
422                                                                    xlist.add(j,x);
423                                                            }
424                                                            xlist.get(j++).index = value + offset;
425                                                    }
426                                            }
427                                    }
428                            }
429                            offset += cardinalities[i];
430                            i++;
431                    }
432    
433                    svm_node[] xarray = new svm_node[j];
434                    for (int k = 0; k < j; k++) {
435                            xarray[k] = xlist.get(k);
436                    }
437                    if (decision.getKBestList().getK() == 1 || svm.svm_get_svm_type(model) == svm_parameter.ONE_CLASS ||
438                                    svm.svm_get_svm_type(model) == svm_parameter.EPSILON_SVR ||
439                                    svm.svm_get_svm_type(model) == svm_parameter.NU_SVR) {
440                            decision.getKBestList().add((int)svm.svm_predict(model, xarray));
441                    } else {
442                            svm_predict_with_kbestlist(model, xarray, decision.getKBestList());
443                    }
444    
445                    return true;
446            }
447            
448    
449            public void terminate() throws MaltChainedException { 
450                    closeInstanceWriter();
451                    model = null;
452                    svmParam = null;
453                    xlist = null;
454                    owner = null;
455            }
456    
457            public BufferedWriter getInstanceWriter() {
458                    return instanceOutput;
459            }
460            
461            protected void closeInstanceWriter() throws MaltChainedException {
462                    try {
463                            if (instanceOutput != null) {
464                                    instanceOutput.flush();
465                                    instanceOutput.close();
466                                    instanceOutput = null;
467                            }
468                    } catch (IOException e) {
469                            throw new LibsvmException("The LIBSVM learner cannot close the instance file. ", e);
470                    }
471            }
472            
473            /**
474             * Initialize the LIBSVM according to the parameter string
475             * 
476             * @param paramString the parameter string to configure the LIBSVM learner.
477             * @throws MaltChainedException
478             */
479            protected void initSvmParam(String paramString) throws MaltChainedException {
480                    this.paramString = paramString;
481                    svmParam = new svm_parameter();
482                    initParameters(svmParam);
483                    parseParameters(paramString, svmParam);
484            }
485            
486            /**
487             * Returns the parameter string used to configure LIBSVM.
488             * 
489             * @return the parameter string used to configure LIBSVM
490             */
491            public String getParamString() {
492                    return paramString;
493            }
494            
495            public InstanceModel getOwner() {
496                    return owner;
497            }
498    
499            protected void setOwner(InstanceModel owner) {
500                    this.owner = owner;
501            }
502            
503            public int getLearnerMode() {
504                    return learnerMode;
505            }
506    
507            public void setLearnerMode(int learnerMode) throws MaltChainedException {
508                    this.learnerMode = learnerMode;
509            }
510            
511            public String getLearningMethodName() {
512                    return name;
513            }
514            
515            /**
516             * Returns the current configuration
517             * 
518             * @return the current configuration
519             * @throws MaltChainedException
520             */
521            public DependencyParserConfig getConfiguration() throws MaltChainedException {
522                    return owner.getGuide().getConfiguration();
523            }
524            
525            public int getNumberOfInstances() {
526                    return numberOfInstances;
527            }
528    
529            public void increaseNumberOfInstances() {
530                    numberOfInstances++;
531                    owner.increaseFrequency();
532            }
533            
534            public void decreaseNumberOfInstances() {
535                    numberOfInstances--;
536                    owner.decreaseFrequency();
537            }
538            
539            protected void setNumberOfInstances(int numberOfInstances) {
540                    this.numberOfInstances = 0;
541            }
542    
543            protected void setLearningMethodName(String name) {
544                    this.name = name;
545            }
546            
547            protected OutputStreamWriter getInstanceOutputStreamWriter(String suffix) throws MaltChainedException {
548                    return getConfiguration().getConfigurationDir().getOutputStreamWriter(owner.getModelName()+getLearningMethodName()+suffix);
549            }
550            
551            protected InputStreamReader getInstanceInputStreamReader(String suffix) throws MaltChainedException {
552                    return getConfiguration().getConfigurationDir().getInputStreamReader(owner.getModelName()+getLearningMethodName()+suffix);
553            }
554            
555            protected File getFile(String suffix) throws MaltChainedException {
556                    return getConfiguration().getConfigurationDir().getFile(owner.getModelName()+getLearningMethodName()+suffix);
557            }
558            
559            /**
560             * Reads an instance file into a svm_problem object according to the Malt-SVM format, which is column fixed format (tab-separated).
561             * 
562             * @param isr   the instance stream reader for the instance file
563             * @param cardinalities a array containing the number of distinct values for a particular column.
564             * @param param a svm_parameter object
565             * @throws LibsvmException
566             */
567            public final svm_problem readProblemMaltSVMFormat(InputStreamReader isr, int[] cardinalities, svm_parameter param) throws MaltChainedException {
568                    final svm_problem prob = new svm_problem();
569                    try {
570                            final BufferedReader fp = new BufferedReader(isr);
571                            int max_index = 0;
572                            if (xlist == null) {
573                                    xlist = new ArrayList<svm_node>(); 
574                            }
575                            prob.l = getNumberOfInstances();
576                            prob.x = new svm_node[prob.l][];
577                            prob.y = new double[prob.l];
578                            int i = 0;
579                            final Pattern tabPattern = Pattern.compile("\t");
580                            final Pattern pipePattern = Pattern.compile("\\|");
581                            while(true) {
582                                    String line = fp.readLine();
583                                    if(line == null) break;
584                                    String[] columns = tabPattern.split(line);
585    
586                                    if (columns.length == 0) {
587                                            continue;
588                                    }
589                                    
590                                    int offset = 0; 
591                                    int j = 0;
592                                    try {
593                                            prob.y[i] = (double)Integer.parseInt(columns[j]);
594                                            int p = 0;
595                                            for(j = 1; j < columns.length; j++) {
596                                                    final String[] items = pipePattern.split(columns[j]);   
597                                                    for (int k = 0; k < items.length; k++) {
598                                                            try {
599                                                                    if (Integer.parseInt(items[k]) != -1) {
600                                                                            xlist.add(p, new svm_node());
601                                                                            xlist.get(p).value = 1;
602                                                                            xlist.get(p).index = Integer.parseInt(items[k])+offset;
603                                                                            p++;
604                                                                    }
605                                                            } catch (NumberFormatException e) {
606                                                                    throw new LibsvmException("The instance file contain a non-integer value '"+items[k]+"'", e);
607                                                            }
608                                                    }
609                                                    offset += cardinalities[j-1];
610                                            }
611                                            prob.x[i] = xlist.subList(0, p).toArray(new svm_node[0]);
612                                            if(columns.length > 1) {
613                                                    max_index = Math.max(max_index, xlist.get(p-1).index);
614                                            }
615                                            i++;
616                                            xlist.clear();
617                                    } catch (ArrayIndexOutOfBoundsException e) {
618                                            throw new LibsvmException("Cannot read from the instance file. ", e);
619                                    }
620                            }
621                            fp.close();     
622                            if (param.gamma == 0) {
623                                    param.gamma = 1.0/max_index;
624                            }
625                            xlist = null;
626                    } catch (IOException e) {
627                            throw new LibsvmException("Cannot read from the instance file. ", e);
628                    }
629                    return prob;
630            }
631            
632            protected void initSpecialParameters() throws MaltChainedException {
633                    if (getConfiguration().getOptionValue("singlemalt", "null_value") != null && getConfiguration().getOptionValue("singlemalt", "null_value").toString().equalsIgnoreCase("none")) {
634                            excludeNullValues = true;
635                    } else {
636                            excludeNullValues = false;
637                    }
638                    saveInstanceFiles = ((Boolean)getConfiguration().getOptionValue("libsvm", "save_instance_files")).booleanValue();
639                            
640                    if (!getConfiguration().getOptionValue("libsvm", "libsvm_external").toString().equals("")) {
641                            try {
642                                    if (!new File(getConfiguration().getOptionValue("libsvm", "libsvm_external").toString()).exists()) {
643                                            throw new LibsvmException("The path to the external LIBSVM trainer 'svm-train' is wrong.");
644                                    }
645                                    if (new File(getConfiguration().getOptionValue("libsvm", "libsvm_external").toString()).isDirectory()) {
646                                            throw new LibsvmException("The option --libsvm-libsvm_external points to a directory, the path should point at the 'svm-train' file or the 'svm-train.exe' file");
647                                    }
648                                    if (!(getConfiguration().getOptionValue("libsvm", "libsvm_external").toString().endsWith("svm-train") || getConfiguration().getOptionValue("libsvm", "libsvm_external").toString().endsWith("svm-train.exe"))) {
649                                            throw new LibsvmException("The option --libsvm-libsvm_external does not specify the path to 'svm-train' file or the 'svm-train.exe' file. ");
650                                    }
651                                    pathExternalSVMTrain = getConfiguration().getOptionValue("libsvm", "libsvm_external").toString();
652                            } catch (SecurityException e) {
653                                    throw new LibsvmException("Access denied to the file specified by the option --libsvm-libsvm_external. ", e);
654                            }
655                    }
656                    if (getConfiguration().getOptionValue("libsvm", "verbosity") != null) {
657                            verbosity = Verbostity.valueOf(getConfiguration().getOptionValue("libsvm", "verbosity").toString().toUpperCase());
658                    }
659            }
660            
661            /**
662             * Assign a default value to all svm parameters
663             * 
664             * @param param a svm_parameter object
665             */
666            protected void initParameters(svm_parameter param) throws MaltChainedException {
667                    if (param == null) {
668                            throw new LibsvmException("Svm-parameters cannot be found. ");
669                    }
670                    param.svm_type = svm_parameter.C_SVC;
671                    param.kernel_type = svm_parameter.POLY;
672                    param.degree = 2;
673                    param.gamma = 0.2;      // 1/k
674                    param.coef0 = 0;
675                    param.nu = 0.5;
676                    param.cache_size = 100; 
677                    param.C = 1; 
678                    param.eps = 1.0; 
679                    param.p = 0.1;
680                    param.shrinking = 1;
681                    param.probability = 0;
682                    param.nr_weight = 0;
683                    param.weight_label = new int[0];
684                    param.weight = new double[0];
685            }
686            
687            /**
688             * Returns a string containing all svm-parameters of interest
689             * 
690             * @param param a svm_parameter object
691             * @return a string containing all svm-parameters of interest
692             */
693            public String toStringParameters(svm_parameter param)  {
694                    if (param == null) {
695                            throw new IllegalArgumentException("Svm-parameters cannot be found. ");
696                    }
697                    final StringBuffer sb = new StringBuffer();
698                    
699                    final String[] svmtypes = {"C_SVC", "NU_SVC","ONE_CLASS","EPSILON_SVR","NU_SVR"};
700                    final String[] kerneltypes = {"LINEAR", "POLY","RBF","SIGMOID","PRECOMPUTED"};
701                    final DecimalFormat dform = new DecimalFormat("#0.0#"); 
702                    final DecimalFormatSymbols sym = new DecimalFormatSymbols();
703                    sym.setDecimalSeparator('.');
704                    dform.setDecimalFormatSymbols(sym);
705                    sb.append("LIBSVM SETTINGS\n");
706                    sb.append("  SVM type      : " + svmtypes[param.svm_type] + " (" + param.svm_type + ")\n");
707                    sb.append("  Kernel        : " + kerneltypes[param.kernel_type] + " (" + param.kernel_type + ")\n");
708                    if (param.kernel_type == svm_parameter.POLY) {
709                            sb.append("  Degree        : " + param.degree + "\n");
710                    }
711                    if (param.kernel_type == svm_parameter.POLY || param.kernel_type == svm_parameter.RBF || param.kernel_type == svm_parameter.SIGMOID) {
712                            sb.append("  Gamma         : " + dform.format(param.gamma) + "\n");
713                            if (param.kernel_type == svm_parameter.POLY || param.kernel_type == svm_parameter.SIGMOID) {
714                                    sb.append("  Coef0         : " + dform.format(param.coef0) + "\n");
715                            }
716                    }
717                    if (param.svm_type == svm_parameter.NU_SVC || param.svm_type == svm_parameter.NU_SVR || param.svm_type == svm_parameter.ONE_CLASS) {
718                            sb.append("  Nu            : " + dform.format(param.nu) + "\n");
719                    }
720                    sb.append("  Cache Size    : " + dform.format(param.cache_size) + " MB\n");
721                    if (param.svm_type == svm_parameter.C_SVC || param.svm_type == svm_parameter.NU_SVR || param.svm_type == svm_parameter.EPSILON_SVR) {
722                            sb.append("  C             : " + dform.format(param.C) + "\n");
723                    }
724                    sb.append("  Eps           : " + dform.format(param.eps) + "\n");
725                    if (param.svm_type == svm_parameter.EPSILON_SVR) {
726                            sb.append("  P             : " + dform.format(param.p) + "\n");
727                    }
728                    sb.append("  Shrinking     : " + param.shrinking + "\n");
729                    sb.append("  Probability   : " + param.probability + "\n");
730                    if (param.svm_type == svm_parameter.C_SVC) {
731                            sb.append("  #Weight       : " + param.nr_weight + "\n");
732                            if (param.nr_weight > 0) {
733                                    sb.append("  Weight labels : ");
734                                    for (int i = 0; i < param.nr_weight; i++) {
735                                            sb.append(param.weight_label[i]);
736                                            if (i != param.nr_weight-1) {
737                                                    sb.append(", ");
738                                            }
739                                    }
740                                    sb.append("\n");
741                                    for (int i = 0; i < param.nr_weight; i++) {
742                                            sb.append(dform.format(param.weight));
743                                            if (i != param.nr_weight-1) {
744                                                    sb.append(", ");
745                                            }
746                                    }
747                                    sb.append("\n");
748                            }
749                    }
750                    return sb.toString();
751            }
752            
753            public String[] getSVMParamStringArray(svm_parameter param) {
754                    final ArrayList<String> params = new ArrayList<String>();
755    
756                    if (param.svm_type != 0) {
757                            params.add("-s"); params.add(new Integer(param.svm_type).toString());
758                    }
759                    if (param.kernel_type != 2) {
760                            params.add("-t"); params.add(new Integer(param.kernel_type).toString());
761                    }
762                    if (param.degree != 3) {
763                            params.add("-d"); params.add(new Integer(param.degree).toString());
764                    }
765                    params.add("-g"); params.add(new Double(param.gamma).toString());
766                    if (param.coef0 != 0) {
767                            params.add("-r"); params.add(new Double(param.coef0).toString());
768                    }
769                    if (param.nu != 0.5) {
770                            params.add("-n"); params.add(new Double(param.nu).toString());
771                    }
772                    if (param.cache_size != 100) {
773                            params.add("-m"); params.add(new Double(param.cache_size).toString());
774                    }
775                    if (param.C != 1) {
776                            params.add("-c"); params.add(new Double(param.C).toString());
777                    }
778                    if (param.eps != 0.001) {
779                            params.add("-e"); params.add(new Double(param.eps).toString());
780                    }
781                    if (param.p != 0.1) {
782                            params.add("-p"); params.add(new Double(param.p).toString());
783                    }
784                    if (param.shrinking != 1) {
785                            params.add("-h"); params.add(new Integer(param.shrinking).toString());
786                    }
787                    if (param.probability != 0) {
788                            params.add("-b"); params.add(new Integer(param.probability).toString());
789                    }
790    
791                    return params.toArray(new String[params.size()]);
792            }
793            
794            /**
795             * Parses the parameter string. The parameter string must contain parameter and value pairs, which are separated by a blank 
796             * or a underscore. The parameter begins with a character '-' followed by a one-character flag and the value must comply with
797             * the parameters data type. Some examples:
798             * 
799             * -s 0 -t 1 -d 2 -g 0.4 -e 0.1
800             * -s_0_-t_1_-d_2_-g_0.4_-e_0.1
801             * 
802             * @param paramstring   the parameter string 
803             * @param param a svm_parameter object
804             * @throws LibsvmException
805             */
806            public void parseParameters(String paramstring, svm_parameter param) throws MaltChainedException {
807                    if (param == null) {
808                            throw new LibsvmException("Svm-parameters cannot be found. ");
809                    }
810                    if (paramstring == null) {
811                            return;
812                    }
813                    final String[] argv;
814                    try {
815                            argv = paramstring.split("[_\\p{Blank}]");
816                    } catch (PatternSyntaxException e) {
817                            throw new LibsvmException("Could not split the svm-parameter string '"+paramstring+"'. ", e);
818                    }
819                    for (int i=0; i < argv.length-1; i++) {
820                            if(argv[i].charAt(0) != '-') {
821                                    throw new LibsvmException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
822                            }
823                            if(++i>=argv.length) {
824                                    throw new LibsvmException("The last argument does not have any value. ");
825                            }
826                            try {
827                                    switch(argv[i-1].charAt(1)) {
828                                    case 's':
829                                            param.svm_type = Integer.parseInt(argv[i]);
830                                            break;
831                                    case 't':
832                                            param.kernel_type = Integer.parseInt(argv[i]);
833                                            break;
834                                    case 'd':
835                                            param.degree = Integer.parseInt(argv[i]);
836                                            break;
837                                    case 'g':
838                                            param.gamma = Double.valueOf(argv[i]).doubleValue();
839                                            break;
840                                    case 'r':
841                                            param.coef0 = Double.valueOf(argv[i]).doubleValue();
842                                            break;
843                                    case 'n':
844                                            param.nu = Double.valueOf(argv[i]).doubleValue();
845                                            break;
846                                    case 'm':
847                                            param.cache_size = Double.valueOf(argv[i]).doubleValue();
848                                            break;
849                                    case 'c':
850                                            param.C = Double.valueOf(argv[i]).doubleValue();
851                                            break;
852                                    case 'e':
853                                            param.eps = Double.valueOf(argv[i]).doubleValue();
854                                            break;
855                                    case 'p':
856                                            param.p = Double.valueOf(argv[i]).doubleValue();
857                                            break;
858                                    case 'h':
859                                            param.shrinking = Integer.parseInt(argv[i]);
860                                            break;
861                                case 'b':
862                                            param.probability = Integer.parseInt(argv[i]);
863                                            break;
864                                    case 'w':
865                                            ++param.nr_weight;
866                                            {
867                                                    int[] old = param.weight_label;
868                                                    param.weight_label = new int[param.nr_weight];
869                                                    System.arraycopy(old,0,param.weight_label,0,param.nr_weight-1);
870                                            }
871            
872                                            {
873                                                    double[] old = param.weight;
874                                                    param.weight = new double[param.nr_weight];
875                                                    System.arraycopy(old,0,param.weight,0,param.nr_weight-1);
876                                            }
877            
878                                            param.weight_label[param.nr_weight-1] = Integer.parseInt(argv[i].substring(2));
879                                            param.weight[param.nr_weight-1] = Double.valueOf(argv[i]).doubleValue();
880                                            break;
881                                    case 'Y':
882                                    case 'V':
883                                    case 'S':
884                                    case 'F':
885                                    case 'T':
886                                    case 'M':
887                                    case 'N':
888                                            break;
889                                    default:
890                                            throw new LibsvmException("Unknown svm parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");         
891                                    }
892                            } catch (ArrayIndexOutOfBoundsException e) {
893                                    throw new LibsvmException("The svm-parameter '"+argv[i-1]+"' could not convert the string value '"+argv[i]+"' into a correct numeric value. ", e);
894                            } catch (NumberFormatException e) {
895                                    throw new LibsvmException("The svm-parameter '"+argv[i-1]+"' could not convert the string value '"+argv[i]+"' into a correct numeric value. ", e);      
896                            } catch (NullPointerException e) {
897                                    throw new LibsvmException("The svm-parameter '"+argv[i-1]+"' could not convert the string value '"+argv[i]+"' into a correct numeric value. ", e);      
898                            }
899                    }
900            }
901    
902            public void svm_predict_with_kbestlist(svm_model model, svm_node[] x, KBestList kBestList) throws MaltChainedException {
903                    int i;
904                    final int nr_class = svm.svm_get_nr_class(model);
905                    final double[] dec_values = new double[nr_class*(nr_class-1)/2];
906                    svm.svm_predict_values(model, x, dec_values);
907    
908                    final int[] vote = new int[nr_class];
909                    final double[] score = new double[nr_class];
910                    final int[] voteindex = new int[nr_class];
911                    for(i=0;i<nr_class;i++) {
912                            vote[i] = 0;
913                            score[i] = 0.0;
914                            voteindex[i] = i;
915                    }
916                    int pos=0;
917                    for(i=0;i<nr_class;i++) {
918                            for(int j=i+1;j<nr_class;j++) {
919                                    if(dec_values[pos] > 0) {
920                                            vote[i]++;
921                                    } else {
922                                            vote[j]++;
923                                    }
924                                    score[i] += dec_values[pos];
925                                    score[j] += dec_values[pos];
926                                    pos++;
927                            }
928                    }
929                    for(i=0;i<nr_class;i++) {
930                            score[i] = score[i]/nr_class;
931                    }
932                    int lagest, tmpint;
933                    double tmpdouble;
934                    for (i=0;i<nr_class-1;i++) {
935                            lagest = i;
936                            for (int j=i;j<nr_class;j++) {
937                                    if (vote[j] > vote[lagest]) {
938                                            lagest = j;
939                                    }
940                            }
941                            tmpint = vote[lagest];
942                            vote[lagest] = vote[i];
943                            vote[i] = tmpint;
944                            tmpdouble = score[lagest];
945                            score[lagest] = score[i];
946                            score[i] = tmpdouble;
947                            tmpint = voteindex[lagest];
948                            voteindex[lagest] = voteindex[i];
949                            voteindex[i] = tmpint;
950                    }
951                    final int[] labels = new int[nr_class];
952                    svm.svm_get_labels(model, labels);
953                    int k = nr_class-1;
954                    if (kBestList.getK() != -1) {
955                            k = kBestList.getK() - 1;
956                    }
957                    
958                    for (i=0; i<nr_class && k >= 0; i++, k--) {
959                            if (vote[i] > 0 || i == 0) {
960                                    if (kBestList instanceof ScoredKBestList) {
961                                            ((ScoredKBestList)kBestList).add(labels[voteindex[i]], (float)vote[i]/(float)(nr_class*(nr_class-1)/2));
962                                    } else {
963                                            kBestList.add(labels[voteindex[i]]);
964                                    }
965                            }
966                    }
967            }
968            /**
969             * Converts the instance file (Malt's own SVM format) into the LIBSVM (SVMLight) format. The input instance file is removed (replaced)
970             * by the instance file in the LIBSVM (SVMLight) format. If a column contains -1, the value will be removed in destination file. 
971             * 
972             * @param isr the input stream reader for the source instance file
973             * @param osw   the output stream writer for the destination instance file
974             * @param cardinalities a vector containing the number of distinct values for a particular column
975             * @throws LibsvmException
976             */
977            public static void maltSVMFormat2OriginalSVMFormat(InputStreamReader isr, OutputStreamWriter osw, int[] cardinalities) throws MaltChainedException {
978                    try {
979                            final BufferedReader in = new BufferedReader(isr);
980                            final BufferedWriter out = new BufferedWriter(osw);
981    
982                            int c;
983                            int j = 0;
984                            int offset = 0;
985                            int code = 0;
986                            while(true) {
987                                    c = in.read();
988                                    if (c == -1) {
989                                            break;
990                                    }
991                                    
992                                    if (c == '\t' || c == '|') {
993                                            if (j == 0) {
994                                                    out.write(Integer.toString(code));
995                                                    j++;
996                                            } else {
997                                                    if (code != -1) {
998                                                            out.write(' ');
999                                                            out.write(Integer.toString(code+offset));
1000                                                            out.write(":1");
1001                                                    }
1002                                                    if (c == '\t') {
1003                                                            offset += cardinalities[j-1];
1004                                                            j++;
1005                                                    }
1006                                            }
1007                                            code = 0;
1008                                    } else if (c == '\n') {
1009                                            j = 0;
1010                                            offset = 0;
1011                                            out.write('\n');
1012                                            code = 0;
1013                                    } else if (c == '-') {
1014                                            code = -1;
1015                                    } else if (code != -1) {
1016                                            if (c > 47 && c < 58) {
1017                                                    code = code * 10 + (c-48);
1018                                            } else {
1019                                                    throw new LibsvmException("The instance file contain a non-integer value, when converting the Malt SVM format into LIBSVM format.");
1020                                            }
1021                                    }       
1022                            }                       
1023                            in.close();     
1024                            out.close();
1025                    } catch (IOException e) {
1026                            throw new LibsvmException("Cannot read from the instance file, when converting the Malt SVM format into LIBSVM format. ", e);
1027                    }
1028            }
1029            
1030            protected void finalize() throws Throwable {
1031                    try {
1032                            closeInstanceWriter();
1033                    } finally {
1034                            super.finalize();
1035                    }
1036            }
1037            
1038            /* (non-Javadoc)
1039             * @see java.lang.Object#toString()
1040             */
1041            public String toString() {
1042                    final StringBuffer sb = new StringBuffer();
1043                    sb.append("\nLIBSVM INTERFACE\n");
1044                    sb.append("  LIBSVM version: "+LIBSVM_VERSION+"\n");
1045                    sb.append("  SVM-param string: "+paramString+"\n");
1046                    
1047                    sb.append(toStringParameters(svmParam));
1048                    return sb.toString();
1049            }
1050    }