001 package org.maltparser.parser; 002 003 import java.io.File; 004 import java.io.IOException; 005 import java.net.URL; 006 import java.util.Formatter; 007 import java.util.regex.Pattern; 008 009 import org.apache.log4j.FileAppender; 010 import org.apache.log4j.Level; 011 import org.apache.log4j.Logger; 012 import org.apache.log4j.PatternLayout; 013 import org.maltparser.core.config.ConfigurationDir; 014 import org.maltparser.core.config.ConfigurationException; 015 import org.maltparser.core.config.ConfigurationRegistry; 016 import org.maltparser.core.exception.MaltChainedException; 017 import org.maltparser.core.helper.SystemLogger; 018 import org.maltparser.core.helper.Util; 019 import org.maltparser.core.io.dataformat.DataFormatInstance; 020 import org.maltparser.core.options.OptionManager; 021 import org.maltparser.core.propagation.PropagationManager; 022 import org.maltparser.core.symbol.SymbolTableHandler; 023 import org.maltparser.core.syntaxgraph.DependencyStructure; 024 import org.maltparser.parser.guide.ClassifierGuide; 025 026 /** 027 * @author Johan Hall 028 * 029 */ 030 public class SingleMalt implements DependencyParserConfig { 031 public static final int LEARN = 0; 032 public static final int PARSE = 1; 033 protected ConfigurationDir configDir; 034 protected Logger configLogger; 035 protected int optionContainerIndex; 036 protected Algorithm parsingAlgorithm = null; 037 protected int mode; 038 protected ConfigurationRegistry registry; 039 protected SymbolTableHandler symbolTableHandler; 040 protected DataFormatInstance dataFormatInstance; 041 protected long startTime; 042 protected long endTime; 043 protected int nIterations = 0; 044 protected PropagationManager propagationManager; 045 046 public void initialize(int containerIndex, DataFormatInstance dataFormatInstance, ConfigurationDir configDir, int mode) throws MaltChainedException { 047 048 this.optionContainerIndex = containerIndex; 049 this.mode = mode; 050 setConfigurationDir(configDir); 051 startTime = System.currentTimeMillis(); 052 configLogger = initConfigLogger(getOptionValue("config", "logfile").toString(), getOptionValue("config", "logging").toString()); 053 registry = new ConfigurationRegistry(); 054 this.dataFormatInstance = dataFormatInstance; 055 symbolTableHandler = dataFormatInstance.getSymbolTables(); 056 057 if (mode == SingleMalt.LEARN) { 058 checkOptionDependency(); 059 } 060 registry.put(org.maltparser.core.symbol.SymbolTableHandler.class, getSymbolTables()); 061 registry.put(org.maltparser.core.io.dataformat.DataFormatInstance.class, dataFormatInstance); 062 // registry.put(org.maltparser.parser.DependencyParserConfig.class, this); 063 initPropagation(); 064 initParsingAlgorithm(); 065 if (configLogger.isInfoEnabled()) { 066 URL inputFormatURL = configDir.getInputFormatURL(); 067 URL outputFormatURL = configDir.getOutputFormatURL(); 068 if (inputFormatURL != null) { 069 if (outputFormatURL == null || outputFormatURL.toString().equals(inputFormatURL.toString())) { 070 int index = inputFormatURL.toString().indexOf('!'); 071 if (index == -1) { 072 configLogger.info(" Data Format : "+inputFormatURL.toString()+"\n"); 073 } else { 074 configLogger.info(" Data Format : "+inputFormatURL.toString().substring(index+1)+"\n"); 075 } 076 } else { 077 int indexIn = inputFormatURL.toString().indexOf('!'); 078 int indexOut = outputFormatURL.toString().indexOf('!'); 079 if (indexIn == -1) { 080 configLogger.info(" Input Data Format : "+inputFormatURL.toString()+"\n"); 081 } else { 082 configLogger.info(" Input Data Format : "+inputFormatURL.toString().substring(indexIn+1)+"\n"); 083 } 084 if (indexOut == -1) { 085 configLogger.info(" Output Data Format : "+outputFormatURL.toString()+"\n"); 086 } else { 087 configLogger.info(" Output Data Format : "+outputFormatURL.toString().substring(indexOut+1)+"\n"); 088 } 089 } 090 } 091 } 092 } 093 094 private void initPropagation() throws MaltChainedException { 095 String propagationSpecFileName = getOptionValue("singlemalt", "propagation").toString(); 096 if (propagationSpecFileName == null || propagationSpecFileName.length() == 0) { 097 return; 098 } 099 propagationManager = new PropagationManager(configDir); 100 if (mode == SingleMalt.LEARN) { 101 propagationSpecFileName = configDir.copyToConfig(propagationSpecFileName); 102 OptionManager.instance().overloadOptionValue(optionContainerIndex, "singlemalt", "propagation", propagationSpecFileName); 103 } 104 getConfigLogger().info(" Propagation : " + propagationSpecFileName+"\n"); 105 propagationManager.loadSpecification(propagationSpecFileName); 106 } 107 108 /** 109 * Initialize the parsing algorithm 110 * 111 * @throws MaltChainedException 112 */ 113 protected void initParsingAlgorithm() throws MaltChainedException { 114 if (mode == LEARN) { 115 parsingAlgorithm = new BatchTrainer(this); 116 } else if (mode == PARSE) { 117 parsingAlgorithm = new DeterministicParser(this); 118 } 119 } 120 121 public void addRegistry(Class<?> clazz, Object o) { 122 registry.put(clazz, o); 123 } 124 125 public void process(Object[] arguments) throws MaltChainedException { 126 if (mode == LEARN) { 127 if (arguments.length < 2 || !(arguments[0] instanceof DependencyStructure) || !(arguments[1] instanceof DependencyStructure)) { 128 throw new MaltChainedException("The single malt learn task must be supplied with at least two dependency structures. "); 129 } 130 DependencyStructure systemGraph = (DependencyStructure)arguments[0]; 131 DependencyStructure goldGraph = (DependencyStructure)arguments[1]; 132 if (systemGraph.hasTokens() && getGuide() != null) { 133 getGuide().finalizeSentence(((Trainer)getAlgorithm()).parse(goldGraph, systemGraph)); 134 } 135 } else if (mode == PARSE) { 136 if (arguments.length < 1 || !(arguments[0] instanceof DependencyStructure)) { 137 throw new MaltChainedException("The single malt parse task must be supplied with at least one input terminal structure and one output dependency structure. "); 138 } 139 DependencyStructure processGraph = (DependencyStructure)arguments[0]; 140 if (processGraph.hasTokens()) { 141 ((Parser)getAlgorithm()).parse(processGraph); 142 } 143 } 144 } 145 146 public void parse(DependencyStructure graph) throws MaltChainedException { 147 if (graph.hasTokens()) { 148 ((Parser)getAlgorithm()).parse(graph); 149 } 150 } 151 152 public void oracleParse(DependencyStructure goldGraph, DependencyStructure oracleGraph) throws MaltChainedException { 153 if (oracleGraph.hasTokens()) { 154 if (getGuide() != null) { 155 getGuide().finalizeSentence(((Trainer)getAlgorithm()).parse(goldGraph, oracleGraph)); 156 } else { 157 ((Trainer)getAlgorithm()).parse(goldGraph, oracleGraph); 158 } 159 } 160 } 161 162 public void train() throws MaltChainedException { 163 if (getGuide() == null) { 164 ((Trainer)getAlgorithm()).train(); 165 } 166 } 167 168 public void terminate(Object[] arguments) throws MaltChainedException { 169 // if (getAlgorithm() instanceof Trainer) { 170 // ((Trainer)getAlgorithm()).terminate(); 171 // } 172 getAlgorithm().terminate(); 173 if (getGuide() != null) { 174 getGuide().terminate(); 175 } 176 if (mode == LEARN) { 177 endTime = System.currentTimeMillis(); 178 long elapsed = endTime - startTime; 179 if (configLogger.isInfoEnabled()) { 180 configLogger.info("Learning time: " +new Formatter().format("%02d:%02d:%02d", elapsed/3600000, elapsed%3600000/60000, elapsed%60000/1000)+" ("+elapsed+" ms)\n"); 181 } 182 } else if (mode == PARSE) { 183 endTime = System.currentTimeMillis(); 184 long elapsed = endTime - startTime; 185 if (configLogger.isInfoEnabled()) { 186 configLogger.info("Parsing time: " +new Formatter().format("%02d:%02d:%02d", elapsed/3600000, elapsed%3600000/60000, elapsed%60000/1000)+" ("+elapsed+" ms)\n"); 187 } 188 } 189 if (SystemLogger.logger() != configLogger && configLogger != null) { 190 configLogger.removeAllAppenders(); 191 } 192 } 193 194 /** 195 * Initialize the configuration logger 196 * 197 * @return the configuration logger 198 * @throws MaltChainedException 199 */ 200 public Logger initConfigLogger(String logfile, String level) throws MaltChainedException { 201 if (logfile != null && logfile.length() > 0 && !logfile.equalsIgnoreCase("stdout") && configDir != null) { 202 configLogger = Logger.getLogger(logfile); 203 FileAppender fileAppender = null; 204 try { 205 fileAppender = new FileAppender(new PatternLayout("%m"),configDir.getWorkingDirectory().getPath()+File.separator+logfile, true); 206 } catch(IOException e) { 207 throw new ConfigurationException("It is not possible to create a configuration log file. ", e); 208 } 209 fileAppender.setThreshold(Level.toLevel(level, Level.INFO)); 210 configLogger.addAppender(fileAppender); 211 configLogger.setLevel(Level.toLevel(level, Level.INFO)); 212 } else { 213 configLogger = SystemLogger.logger(); 214 } 215 216 return configLogger; 217 } 218 219 public Logger getConfigLogger() { 220 return configLogger; 221 } 222 223 public void setConfigLogger(Logger logger) { 224 configLogger = logger; 225 } 226 227 public ConfigurationDir getConfigurationDir() { 228 return configDir; 229 } 230 231 public void setConfigurationDir(ConfigurationDir configDir) { 232 this.configDir = configDir; 233 } 234 235 public int getMode() { 236 return mode; 237 } 238 239 public ConfigurationRegistry getRegistry() { 240 return registry; 241 } 242 243 public void setRegistry(ConfigurationRegistry registry) { 244 this.registry = registry; 245 } 246 247 public Object getOptionValue(String optiongroup, String optionname) throws MaltChainedException { 248 return OptionManager.instance().getOptionValue(optionContainerIndex, optiongroup, optionname); 249 } 250 251 public String getOptionValueString(String optiongroup, String optionname) throws MaltChainedException { 252 return OptionManager.instance().getOptionValueString(optionContainerIndex, optiongroup, optionname); 253 } 254 255 public OptionManager getOptionManager() throws MaltChainedException { 256 return OptionManager.instance(); 257 } 258 /******************************** MaltParserConfiguration specific ********************************/ 259 260 /** 261 * Returns the list of symbol tables 262 * 263 * @return the list of symbol tables 264 */ 265 public SymbolTableHandler getSymbolTables() { 266 return symbolTableHandler; 267 } 268 269 public PropagationManager getPropagationManager() { 270 return propagationManager; 271 } 272 273 public Algorithm getAlgorithm() { 274 return parsingAlgorithm; 275 } 276 /** 277 * Returns the guide 278 * 279 * @return the guide 280 */ 281 public ClassifierGuide getGuide() { 282 return parsingAlgorithm.getGuide(); 283 } 284 285 public void checkOptionDependency() throws MaltChainedException { 286 try { 287 if (configDir.getInfoFileWriter() != null) { 288 configDir.getInfoFileWriter().write("\nDEPENDENCIES\n"); 289 } 290 291 // Copy the feature model file into the configuration directory 292 String featureModelFileName = getOptionValue("guide", "features").toString().trim(); 293 if (featureModelFileName.equals("")) { 294 // use default feature model depending on the selected parser algorithm 295 OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "features", getOptionValueString("singlemalt", "parsing_algorithm")); 296 featureModelFileName = getOptionValue("guide", "features").toString().trim(); 297 /* START: Temp fix during development of new liblinear and libsvm interface */ 298 String learner = getOptionValueString("guide", "learner"); 299 if (!learner.startsWith("lib")) { 300 learner = "lib"+learner; 301 } 302 /* END: Temp fix during development of new liblinear and libsvm interface */ 303 featureModelFileName = featureModelFileName.replace("{learner}", learner); 304 featureModelFileName = configDir.copyToConfig(Util.findURLinJars(featureModelFileName)); 305 } else { 306 featureModelFileName = configDir.copyToConfig(featureModelFileName); 307 } 308 OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "features", featureModelFileName); 309 if (configDir.getInfoFileWriter() != null) { 310 configDir.getInfoFileWriter().write("--guide-features ( -F) "+getOptionValue("guide", "features").toString()+"\n"); 311 } 312 313 if (getOptionValue("guide", "data_split_column").toString().equals("") && !getOptionValue("guide", "data_split_structure").toString().equals("")) { 314 configLogger.warn("Option --guide-data_split_column = '' and --guide-data_split_structure != ''. Option --guide-data_split_structure is overloaded with '', this will cause the parser to induce a single model.\n "); 315 OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "data_split_structure", ""); 316 if (configDir.getInfoFileWriter() != null) { 317 configDir.getInfoFileWriter().write("--guide-data_split_structure ( -s)\n"); 318 } 319 } 320 if (!getOptionValue("guide", "data_split_column").toString().equals("") && getOptionValue("guide", "data_split_structure").toString().equals("")) { 321 configLogger.warn("Option --guide-data_split_column != '' and --guide-data_split_structure = ''. Option --guide-data_split_column is overloaded with '', this will cause the parser to induce a single model.\n"); 322 OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "data_split_column", ""); 323 if (configDir.getInfoFileWriter() != null) { 324 configDir.getInfoFileWriter().write("--guide-data_split_column ( -d)\n"); 325 } 326 } 327 328 String decisionSettings = getOptionValue("guide", "decision_settings").toString().trim(); 329 String markingStrategy = getOptionValue("pproj", "marking_strategy").toString().trim(); 330 String coveredRoot = getOptionValue("pproj", "covered_root").toString().trim(); 331 StringBuilder newDecisionSettings = new StringBuilder(); 332 333 if (decisionSettings == null || decisionSettings.length() < 1 || decisionSettings.equals("default")) { 334 decisionSettings = "T.TRANS+A.DEPREL"; 335 } else { 336 decisionSettings = decisionSettings.toUpperCase(); 337 } 338 339 if (markingStrategy.equalsIgnoreCase("head") || markingStrategy.equalsIgnoreCase("path") || markingStrategy.equalsIgnoreCase("head+path")) { 340 if (!Pattern.matches(".*A\\.PPLIFTED.*", decisionSettings)) { 341 newDecisionSettings.append("+A.PPLIFTED"); 342 } 343 } 344 if (markingStrategy.equalsIgnoreCase("path") || markingStrategy.equalsIgnoreCase("head+path")) { 345 if (!Pattern.matches(".*A\\.PPPATH.*", decisionSettings)) { 346 newDecisionSettings.append("+A.PPPATH"); 347 } 348 } 349 if (!coveredRoot.equalsIgnoreCase("none") && !Pattern.matches(".*A\\.PPCOVERED.*", decisionSettings)) { 350 newDecisionSettings.append("+A.PPCOVERED"); 351 } 352 if (!getOptionValue("guide", "decision_settings").toString().equals(decisionSettings) || newDecisionSettings.length() > 0) { 353 OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "decision_settings", decisionSettings+newDecisionSettings.toString()); 354 if (configDir.getInfoFileWriter() != null) { 355 configDir.getInfoFileWriter().write("--guide-decision_settings ( -gds) "+getOptionValue("guide", "decision_settings").toString()+"\n"); 356 } 357 } 358 if (configDir.getInfoFileWriter() != null) { 359 configDir.getInfoFileWriter().flush(); 360 } 361 } catch (IOException e) { 362 throw new ConfigurationException("Could not write to the configuration information file. ", e); 363 } 364 } 365 }