package org.maltparser.parser;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.OutputStreamWriter;
import java.lang.reflect.InvocationTargetException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Formatter;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.log4j.FileAppender;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;
import org.maltparser.core.config.ConfigurationDir;
import org.maltparser.core.config.ConfigurationException;
import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.feature.FeatureModelManager;
import org.maltparser.core.feature.system.FeatureEngine;
import org.maltparser.core.helper.SystemLogger;
import org.maltparser.core.helper.URLFinder;
import org.maltparser.core.io.dataformat.DataFormatInstance;
import org.maltparser.core.options.OptionManager;
import org.maltparser.core.plugin.PluginLoader;
import org.maltparser.core.propagation.PropagationException;
import org.maltparser.core.propagation.PropagationManager;
import org.maltparser.core.symbol.SymbolTableHandler;
import org.maltparser.core.syntaxgraph.DependencyStructure;
import org.maltparser.parser.guide.ClassifierGuide;

/**
 * A single-model parser configuration that runs in one of two modes:
 * {@link #LEARN} (a {@link Trainer} is created) or {@link #PARSE}
 * (a {@link Parser} is created).
 *
 * <p>The configuration owns the configuration logger, the feature model
 * manager, the optional propagation manager and the parsing algorithm, and
 * forwards file/stream access to its {@link ConfigurationDir}.</p>
 *
 * @author Johan Hall
 */
public class SingleMalt implements DependencyParserConfig {
    /** Constructor signature looked up reflectively on the configured parser factory class. */
    public final static Class<?>[] paramTypes = { org.maltparser.parser.DependencyParserConfig.class };
    /** Mode value selecting training. */
    public static final int LEARN = 0;
    /** Mode value selecting parsing. */
    public static final int PARSE = 1;

    protected ConfigurationDir configDir;
    protected Logger configLogger;
    protected int optionContainerIndex;
    protected ParsingAlgorithm parsingAlgorithm = null;
    protected int mode;
    protected SymbolTableHandler symbolTableHandler;
    protected DataFormatInstance dataFormatInstance;
    protected FeatureModelManager featureModelManager;
    protected long startTime;
    protected long endTime;
    protected int nIterations = 0;
    protected PropagationManager propagationManager;
    private Parser parser;
    private Trainer trainer;
    private AbstractParserFactory parserFactory;

    /**
     * Initializes this configuration: stores the arguments, sets up the
     * configuration logger, creates the parser factory, validates option
     * dependencies (learn mode only), and then initializes propagation, the
     * feature system and the parsing algorithm. Finally the data format
     * location(s) are logged at info level.
     *
     * @param containerIndex the option container index used for all option lookups
     * @param dataFormatInstance the data format instance
     * @param symbolTableHandler the symbol table handler
     * @param configDir the configuration directory
     * @param mode {@link #LEARN} or {@link #PARSE}
     * @throws MaltChainedException if any initialization step fails
     */
    public void initialize(int containerIndex, DataFormatInstance dataFormatInstance, SymbolTableHandler symbolTableHandler, ConfigurationDir configDir, int mode) throws MaltChainedException {
        this.optionContainerIndex = containerIndex;
        this.mode = mode;
        setConfigurationDir(configDir);
        startTime = System.currentTimeMillis();
        configLogger = initConfigLogger(getOptionValue("config", "logfile").toString(), getOptionValue("config", "logging").toString());
        this.dataFormatInstance = dataFormatInstance;
        this.symbolTableHandler = symbolTableHandler;
        this.parserFactory = makeParserFactory();
        if (mode == SingleMalt.LEARN) {
            checkOptionDependency();
        }
        initPropagation();
        initFeatureSystem();
        initParsingAlgorithm();

        if (configLogger.isInfoEnabled()) {
            logDataFormats(configDir.getInputFormatURL(), configDir.getOutputFormatURL());
        }
    }

    /**
     * Logs the data format location(s). A single line is written when there is
     * no output format or when it equals the input format; otherwise one line
     * each for input and output. Nothing is logged without an input format.
     */
    private void logDataFormats(URL inputFormatURL, URL outputFormatURL) {
        if (inputFormatURL == null) {
            return;
        }
        final String input = inputFormatURL.toString();
        if (outputFormatURL == null || outputFormatURL.toString().equals(input)) {
            configLogger.info(" Data Format : "+afterFirstExclamation(input)+"\n");
        } else {
            configLogger.info(" Input Data Format : "+afterFirstExclamation(input)+"\n");
            configLogger.info(" Output Data Format : "+afterFirstExclamation(outputFormatURL.toString())+"\n");
        }
    }

    /**
     * Returns the part of {@code location} following the first '!' character,
     * or {@code location} unchanged when it contains none.
     */
    private static String afterFirstExclamation(String location) {
        final int index = location.indexOf('!');
        return index == -1 ? location : location.substring(index + 1);
    }

    /**
     * Loads the propagation specification named by the
     * {@code singlemalt/propagation} option, if one is given. In learn mode the
     * specification is first copied into the configuration and the option is
     * overloaded with the copied name.
     *
     * @throws MaltChainedException if the specification cannot be copied or loaded
     */
    private void initPropagation() throws MaltChainedException {
        String propagationSpecFileName = getOptionValue("singlemalt", "propagation").toString();
        if (propagationSpecFileName == null || propagationSpecFileName.length() == 0) {
            return;
        }
        propagationManager = new PropagationManager();
        if (mode == SingleMalt.LEARN) {
            propagationSpecFileName = configDir.copyToConfig(propagationSpecFileName);
            OptionManager.instance().overloadOptionValue(optionContainerIndex, "singlemalt", "propagation", propagationSpecFileName);
        }
        if (isLoggerInfoEnabled()) {
            logInfoMessage(" Propagation : " + propagationSpecFileName+"\n");
        }
        propagationManager.loadSpecification(findURL(propagationSpecFileName));
        propagationManager.createPropagations(dataFormatInstance, symbolTableHandler);
    }

    /**
     * Initialize the parsing algorithm: a batch trainer in learn mode or a
     * deterministic parser in parse mode, each in its diagnostics variant when
     * the {@code singlemalt/diagnostics} option is set. For any other mode no
     * algorithm is created.
     *
     * @throws MaltChainedException
     */
    protected void initParsingAlgorithm() throws MaltChainedException {
        boolean diagnostics = (Boolean)getOptionValue("singlemalt", "diagnostics");
        if (mode == LEARN) {
            if (!diagnostics) {
                parsingAlgorithm = trainer = new BatchTrainer(this, symbolTableHandler);
            } else {
                parsingAlgorithm = trainer = new BatchTrainerWithDiagnostics(this, symbolTableHandler);
            }
        } else if (mode == PARSE) {
            if (!diagnostics) {
                parsingAlgorithm = parser = new DeterministicParser(this, symbolTableHandler);
            } else {
                parsingAlgorithm = parser = new DeterministicParserWithDiagnostics(this, symbolTableHandler);
            }
        }
    }

    /**
     * Creates the feature engine and feature model manager and loads the
     * feature model named by the {@code guide/features} option. Files ending in
     * {@code .par} are loaded as "par" specifications together with the
     * pseudo-projective marking strategy and covered-root options.
     *
     * @throws MaltChainedException if the feature system or model cannot be loaded
     */
    protected void initFeatureSystem() throws MaltChainedException {
        final FeatureEngine system = new FeatureEngine();
        system.load("/appdata/features/ParserFeatureSystem.xml");
        system.load(PluginLoader.instance());
        featureModelManager = new FeatureModelManager(system);
        String featureModelFileName = getOptionValue("guide", "features").toString().trim();
        if (featureModelFileName.endsWith(".par")) {
            String markingStrategy = getOptionValue("pproj", "marking_strategy").toString().trim();
            String coveredRoot = getOptionValue("pproj", "covered_root").toString().trim();
            featureModelManager.loadParSpecification(findURL(featureModelFileName), markingStrategy, coveredRoot);
        } else {
            featureModelManager.loadSpecification(findURL(featureModelFileName));
        }
    }

    /**
     * Creates a parser factory specified by the --singlemalt-parsing_algorithm option
     *
     * @return a parser factory
     * @throws MaltChainedException if the factory class cannot be instantiated reflectively
     */
    private AbstractParserFactory makeParserFactory() throws MaltChainedException {
        Class<?> clazz = (Class<?>)getOptionValue("singlemalt", "parsing_algorithm");
        try {
            Object[] arguments = { this };
            return (AbstractParserFactory)clazz.getConstructor(paramTypes).newInstance(arguments);
        } catch (NoSuchMethodException e) {
            throw new ConfigurationException("The parser factory '"+clazz.getName()+"' cannot be initialized. ", e);
        } catch (InstantiationException e) {
            throw new ConfigurationException("The parser factory '"+clazz.getName()+"' cannot be initialized. ", e);
        } catch (IllegalAccessException e) {
            throw new ConfigurationException("The parser factory '"+clazz.getName()+"' cannot be initialized. ", e);
        } catch (InvocationTargetException e) {
            throw new ConfigurationException("The parser factory '"+clazz.getName()+"' cannot be initialized. ", e);
        }
    }

    public AbstractParserFactory getParserFactory() {
        return parserFactory;
    }

    public FeatureModelManager getFeatureModelManager() {
        return featureModelManager;
    }

    /**
     * Processes one sentence according to the current mode.
     *
     * <p>In learn mode {@code arguments[0]} is the system graph and
     * {@code arguments[1]} the gold graph; in parse mode {@code arguments[0]}
     * is the graph to parse. Graphs without tokens are skipped.</p>
     *
     * @param arguments the dependency structures to process
     * @throws MaltChainedException if the arguments are missing (including a
     *         {@code null} array) or of the wrong type, or if processing fails
     */
    public void process(Object[] arguments) throws MaltChainedException {
        if (mode == LEARN) {
            if (arguments == null || arguments.length < 2 || !(arguments[0] instanceof DependencyStructure) || !(arguments[1] instanceof DependencyStructure)) {
                throw new MaltChainedException("The single malt learn task must be supplied with at least two dependency structures. ");
            }
            DependencyStructure systemGraph = (DependencyStructure)arguments[0];
            DependencyStructure goldGraph = (DependencyStructure)arguments[1];
            if (systemGraph.hasTokens() && getGuide() != null) {
                getGuide().finalizeSentence(((Trainer)getAlgorithm()).parse(goldGraph, systemGraph));
            }
        } else if (mode == PARSE) {
            if (arguments == null || arguments.length < 1 || !(arguments[0] instanceof DependencyStructure)) {
                throw new MaltChainedException("The single malt parse task must be supplied with at least one input terminal structure and one output dependency structure. ");
            }
            DependencyStructure processGraph = (DependencyStructure)arguments[0];
            if (processGraph.hasTokens()) {
                parser.parse(processGraph);
            }
        }
    }

    /**
     * Parses {@code graph} with the parser; graphs without tokens are skipped.
     *
     * @param graph the dependency structure to parse
     * @throws MaltChainedException if parsing fails
     */
    public void parse(DependencyStructure graph) throws MaltChainedException {
        if (graph.hasTokens()) {
            parser.parse(graph);
        }
    }

    /**
     * Runs the trainer on a gold/oracle graph pair. When a guide is available
     * the result is additionally passed to the guide's
     * {@code finalizeSentence}. Oracle graphs without tokens are skipped.
     *
     * @param goldGraph the gold-standard dependency structure
     * @param oracleGraph the dependency structure built by the oracle
     * @throws MaltChainedException if the oracle parse fails
     */
    public void oracleParse(DependencyStructure goldGraph, DependencyStructure oracleGraph) throws MaltChainedException {
        if (oracleGraph.hasTokens()) {
            if (getGuide() != null) {
                getGuide().finalizeSentence(trainer.parse(goldGraph, oracleGraph));
            } else {
                trainer.parse(goldGraph, oracleGraph);
            }
        }
    }

    /**
     * Invokes {@code train()} on the trainer, but only when no guide is present.
     *
     * @throws MaltChainedException if training fails
     */
    public void train() throws MaltChainedException {
        if (getGuide() == null) {
            ((Trainer)getAlgorithm()).train();
        }
    }

    /**
     * Terminates the parsing algorithm and the guide (when present), logs the
     * elapsed learning/parsing time and detaches the appenders of a dedicated
     * configuration logger.
     *
     * <p>Safe to call when initialization did not get as far as creating the
     * logger or the parsing algorithm.</p>
     *
     * @param arguments not used by this implementation
     * @throws MaltChainedException if terminating the algorithm or guide fails
     */
    public void terminate(Object[] arguments) throws MaltChainedException {
        final ParsingAlgorithm algorithm = getAlgorithm();
        if (algorithm != null) {
            algorithm.terminate();
        }
        // Looked up after the algorithm has been terminated, as before.
        final ClassifierGuide guide = getGuide();
        if (guide != null) {
            guide.terminate();
        }
        if (mode == LEARN) {
            endTime = System.currentTimeMillis();
            logElapsedTime("Learning time: ", endTime - startTime);
        } else if (mode == PARSE) {
            endTime = System.currentTimeMillis();
            logElapsedTime("Parsing time: ", endTime - startTime);
        }
        // Never strip the appenders of the shared system logger.
        if (configLogger != null && SystemLogger.logger() != configLogger) {
            configLogger.removeAllAppenders();
        }
    }

    /**
     * Logs {@code elapsed} milliseconds as {@code hh:mm:ss (N ms)} at info
     * level, prefixed by {@code label}; a no-op when there is no logger or
     * info logging is disabled.
     */
    private void logElapsedTime(String label, long elapsed) {
        if (isLoggerInfoEnabled()) {
            configLogger.info(label +new Formatter().format("%02d:%02d:%02d", elapsed/3600000, elapsed%3600000/60000, elapsed%60000/1000)+" ("+elapsed+" ms)\n");
        }
    }

    /**
     * Initialize the configuration logger. A file logger writing into the
     * working directory of the configuration is created when a log file name
     * other than "stdout" is given and a configuration directory is set;
     * otherwise the system logger is used.
     *
     * @param logfile the log file name, or empty/"stdout" for the system logger
     * @param level the log level name; unrecognized names fall back to INFO
     * @return the configuration logger
     * @throws MaltChainedException if the log file cannot be created
     */
    public Logger initConfigLogger(String logfile, String level) throws MaltChainedException {
        if (logfile != null && logfile.length() > 0 && !logfile.equalsIgnoreCase("stdout") && configDir != null) {
            configLogger = Logger.getLogger(logfile);
            FileAppender fileAppender = null;
            try {
                fileAppender = new FileAppender(new PatternLayout("%m"),configDir.getWorkingDirectory().getPath()+File.separator+logfile, true);
            } catch(IOException e) {
                throw new ConfigurationException("It is not possible to create a configuration log file. ", e);
            }
            fileAppender.setThreshold(Level.toLevel(level, Level.INFO));
            configLogger.addAppender(fileAppender);
            configLogger.setLevel(Level.toLevel(level, Level.INFO));
        } else {
            configLogger = SystemLogger.logger();
        }

        return configLogger;
    }

    public boolean isLoggerInfoEnabled() {
        return configLogger != null && configLogger.isInfoEnabled();
    }

    public boolean isLoggerDebugEnabled() {
        return configLogger != null && configLogger.isDebugEnabled();
    }

    public void logErrorMessage(String message) {
        configLogger.error(message);
    }

    public void logInfoMessage(String message) {
        configLogger.info(message);
    }

    public void logInfoMessage(char character) {
        configLogger.info(character);
    }

    public void logDebugMessage(String message) {
        configLogger.debug(message);
    }

    /**
     * Writes {@code message} to the configuration information file and flushes
     * it. Nothing is written when the configuration directory has no info
     * file writer.
     *
     * @param message the text to write
     * @throws MaltChainedException if the information file cannot be written
     */
    public void writeInfoToConfigFile(String message) throws MaltChainedException {
        try {
            if (configDir.getInfoFileWriter() != null) {
                configDir.getInfoFileWriter().write(message);
                configDir.getInfoFileWriter().flush();
            }
        } catch (IOException e) {
            throw new ConfigurationException("Could not write to the configuration information file. ", e);
        }
    }

    public Logger getConfigLogger() {
        return configLogger;
    }

    public void setConfigLogger(Logger logger) {
        configLogger = logger;
    }

    public ConfigurationDir getConfigurationDir() {
        return configDir;
    }

    public void setConfigurationDir(ConfigurationDir configDir) {
        this.configDir = configDir;
    }

    public OutputStreamWriter getOutputStreamWriter(String fileName) throws MaltChainedException {
        return configDir.getOutputStreamWriter(fileName);
    }

    public OutputStreamWriter getAppendOutputStreamWriter(String fileName) throws MaltChainedException {
        return configDir.getAppendOutputStreamWriter(fileName);
    }

    public InputStreamReader getInputStreamReader(String fileName) throws MaltChainedException {
        return configDir.getInputStreamReader(fileName);
    }

    public InputStream getInputStreamFromConfigFileEntry(String fileName) throws MaltChainedException {
        return configDir.getInputStreamFromConfigFileEntry(fileName);
    }

    public URL getConfigFileEntryURL(String fileName) throws MaltChainedException {
        return configDir.getConfigFileEntryURL(fileName);
    }

    public File getFile(String fileName) throws MaltChainedException {
        return configDir.getFile(fileName);
    }

    /**
     * Deserializes one object from the named configuration file entry. The
     * entry stream is always closed, also when the object stream header cannot
     * be read.
     *
     * <p>NOTE(review): this uses Java native deserialization; it should only be
     * applied to configuration files from a trusted source.</p>
     *
     * @param fileName the name of the configuration file entry
     * @return the deserialized object
     * @throws MaltChainedException if the entry cannot be opened or deserialized
     */
    public Object getConfigFileEntryObject(String fileName) throws MaltChainedException {
        Object object = null;
        final InputStream entryStream = getInputStreamFromConfigFileEntry(fileName);
        try {
            final ObjectInputStream input = new ObjectInputStream(entryStream);
            try {
                object = input.readObject();
            } catch (Exception e) {
                throw new ConfigurationException("Could not load object '"+fileName+"' from mco-file", e);
            } finally {
                input.close();
            }
        } catch (IOException e) {
            throw new ConfigurationException("Could not load object from '"+fileName+"' in mco-file", e);
        } finally {
            // Covers the case where the ObjectInputStream constructor failed;
            // closing an already closed stream has no effect.
            closeQuietly(entryStream);
        }
        return object;
    }

    /**
     * Reads the named configuration file entry as UTF-8 text. Every line is
     * terminated by a single {@code '\n'} in the result, regardless of the line
     * terminator used in the entry. The reader and the entry stream are closed
     * before returning.
     *
     * @param fileName the name of the configuration file entry
     * @return the content of the entry
     * @throws MaltChainedException if the entry cannot be opened or read
     */
    public String getConfigFileEntryString(String fileName) throws MaltChainedException {
        final StringBuilder sb = new StringBuilder();
        final InputStream entryStream = getInputStreamFromConfigFileEntry(fileName);
        BufferedReader in = null;
        try {
            in = new BufferedReader(new InputStreamReader(entryStream, "UTF-8"));
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
                sb.append('\n');
            }
        } catch (IOException e) {
            throw new ConfigurationException("Could not load string from '"+fileName+"' in mco-file", e);
        } finally {
            closeQuietly(in);
            closeQuietly(entryStream);
        }
        return sb.toString();
    }

    /**
     * Closes {@code resource} if it is non-null, ignoring any failure to close.
     */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Best-effort cleanup: a failure to close must not mask the outcome of the read.
        }
    }

    public int getMode() {
        return mode;
    }

    public Object getOptionValue(String optiongroup, String optionname) throws MaltChainedException {
        return OptionManager.instance().getOptionValue(optionContainerIndex, optiongroup, optionname);
    }

    public String getOptionValueString(String optiongroup, String optionname) throws MaltChainedException {
        return OptionManager.instance().getOptionValueString(optionContainerIndex, optiongroup, optionname);
    }

    public OptionManager getOptionManager() throws MaltChainedException {
        return OptionManager.instance();
    }

    /******************************** MaltParserConfiguration specific ********************************/

    /**
     * Returns the symbol table handler
     *
     * @return the symbol table handler
     */
    public SymbolTableHandler getSymbolTables() {
        return symbolTableHandler;
    }

    public DataFormatInstance getDataFormatInstance() {
        return dataFormatInstance;
    }

    public PropagationManager getPropagationManager() {
        return propagationManager;
    }

    public ParsingAlgorithm getAlgorithm() {
        return parsingAlgorithm;
    }

    /**
     * Returns the guide
     *
     * @return the guide of the parsing algorithm, or {@code null} when no
     *         parsing algorithm has been created
     */
    public ClassifierGuide getGuide() {
        return parsingAlgorithm == null ? null : parsingAlgorithm.getGuide();
    }

    /**
     * Validates and normalizes interdependent options and records the
     * resulting values in the configuration information file (when one exists):
     * <ul>
     * <li>copies the feature model into the configuration, selecting a default
     *     model from the parsing algorithm when none is given;</li>
     * <li>clears {@code data_split_structure}/{@code data_split_column} when
     *     only one of the two is set;</li>
     * <li>upper-cases the decision settings and appends the pseudo-projective
     *     decisions required by the marking strategy and covered-root options.</li>
     * </ul>
     *
     * @throws MaltChainedException if an option cannot be read or overloaded, or
     *         if the information file cannot be written
     */
    public void checkOptionDependency() throws MaltChainedException {
        try {
            writeToInfoFile("\nDEPENDENCIES\n");

            // Copy the feature model file into the configuration directory
            String featureModelFileName = getOptionValue("guide", "features").toString().trim();
            if (featureModelFileName.equals("")) {

                // use default feature model depending on the selected parser algorithm
                OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "features", getOptionValueString("singlemalt", "parsing_algorithm"));
                featureModelFileName = getOptionValue("guide", "features").toString().trim();

                /* START: Temp fix during development of new liblinear and libsvm interface */
                String learner = getOptionValueString("guide", "learner");
                if (!learner.startsWith("lib")) {
                    learner = "lib"+learner;
                }
                /* END: Temp fix during development of new liblinear and libsvm interface */
                featureModelFileName = featureModelFileName.replace("{learner}", learner);
                featureModelFileName = featureModelFileName.replace("{dataformat}", getOptionValue("input", "format").toString().trim().replace(".xml", ""));

                final URLFinder f = new URLFinder();
                featureModelFileName = configDir.copyToConfig(f.findURLinJars(featureModelFileName));
            } else {
                featureModelFileName = configDir.copyToConfig(featureModelFileName);
            }
            OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "features", featureModelFileName);
            if (configDir.getInfoFileWriter() != null) {
                configDir.getInfoFileWriter().write("--guide-features ( -F) "+getOptionValue("guide", "features").toString()+"\n");
            }

            if (getOptionValue("guide", "data_split_column").toString().equals("") && !getOptionValue("guide", "data_split_structure").toString().equals("")) {
                configLogger.warn("Option --guide-data_split_column = '' and --guide-data_split_structure != ''. Option --guide-data_split_structure is overloaded with '', this will cause the parser to induce a single model.\n ");
                OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "data_split_structure", "");
                writeToInfoFile("--guide-data_split_structure ( -s)\n");
            }
            if (!getOptionValue("guide", "data_split_column").toString().equals("") && getOptionValue("guide", "data_split_structure").toString().equals("")) {
                configLogger.warn("Option --guide-data_split_column != '' and --guide-data_split_structure = ''. Option --guide-data_split_column is overloaded with '', this will cause the parser to induce a single model.\n");
                OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "data_split_column", "");
                writeToInfoFile("--guide-data_split_column ( -d)\n");
            }

            String decisionSettings = getOptionValue("guide", "decision_settings").toString().trim();
            String markingStrategy = getOptionValue("pproj", "marking_strategy").toString().trim();
            String coveredRoot = getOptionValue("pproj", "covered_root").toString().trim();
            StringBuilder newDecisionSettings = new StringBuilder();

            if (decisionSettings == null || decisionSettings.length() < 1 || decisionSettings.equals("default")) {
                decisionSettings = "T.TRANS+A.DEPREL";
            } else {
                // Fixed locale: under e.g. a Turkish default locale 'i' would be
                // upper-cased to a dotted capital I and the A.PP* checks below
                // would no longer recognize settings that are already present.
                decisionSettings = decisionSettings.toUpperCase(Locale.ENGLISH);
            }

            if (markingStrategy.equalsIgnoreCase("head") || markingStrategy.equalsIgnoreCase("path") || markingStrategy.equalsIgnoreCase("head+path")) {
                if (!Pattern.matches(".*A\\.PPLIFTED.*", decisionSettings)) {
                    newDecisionSettings.append("+A.PPLIFTED");
                }
            }
            if (markingStrategy.equalsIgnoreCase("path") || markingStrategy.equalsIgnoreCase("head+path")) {
                if (!Pattern.matches(".*A\\.PPPATH.*", decisionSettings)) {
                    newDecisionSettings.append("+A.PPPATH");
                }
            }
            if (!coveredRoot.equalsIgnoreCase("none") && !Pattern.matches(".*A\\.PPCOVERED.*", decisionSettings)) {
                newDecisionSettings.append("+A.PPCOVERED");
            }
            if (!getOptionValue("guide", "decision_settings").toString().equals(decisionSettings) || newDecisionSettings.length() > 0) {
                OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "decision_settings", decisionSettings+newDecisionSettings.toString());
                if (configDir.getInfoFileWriter() != null) {
                    configDir.getInfoFileWriter().write("--guide-decision_settings ( -gds) "+getOptionValue("guide", "decision_settings").toString()+"\n");
                }
            }
            if (configDir.getInfoFileWriter() != null) {
                configDir.getInfoFileWriter().flush();
            }
        } catch (IOException e) {
            throw new ConfigurationException("Could not write to the configuration information file. ", e);
        }
    }

    /**
     * Writes {@code text} to the configuration information file when the
     * configuration directory provides a writer for it; otherwise does nothing.
     */
    private void writeToInfoFile(String text) throws IOException, MaltChainedException {
        if (configDir.getInfoFileWriter() != null) {
            configDir.getInfoFileWriter().write(text);
        }
    }

    /**
     * Resolves a specification file name to a URL. An existing file (as
     * resolved by the configuration directory) is preferred; otherwise the
     * name is looked up as an entry of the configuration file.
     *
     * @param specFileName the specification file name
     * @return the URL of the specification
     * @throws MaltChainedException if the URL is malformed or the entry lookup fails
     */
    private URL findURL(String specFileName) throws MaltChainedException {
        URL url = null;
        File specFile = configDir.getFile(specFileName);
        if (specFile.exists()) {
            try {
                // NOTE(review): the path is not URL-escaped; File.toURI().toURL()
                // would be, but consumers of this URL would have to be checked first.
                url = new URL("file:///"+specFile.getAbsolutePath());
            } catch (MalformedURLException e) {
                throw new PropagationException("Malformed URL: "+specFile, e);
            }
        } else {
            url = configDir.getConfigFileEntryURL(specFileName);
        }
        return url;
    }
}