001 package org.maltparser.core.syntaxgraph.reader;
002
003 import java.io.BufferedReader;
004 import java.io.FileInputStream;
005 import java.io.FileNotFoundException;
006 import java.io.IOException;
007 import java.io.InputStream;
008 import java.io.InputStreamReader;
009 import java.io.UnsupportedEncodingException;
010 import java.net.URL;
011 import java.util.Iterator;
012 import java.util.SortedMap;
013 import java.util.TreeMap;
014 import java.util.regex.PatternSyntaxException;
015
016 import org.maltparser.core.exception.MaltChainedException;
017 import org.maltparser.core.io.dataformat.ColumnDescription;
018 import org.maltparser.core.io.dataformat.DataFormatException;
019 import org.maltparser.core.io.dataformat.DataFormatInstance;
020 import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
021 import org.maltparser.core.syntaxgraph.PhraseStructure;
022 import org.maltparser.core.syntaxgraph.TokenStructure;
023 import org.maltparser.core.syntaxgraph.edge.Edge;
024 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
025
026 /**
027 *
028 *
029 * @author Johan Hall
030 */
031 public class NegraReader implements SyntaxGraphReader {
032 private enum NegraTables {
033 ORIGIN, EDITOR, WORDTAG, MORPHTAG, NODETAG, EDGETAG, SECEDGETAG, SENTENCE, UNDEF
034 };
035 private BufferedReader reader;
036 private DataFormatInstance dataFormatInstance;
037 private int sentenceCount;
038 private String optionString;
039 private int formatVersion;
040 private NegraTables currentHeaderTable;
041 private int currentTerminalSize;
042 private int currentNonTerminalSize;
043 private SortedMap<Integer,PhraseStructureNode> nonterminals;
044 private StringBuilder edgelabelSymbol;
045 private StringBuilder edgelabelTableName;
046 private int START_ID_OF_NONTERMINALS = 500;
047 private String fileName = null;
048 private URL url = null;
049 private String charsetName;
050 private int nIterations;
051 private int cIterations;
052 private boolean closeStream = true;
053
054 public NegraReader() {
055 currentHeaderTable = NegraTables.UNDEF;
056 edgelabelSymbol = new StringBuilder();
057 edgelabelTableName = new StringBuilder();
058 nonterminals = new TreeMap<Integer,PhraseStructureNode>();
059 nIterations = 1;
060 cIterations = 1;
061 }
062
063 private void reopen() throws MaltChainedException {
064 close();
065 if (fileName != null) {
066 open(fileName, charsetName);
067 } else if (url != null) {
068 open(url, charsetName);
069 } else {
070 throw new DataFormatException("The input stream cannot be reopen. ");
071 }
072 }
073
074 public void open(String fileName, String charsetName) throws MaltChainedException {
075 setFileName(fileName);
076 setCharsetName(charsetName);
077 try {
078 open(new FileInputStream(fileName), charsetName);
079 } catch (FileNotFoundException e) {
080 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
081 }
082 }
083 public void open(URL url, String charsetName) throws MaltChainedException {
084 setUrl(url);
085 setCharsetName(charsetName);
086 try {
087 open(url.openStream(), charsetName);
088 } catch (IOException e) {
089 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
090 }
091 }
092
093 public void open(InputStream is, String charsetName) throws MaltChainedException {
094 try {
095 if (is == System.in) {
096 closeStream = false;
097 }
098 open(new InputStreamReader(is, charsetName));
099 } catch (UnsupportedEncodingException e) {
100 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
101 }
102 }
103
104 private void open(InputStreamReader isr) throws MaltChainedException {
105 setReader(new BufferedReader(isr));
106 setSentenceCount(0);
107 }
108
109 public void readProlog() throws MaltChainedException {
110
111 }
112
113 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
114 if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) {
115 return false;
116 }
117 syntaxGraph.clear();
118 final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
119 PhraseStructureNode parent = null;
120 PhraseStructureNode child = null;
121 currentHeaderTable = NegraTables.UNDEF;
122 String line = null;
123 syntaxGraph.clear();
124 syntaxGraph.getSymbolTables().cleanUp();
125 nonterminals.clear();
126 try {
127 while (true) {
128 line = reader.readLine();
129 if (line == null) {
130 if (syntaxGraph.hasTokens()) {
131 sentenceCount++;
132 if (syntaxGraph instanceof MappablePhraseStructureGraph) {
133 ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
134 }
135 }
136 if (cIterations < nIterations) {
137 cIterations++;
138 reopen();
139 return true;
140 }
141 return false;
142 } else if (line.startsWith("#EOS")) {
143 currentTerminalSize = 0;
144 currentNonTerminalSize = 0;
145 currentHeaderTable = NegraTables.UNDEF;
146 if (syntaxGraph instanceof MappablePhraseStructureGraph) {
147 ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
148 }
149 return true;
150 } else if (line.startsWith("#BOS")) {
151 currentHeaderTable = NegraTables.SENTENCE;
152 int s = -1, e = -1;
153 for (int i = 5, n = line.length(); i < n; i++) {
154 if (Character.isDigit(line.charAt(i)) && s == -1) {
155 s = i;
156 }
157 if (line.charAt(i) == ' ') {
158 e = i;
159 break;
160 }
161 }
162 if (s != e && s != -1 && e != -1) {
163 phraseStructure.setSentenceID(Integer.parseInt(line.substring(s,e)));
164 }
165 sentenceCount++;
166 } else if (currentHeaderTable == NegraTables.SENTENCE) {
167 if (line.length() >= 2 && line.charAt(0) == '#' && Character.isDigit(line.charAt(1))) { // Non-terminal
168 Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
169 ColumnDescription column = null;
170 currentNonTerminalSize++;
171 char[] lineChars = line.toCharArray();
172 int start = 0;
173 int secedgecounter = 0;
174 for (int i = 0, n = lineChars.length; i < n; i++) {
175 if (lineChars[i] == '\t' && start == i) {
176 start++;
177 } else if (lineChars[i] == '\t' || i == n - 1) {
178 if (columns.hasNext()) {
179 column = columns.next();
180 }
181 if (column.getPosition() == 0) {
182 int index = Integer.parseInt((i == n - 1)?line.substring(start+1):line.substring(start+1, i));
183 child = nonterminals.get(index);
184 if (child == null) {
185 if (index != 0) {
186 child = ((PhraseStructure)syntaxGraph).addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
187 }
188 nonterminals.put(index,child);
189 }
190 } else if (column.getPosition() == 2 && child != null) {
191 syntaxGraph.addLabel(child, "CAT", (i == n - 1)?line.substring(start):line.substring(start, i));
192 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) {
193 edgelabelSymbol.setLength(0);
194 edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
195 edgelabelTableName.setLength(0);
196 edgelabelTableName.append(column.getName());
197 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
198 int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
199 parent = nonterminals.get(index);
200 if (parent == null) {
201 if (index == 0) {
202 parent = phraseStructure.getPhraseStructureRoot();
203 } else {
204 parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
205 }
206 nonterminals.put(index,parent);
207 }
208 Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
209 syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
210 } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
211 if (secedgecounter % 2 == 0) {
212 edgelabelSymbol.setLength(0);
213 edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
214 secedgecounter++;
215 } else {
216 int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
217 if (index == 0) {
218 parent = phraseStructure.getPhraseStructureRoot();
219 } else if (index < START_ID_OF_NONTERMINALS) {
220 parent = phraseStructure.getTokenNode(index);
221 } else {
222 parent = nonterminals.get(index);
223 if (parent == null) {
224 parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
225 nonterminals.put(index,parent);
226 }
227 }
228 Edge e = phraseStructure.addSecondaryEdge(parent, child);
229 e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString());
230 secedgecounter++;
231 }
232 }
233 start = i + 1;
234 }
235 }
236 } else { // Terminal
237 Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
238 ColumnDescription column = null;
239
240 currentTerminalSize++;
241 child = syntaxGraph.addTokenNode(currentTerminalSize);
242 char[] lineChars = line.toCharArray();
243 int start = 0;
244 int secedgecounter = 0;
245 for (int i = 0, n = lineChars.length; i < n; i++) {
246 if (lineChars[i] == '\t' && start == i) {
247 start++;
248 } else if (lineChars[i] == '\t' || i == n - 1) {
249 if (columns.hasNext()) {
250 column = columns.next();
251 }
252 if (column.getCategory() == ColumnDescription.INPUT && child != null) {
253 syntaxGraph.addLabel(child, column.getName(), (i == n - 1)?line.substring(start):line.substring(start, i));
254 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL && child != null) { // && column.getName().equals("EDGELABEL")) {
255 edgelabelSymbol.setLength(0);
256 edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
257 edgelabelTableName.setLength(0);
258 edgelabelTableName.append(column.getName());
259 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
260 int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
261 parent = nonterminals.get(index);
262 if (parent == null) {
263 if (index == 0) {
264 parent = phraseStructure.getPhraseStructureRoot();
265 } else {
266 parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
267 }
268 nonterminals.put(index,parent);
269 }
270
271 Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
272 syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
273 } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
274 if (secedgecounter % 2 == 0) {
275 edgelabelSymbol.setLength(0);
276 edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
277 secedgecounter++;
278 } else {
279 int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
280 if (index == 0) {
281 parent = phraseStructure.getPhraseStructureRoot();
282 } else if (index < START_ID_OF_NONTERMINALS) {
283 parent = phraseStructure.getTokenNode(index);
284 } else {
285 parent = nonterminals.get(index);
286 if (parent == null) {
287 parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
288 nonterminals.put(index,parent);
289 }
290 }
291 Edge e = phraseStructure.addSecondaryEdge(parent, child);
292 e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString());
293 secedgecounter++;
294 }
295 }
296 start = i + 1;
297 }
298 }
299 }
300 } else if (line.startsWith("%%")) { // comment skip
301
302 } else if (line.startsWith("#FORMAT")) {
303 // int index = line.indexOf(' ');
304 // if (index > -1) {
305 // try {
306 // formatVersion = Integer.parseInt(line.substring(index+1));
307 // } catch (NumberFormatException e) {
308 //
309 // }
310 // }
311 } else if (line.startsWith("#BOT")) {
312 // int index = line.indexOf(' ');
313 // if (index > -1) {
314 // if (line.substring(index+1).equals("ORIGIN")) {
315 // currentHeaderTable = NegraTables.ORIGIN;
316 // } else if (line.substring(index+1).equals("EDITOR")) {
317 // currentHeaderTable = NegraTables.EDITOR;
318 // } else if (line.substring(index+1).equals("WORDTAG")) {
319 // currentHeaderTable = NegraTables.WORDTAG;
320 // } else if (line.substring(index+1).equals("MORPHTAG")) {
321 // currentHeaderTable = NegraTables.MORPHTAG;
322 // } else if (line.substring(index+1).equals("NODETAG")) {
323 // currentHeaderTable = NegraTables.NODETAG;
324 // } else if (line.substring(index+1).equals("EDGETAG")) {
325 // currentHeaderTable = NegraTables.EDGETAG;
326 // } else if (line.substring(index+1).equals("SECEDGETAG")) {
327 // currentHeaderTable = NegraTables.SECEDGETAG;
328 // } else {
329 // currentHeaderTable = NegraTables.UNDEF;
330 // }
331 // }
332 } else if (line.startsWith("#EOT")) {
333 currentHeaderTable = NegraTables.UNDEF;
334 }
335 }
336 } catch (IOException e) {
337 throw new DataFormatException("Error when reading from the input file. ", e);
338 }
339 }
340
341 public void readEpilog() throws MaltChainedException {
342
343 }
344
345 public BufferedReader getReader() {
346 return reader;
347 }
348
349 public void setReader(BufferedReader reader) {
350 this.reader = reader;
351 }
352
353 public int getSentenceCount() {
354 return sentenceCount;
355 }
356
357 public void setSentenceCount(int sentenceCount) {
358 this.sentenceCount = sentenceCount;
359 }
360
361 public int getFormatVersion() {
362 return formatVersion;
363 }
364
365 public void setFormatVersion(int formatVersion) {
366 this.formatVersion = formatVersion;
367 }
368
369 public DataFormatInstance getDataFormatInstance() {
370 return dataFormatInstance;
371 }
372
373 public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
374 this.dataFormatInstance = inputDataFormatInstance;
375 }
376
377 public String getOptions() {
378 return optionString;
379 }
380
381 public void setOptions(String optionString) throws MaltChainedException {
382 this.optionString = optionString;
383
384 String[] argv;
385 try {
386 argv = optionString.split("[_\\p{Blank}]");
387 } catch (PatternSyntaxException e) {
388 throw new DataFormatException("Could not split the penn writer option '"+optionString+"'. ", e);
389 }
390 for (int i=0; i < argv.length-1; i++) {
391 if(argv[i].charAt(0) != '-') {
392 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
393 }
394 if(++i>=argv.length) {
395 throw new DataFormatException("The last argument does not have any value. ");
396 }
397 switch(argv[i-1].charAt(1)) {
398 case 's':
399 try {
400 START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
401 } catch (NumberFormatException e){
402 throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
403 }
404 break;
405 default:
406 throw new DataFormatException("Unknown NegraReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");
407 }
408 }
409 }
410
411 public String getFileName() {
412 return fileName;
413 }
414
415 public void setFileName(String fileName) {
416 this.fileName = fileName;
417 }
418
419 public URL getUrl() {
420 return url;
421 }
422
423 public void setUrl(URL url) {
424 this.url = url;
425 }
426
427 public String getCharsetName() {
428 return charsetName;
429 }
430
431 public void setCharsetName(String charsetName) {
432 this.charsetName = charsetName;
433 }
434
435 public int getNIterations() {
436 return nIterations;
437 }
438
439 public void setNIterations(int iterations) {
440 nIterations = iterations;
441 }
442
443 public int getIterationCounter() {
444 return cIterations;
445 }
446
447 public void close() throws MaltChainedException {
448 try {
449 if (reader != null) {
450 if (closeStream) {
451 reader.close();
452 }
453 reader = null;
454 }
455 } catch (IOException e) {
456 throw new DataFormatException("Error when closing the input file.", e);
457 }
458 }
459 }