-
Notifications
You must be signed in to change notification settings - Fork 0
/
Main.java
executable file
·148 lines (123 loc) · 5.24 KB
/
Main.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
/*
* Main.java
*
* Created on 30 december 2006, 19:45
*
*/
package BMM_labels;
/**
*
* @author peter
* @author gideon
*/
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
/**
 * Command-line entry point of the BMM_labels grammar inducer.
 *
 * <p>Constructing a {@code Main} parses the command line into the public static
 * configuration fields below, reads the training sentences, builds the initial
 * {@link Induced_Grammar} and, when {@link #DO_STOLCKE} is enabled, runs the
 * iterated merging and writes the results through {@code Printer}.
 *
 * <p>The configuration lives in mutable static fields, so it is shared by the
 * whole JVM; constructing a second {@code Main} overwrites the settings of the
 * first.
 */
public class Main {

    /** Usage text reported when the mandatory arguments are missing. */
    private static final String USAGE =
            "Usage: BMM_labels plain_textfile span_file [postag] [dirichlet] [multigrams] "
            + "[lookahead=la] [beam=b]";

    /** Grammar built by the constructor from the sentences read from the input files. */
    public Induced_Grammar inducedGrammar;

    /**
     * When true, the constructor prints terminals, non-terminals and rules of
     * the initial grammar right after it has been built.
     */
    public final static boolean PRINT_OUTPUT = false;

    // NOTE(review): not read in this class; presumably toggles timing output
    // elsewhere in the package - confirm.
    public static boolean timer = false;

    /**
     * When true, the constructor runs the iterated merging and writes the
     * resulting grammar and parses; when false it stops after building the
     * initial grammar.
     */
    public final static boolean DO_STOLCKE = true;

    /**
     * DO_STOLCKE_CONTROL computes for every accepted merge or chunk the DL
     * values according to the `absolute' values of the Stolcke algorithm, so
     * that it can be compared against the relative values of the eGrids algo,
     * as an extra control
     */
    public final static boolean DO_STOLCKE_CONTROL = false;

    /**
     * although INCLUDE_POISSON option can be enabled, it doesn't make sense for
     * unsupervised labelling so default=false
     */
    public static boolean INCLUDE_POISSON = false;

    // NOTE(review): not read in this class; presumably the mean of the Poisson
    // prior enabled by INCLUDE_POISSON - confirm.
    public static double MU = 3.0;

    /** Enabled by the {@code dirichlet} command-line option. */
    public static boolean INCLUDE_DIRICHLET = false;

    /**
     * dynamic and incremental update of merge bigrams: default=TRUE (faster)
     */
    public static boolean UPDATE_MERGE_BIGRAMS = true;

    /**
     * DO_LHS_MERGES is still not working, because results in clustering
     * everything into TOP category
     */
    public static boolean DO_LHS_MERGES = false;

    public final static boolean PRINT_DEBUG = false;
    public final static boolean PRINT_DUPLICATE_RULES_TO_SCREEN = false;
    public final static boolean PRINT_REMOVED_RULES_TO_SCREEN = false;

    /** When true, duplicate rules are written to file after merging has finished. */
    public final static boolean PRINT_DUPLICATE_RULES_TO_FILE = false;

    /** Path of the plain text file; taken from the first command-line argument. */
    public static String UNLABELED_FILE = "";

    /** Path of the span file; taken from the second command-line argument. */
    public static String SPAN_FILE = "";

    // SPAN_IMPORT_FORMAT_MANUAL only relevant when reading in .csv file
    // with annotation for v=checked, etc
    public static boolean SPAN_IMPORT_FORMAT_MANUAL = false;

    /** Enabled by the {@code postag} command-line option. */
    public static boolean INDUCE_FROM_POSTAGS = false;

    // The misspelled name ("MININUM") is kept on purpose: the field is public
    // and may be referenced from other classes.
    public static int MININUM_REMOVED_SYMBOLS = 0;

    /** Overridden by the {@code lookahead=<int>} command-line option. */
    public static int MAXLOOKAHEAD = 10;

    /** Overridden by the {@code beam=<int>} command-line option. */
    public static int BEAM_SIZE = 10;

    /** Enabled by the {@code multigrams} command-line option. */
    public static boolean INDUCE_MULTIGRAMS = false;

    /** When true, the parses are additionally written to file after merging has finished. */
    public static boolean WRITE_INTERMEDIATE_GRAMMARS_AND_PARSES = false;

    public static String OUTPUT_DIRECTORY = "./Output";

    // NOTE(review): presumably symbol-kind codes used by the grammar classes - confirm.
    public final static int TERMINAL = 1;
    public final static int NONTERMINAL = 2;

    // extra functionality
    /** When true, the grammar is also written to file in the target format. */
    public final static boolean PRINT_TARGET_GRAMMAR = false;
    public final static boolean COMPUTE_TARGET_GRAMMAR_MDL = false;

    /**
     * Filters out only those sentences for which 70% of words in sentence occur
     * more than x times
     */
    public final static boolean FILTER_WSJ_SENTENCES = false;

    // targets a specific number of non-lexical non-terminals
    // ignored if set to 0
    public final static int NON_LEX_NON_TERMINAL_MAX = 0;

    /**
     * Creates a new instance of Main: parses the command line, reads the
     * input, builds the initial grammar and (if {@link #DO_STOLCKE}) runs the
     * induction and writes its output.
     *
     * <p>Usage:
     * {@code BMM_labels plain_textfile span_file [postag] [dirichlet] [multigrams]
     * [lookahead=la] [beam=b]}
     *
     * @param args the command line arguments; the first two are mandatory
     * @throws IllegalArgumentException if fewer than two arguments are given,
     *         or if a {@code lookahead=} / {@code beam=} option has a missing
     *         or non-integer value
     * @throws Exception whatever the reader, the grammar induction or the
     *         printer propagate
     */
    public Main(String[] args) throws Exception {
        // Fail with a usage message instead of a bare ArrayIndexOutOfBoundsException.
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException(USAGE);
        }
        UNLABELED_FILE = args[0];
        SPAN_FILE = args[1];

        // Every argument is scanned, the two file names included, so an option
        // keyword is recognised wherever it appears on the command line.
        for (String s : args) {
            // Locale.ROOT keeps keyword matching independent of the default
            // locale (e.g. Turkish lower-cases 'I' to a dotless i).
            String option = s.toLowerCase(java.util.Locale.ROOT);
            if (option.equals("postag")) {
                INDUCE_FROM_POSTAGS = true;
            }
            if (option.equals("dirichlet")) {
                INCLUDE_DIRICHLET = true;
            }
            if (option.equals("multigrams")) {
                INDUCE_MULTIGRAMS = true;
            }
            if (option.startsWith("lookahead=")) {
                MAXLOOKAHEAD = parseIntOption(s);
            }
            if (option.startsWith("beam=")) {
                BEAM_SIZE = parseIntOption(s);
            }
        }

        Reader myReader = new Reader();
        myReader.readSentencesFromTextFile(Main.UNLABELED_FILE, Main.SPAN_FILE);

        // initialize induced_grammar from training samples
        // creates Terminals, nonTerminals, rules + counts, and initial
        // mergeBigrams
        inducedGrammar = new Induced_Grammar(myReader.getSentences());

        System.out.println("# non-terminals =" + inducedGrammar.nonTerminals.size());
        System.out.println("# TOP-rules =" + inducedGrammar.nonTerminals.get("TOP").getRules().size());

        if (PRINT_OUTPUT) {
            Printer.printoutTerminalsNonTerminalsAndRules(inducedGrammar, "induced");
        }

        if (Main.DO_STOLCKE) {
            // do merging and chunking
            System.out.println("Starting IteratedMergingAndChunking...");
            inducedGrammar.doIteratedMerging();

            System.out.println("Starting writeGrammarToFile...");
            Printer.printMergesAndChunks(inducedGrammar);
            if (Main.PRINT_DUPLICATE_RULES_TO_FILE) {
                Printer.printDuplicateRulesToFile();
            }
            if (Main.PRINT_TARGET_GRAMMAR) {
                Printer.printToFileGrammarTargetFormat(inducedGrammar);
            }
            Printer.printViterbiParsesAndSpans(true, true, false, false);
            if (Main.WRITE_INTERMEDIATE_GRAMMARS_AND_PARSES) {
                Printer.writeParsesToFile(inducedGrammar);
            }
            Printer.printToFileTerminalsNonTerminalsAndRules(inducedGrammar);
        }
    }

    /**
     * Extracts the integer value of a {@code name=value} command-line option.
     * The value is the text between the first and the second {@code '='}.
     *
     * @param option the raw option as typed, e.g. {@code "beam=20"}
     * @return the parsed integer value
     * @throws IllegalArgumentException if the value is missing or is not an integer
     */
    private static int parseIntOption(String option) {
        String[] parts = option.split("=");
        if (parts.length < 2) {
            throw new IllegalArgumentException(
                    "Missing value for option '" + option + "'. " + USAGE);
        }
        try {
            return java.lang.Integer.parseInt(parts[1]);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException(
                    "Value of option '" + option + "' is not an integer. " + USAGE, e);
        }
    }

    /**
     * Runs the inducer; all work happens in the {@link #Main(String[])} constructor.
     *
     * @param args the command line arguments
     * @throws Exception if argument parsing, reading, induction or output fails
     */
    public static void main(String[] args) throws Exception {
        new Main(args);
    }
}