CogComp · cogcomp-dev · Sep 27, 2018 · Sep 24, 2018 · Sep 24, 2018 · Sep 24, 2018
diff --git a/chunker/doc/performance.txt b/chunker/doc/performance.txt
@@ -1,9 +1,9 @@
-Date: 10/20/2016
+Date: 09/23/2018
 Tested: Qiang (John) Ning
 Contact: [email protected]
 
-Chunker model version: illinois-chunker-model-3.0.77
-Trainset: /shared/corpora/corporaWeb/written/eng/chunking/conll2000distributions/train.txt (trained with 50 iterations.)
+Chunker model version: illinois-chunker-model-4.0.12
+Trainset: /shared/corpora/corporaWeb/written/eng/chunking/conll2000distributions/train.txt (trained with 11 iterations.)
 Testset:
 Gold POS: /shared/corpora/corporaWeb/written/eng/chunking/conll2000distributions/test.txt
 No POS: /shared/corpora/corporaWeb/written/eng/chunking/conll2000distributions/test.noPOS.txt
@@ -12,39 +12,37 @@ Performance:
 With Gold POS
  Label   Precision Recall   F1   LCount PCount
 ----------------------------------------------
-ADJP        76.633 69.635 72.967    438    398
-ADVP        81.862 79.215 80.516    866    838
-CONJP       45.455 55.556 50.000      9     11
-INTJ        50.000 50.000 50.000      2      2
+ADJP        78.000 71.233 74.463    438    400
+ADVP        82.262 79.792 81.008    866    840
+CONJP       50.000 55.556 52.632      9     10
+INTJ       100.000 50.000 66.667      2      1
 LST          0.000  0.000  0.000      5      1
-NP          94.106 93.962 94.034  12422  12403
-PP          96.770 97.776 97.270   4811   4861
-PRT         72.072 75.472 73.733    106    111
-SBAR        88.280 87.290 87.782    535    529
-UCP          0.000  0.000  0.000      0      5
-VP          93.416 93.517 93.466   4658   4663
+NP          94.051 94.051 94.051  12422  12422
+PP          96.694 97.880 97.283   4811   4870
+PRT         73.394 75.472 74.419    106    109
+SBAR        87.902 86.916 87.406    535    529
+VP          93.845 93.946 93.896   4658   4663
 ----------------------------------------------
-O            0.000  0.000  0.000   1244   1274
+O            0.000  0.000  0.000   1214   1221
 ----------------------------------------------
-Overall     93.510 93.393 93.451  23852  23822
-Accuracy    88.763   -      -      -     25096
+Overall     93.613 93.585 93.599  23852  23845
+Accuracy    89.053   -      -      -     25066
 
 With NO POS
  Label   Precision Recall   F1   LCount PCount
 ----------------------------------------------
-ADJP        78.608 69.635 73.850    438    388
-ADVP        80.427 78.291 79.345    866    843
-CONJP       45.455 55.556 50.000      9     11
+ADJP        80.051 72.374 76.019    438    396
+ADVP        80.806 78.753 79.766    866    844
+CONJP       50.000 55.556 52.632      9     10
 INTJ       100.000 50.000 66.667      2      1
 LST          0.000  0.000  0.000      5      0
-NP          94.193 94.019 94.106  12422  12399
-PP          96.656 97.942 97.295   4811   4875
-PRT         60.417 82.075 69.600    106    144
-SBAR        86.813 88.598 87.697    535    546
-UCP          0.000  0.000  0.000      0      4
-VP          94.105 94.246 94.176   4658   4665
+NP          94.224 94.156 94.190  12422  12413
+PP          96.540 98.005 97.267   4811   4884
+PRT         64.444 82.075 72.199    106    135
+SBAR        86.900 88.037 87.465    535    542
+VP          94.427 94.568 94.497   4658   4665
 ----------------------------------------------
-O            0.000  0.000  0.000   1231   1207
+O            0.000  0.000  0.000   1199   1161
 ----------------------------------------------
-Overall     93.529 93.623 93.576  23852  23876
-Accuracy    89.028   -      -      -     25083
+Overall     93.675 93.824 93.750  23852  23890
+Accuracy    89.334   -      -      -     25051
diff --git a/chunker/pom.xml b/chunker/pom.xml
@@ -39,7 +39,7 @@
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>
             <artifactId>illinois-chunker-model</artifactId>
-            <version>3.0.77</version>
+            <version>4.0.12</version>
         </dependency>
         <dependency>
             <groupId>edu.illinois.cs.cogcomp</groupId>

diff --git a/chunker/scripts/mvn_demo.sh b/chunker/scripts/mvn_demo.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
-TESTFILE=test/testIn.txt
-OUTFILE=test/testOut.txt
+TESTFILE=src/test/resources/testIn.txt
+OUTFILE=testOut.txt
 
 mvn exec:java -Dexec.mainClass=edu.illinois.cs.cogcomp.chunker.main.ChunkerDemo -Dexec.args="$TESTFILE $OUTFILE"
 

diff --git a/chunker/scripts/mvn_test_conll.sh b/chunker/scripts/mvn_test_conll.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-TESTFILE=test/testCoNLL.txt
+TESTFILE=src/test/resources/testCoNLL.txt
 
 #	Use the default chunker model
 if [ $# -eq 0 ]; then

diff --git a/chunker/scripts/mvn_validate.sh b/chunker/scripts/mvn_validate.sh
@@ -1,19 +1,21 @@
-TESTFILE=test/testIn.txt
-OUTFILE=test/testOut.txt
-REFFILE=test/testRefOut.txt
+#!/usr/bin/env bash
+TESTFILE=src/test/resources/testIn.txt
+OUTFILE=testOut.txt
+REFFILE=src/test/resources/testRefOut-demo.txt
 
 mvn exec:java -Dexec.mainClass=edu.illinois.cs.cogcomp.chunker.main.ChunkerDemo -Dexec.args="$TESTFILE $OUTFILE"
 
-DIFFFILE=test/testDiff.txt
+DIFFFILE=testDiff.txt
 rm -f ${DIFFFILE}
 diff $REFFILE $OUTFILE > $DIFFFILE
 
 if [ -e ${DIFFFILE} ]; then
     if [ -s ${DIFFFILE} ]; then
-	echo "$0: *** TEST FAILED ***: Differences found between new output and reference output.  See $DIFFFILE for details." 
+	echo "$0: *** TEST FAILED ***: Differences found between new output and reference output.  See $OUTFILE and $DIFFFILE for details."
     else
 	echo "$0: Test passed: no difference between new output and reference output."
 	rm -f $DIFFFILE
+	rm -f $OUTFILE
     fi
 else
     echo "$0: Error: couldn't find the diff file '$DIFFFILE'."

diff --git a/chunker/src/main/java/edu/illinois/cs/cogcomp/chunker/main/ChunkerTrain.java b/chunker/src/main/java/edu/illinois/cs/cogcomp/chunker/main/ChunkerTrain.java
@@ -79,7 +79,7 @@ public void trainModels(String trainingData, String modeldir, String modelname,
      */
     public void trainModelsWithParser(Parser parser) {
         Chunker.isTraining = true;
-
+        chunker.forget();
         // Run the learner
         for (int i = 1; i <= iter; i++) {
             LinkedVector ex;
@@ -97,6 +97,7 @@ public void trainModelsWithParser(Parser parser) {
 
     public void trainModelsWithParser(Parser parser, String modeldir, String modelname, double dev_ratio) {
         Chunker.isTraining = true;
+        chunker.forget();
         double tmpF1 = 0;
         double bestF1 = 0;
         int bestIter = 0;
@@ -107,16 +108,11 @@ public void trainModelsWithParser(Parser parser, String modeldir, String modelna
         // Get the total number of training set
         int cnt = 0;
         LinkedVector ex;
-        while ((ex = (LinkedVector) parser.next()) != null) {
-            cnt++;
-        }
+        while (parser.next() != null) cnt++;
         parser.reset();
         // Get the boundary between train and dev
+        dev_ratio = Math.min(1,Math.max(dev_ratio,0));
         long idx = Math.round(cnt*(1-dev_ratio));
-        if( idx < 0 )
-            idx = 0;
-        if( idx > cnt )
-            idx = cnt;
 
         // Run the learner and save F1 for each iteration
         for (int i = 1; i <= iter; i++) {
@@ -125,10 +121,8 @@ public void trainModelsWithParser(Parser parser, String modeldir, String modelna
                 for (int j = 0; j < ex.size(); j++) {
                     chunker.learn(ex.get(j));
                 }
-                if(cnt>=idx)
-                    break;
-                else
-                    cnt++;
+                if(cnt>=idx) break;
+                cnt++;
             }
             chunker.doneWithRound();
             writeModelsToDisk(modeldir,modelname);
@@ -153,6 +147,7 @@ public void trainModelsWithParser(Parser parser, String modeldir, String modelna
         System.out.println("Best #Iter = "+bestIter+" (F1="+bestF1+")");
         System.out.println("Rerun the learner using best #Iter...");
         // Rerun the learner
+        chunker.forget();
         for (int i = 1; i <= bestIter; i++) {
             while ((ex = (LinkedVector) parser.next()) != null) {
                 for (int j = 0; j < ex.size(); j++) {

diff --git a/chunker/src/test/java/edu/illinois/cs/cogcomp/lbj/chunk/tests/TestDiff.java b/chunker/src/test/java/edu/illinois/cs/cogcomp/lbj/chunk/tests/TestDiff.java
@@ -37,7 +37,7 @@
 public class TestDiff {
     private static final String testFileName = "testIn.txt";
     private static String testFile;
-    private static final String refFileName = "testRefOutput.txt";
+    private static final String refFileName = "testRefOut.txt";
     private static List<String> refSentences;
 
     @Before

diff --git a/chunker/test/testCoNLL.txt → chunker/src/test/resources/testCoNLL.txt b/chunker/test/testCoNLL.txt → chunker/src/test/resources/testCoNLL.txt
diff --git a/chunker/src/test/resources/testOut.txt b/chunker/src/test/resources/testOut.txt
diff --git a/chunker/test/testOut.txt → ...er/src/test/resources/testRefOut-demo.txt b/chunker/test/testOut.txt → ...er/src/test/resources/testRefOut-demo.txt
@@ -1,2 +1,2 @@
-[ADVP Arguably ] [NP both ] [VP were ] [PP on ] [NP notice ] [SBAR that ] [NP their behavior ] [VP was ] [ADVP at ] [ADJP least risky ] [NP Mr. Bush ] [VP had threatened ] [NP a veto ] [ADVP previously ] [NP The volatility ] [VP was ] [ADJP dizzying ] [PP for ] [NP traders ] 
+[ADVP Arguably ] [NP both ] [VP were ] [PP on ] [NP notice ] [SBAR that ] [NP their behavior ] [VP was ] [ADVP at least ] [ADJP risky ] [NP Mr. Bush ] [VP had threatened ] [NP a veto ] [ADVP previously ] [NP The volatility ] [VP was ] [ADJP dizzying ] [PP for ] [NP traders ] 
 (RB Arguably) (, ,) (DT both) (VBD were) (IN on) (NN notice) (IN that) (PRP$ their) (NN behavior) (VBD was) (IN at) (JJS least) (JJ risky) (. .) (NNP Mr.) (NNP Bush) (VBD had) (VBN threatened) (DT a) (NN veto) (RB previously) (. .) (DT The) (NN volatility) (VBD was) (JJ dizzying) (IN for) (NNS traders) (. .) 
diff --git a/...c/test/resources/testRefOutput-Actual.txt → chunker/src/test/resources/testRefOut.txt b/...c/test/resources/testRefOutput-Actual.txt → chunker/src/test/resources/testRefOut.txt
@@ -1,3 +1,3 @@
 [ADVP (RB Arguably) ] (, ,) [NP (DT both) ] [VP (VBD were) ] [PP (IN on) ] [NP (NN notice) ] [SBAR (IN that) ] [NP (PRP$ their) (NN behavior) ] [VP (VBD was) ] [ADVP (IN at) (JJS least) ] [ADJP (JJ risky) ] (. .)
-[NP (NNP Mr.) (NNP Bush) ] [VP (VBD had) (VBN threatened) ] [NP (DT a) (NN veto) ] [ADVP (RB previously) ] (. .) 
-[NP (DT The) (NN volatility) ] [VP (VBD was) ] [ADJP (JJ dizzying) ] [PP (IN for) ] [NP (NNS traders) ] (. .) 
+[NP (NNP Mr.) (NNP Bush) ] [VP (VBD had) (VBN threatened) ] [NP (DT a) (NN veto) ] [ADVP (RB previously) ] (. .)
+[NP (DT The) (NN volatility) ] [VP (VBD was) ] [ADJP (JJ dizzying) ] [PP (IN for) ] [NP (NNS traders) ] (. .)
diff --git a/chunker/src/test/resources/testRefOutput.txt b/chunker/src/test/resources/testRefOutput.txt
diff --git a/chunker/test/testIn.txt b/chunker/test/testIn.txt
diff --git a/chunker/test/testRefOut.txt b/chunker/test/testRefOut.txt
diff --git a/pipeline/README.md b/pipeline/README.md
@@ -39,7 +39,7 @@ of several other components for which it is a dependency.
 6. [Constituency Parser](http://nlp.stanford.edu/software/lex-parser.shtml) (Stanford): 1G, no dependencies.
 6. [Dependency Parser](http://nlp.stanford.edu/software/lex-parser.shtml) (Stanford): shares resources of Constituency parser so no individual footprint; no dependencies.
 7. Dependency Parser (CogComp): <1G requires Part-of-Speech tagger, Chunker.
-8. Verb Semantic Role Labeler: 4G, requires Lemmatizer, Part-of-Speech, Named Entity Recognizer (CoNLL),
+8. Verb Semantic Role Labeler: ~40G (see [issue #656](https://github.com/CogComp/cogcomp-nlp/issues/656)), requires Lemmatizer, Part-of-Speech, Shallow Parsing, Named Entity Recognizer (CoNLL),
    Constituency Parser.
 9. Noun Semantic Role Labeler: 1G, requires Lemmatizer, Part-of-Speech, Named Entity Recognizer (CoNLL),
    Constituency Parser.