Ingesting 2020 EMNLP, Findings, workshops, and CoNLL (#1045)
* Ingested EMNLP
* Ingested Findings:EMNLP 2020
* Ingested CoNLL and its shared task
* Added EMNLP workshops
* Improvements and changes to ingest.py
   * Corrected capitalization of names (via a fix to ingest.py)
   * Automatically add missing venues (with a note)
* Revision to W19-2111 (closes #1021)
* Normalized venue database; added add_venue.py
* find_name_variants.py
* author_case.py

Co-authored-by: David Chiang <[email protected]>
Co-authored-by: Daniel Gildea <gildea>
mjpost and davidweichiang authored Nov 9, 2020
1 parent 4aaef47 commit ab92b62
Showing 57 changed files with 18,254 additions and 176 deletions.
54 changes: 54 additions & 0 deletions bin/add_venue.py
@@ -0,0 +1,54 @@
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2020 Matt Post <[email protected]>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Adds a venue to the data/yaml/venues.yaml file.
Usage:
add_venue.py acronym "name" [--url URL] [--acl]
"""

import argparse
import os
import sys

from slugify import slugify

from anthology.venues import VenueIndex


def main(args):
datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
venues = VenueIndex(srcdir=datadir)

print(f"Adding '{args.acronym}' ({args.name})")
venues.add_venue(args.acronym, args.name, is_acl=args.acl, url=args.url)

venues.dump(datadir)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("acronym", help="Venue acronym (e.g., BlackboxNLP)")
parser.add_argument(
"name",
help="Venue name (e.g., Workshop on analyzing and interpreting neural networks for NLP)",
)
parser.add_argument("--acl", action="store_true", help="Venue is an ACL venue")
parser.add_argument("--url", help="Venue URL")
args = parser.parse_args()

main(args)
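
For context (not part of the diff): a hypothetical invocation of the new script, reusing the example venue from its own help text, would be

    bin/add_venue.py BlackboxNLP "Workshop on analyzing and interpreting neural networks for NLP" --acl

which records the venue in data/yaml/venues.yaml via VenueIndex.add_venue and VenueIndex.dump.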
3 changes: 3 additions & 0 deletions bin/anthology/data.py
@@ -45,6 +45,9 @@
# Default ingestion date (= unknown)
UNKNOWN_INGEST_DATE = "1900-01-01"

# The venue format must match this pattern
VENUE_FORMAT = r"^[A-Za-z\d]+$"


def get_journal_title(top_level_id, volume_title):
# TODO: consider moving this from code to data (perhaps
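
As a standalone illustration of the new constraint (not part of the commit): venue slugs must be purely alphanumeric, so hyphenated acronyms have to be collapsed before being used as keys.

    import re

    VENUE_FORMAT = r"^[A-Za-z\d]+$"

    assert re.match(VENUE_FORMAT, "blackboxnlp")             # alphanumeric slug: accepted
    assert re.match(VENUE_FORMAT, "conll2020")               # digits are fine
    assert re.match(VENUE_FORMAT, "blackbox-nlp") is None    # hyphens are rejected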
35 changes: 34 additions & 1 deletion bin/anthology/venues.py
@@ -15,8 +15,10 @@
# limitations under the License.

from collections import defaultdict
from copy import deepcopy
from slugify import slugify
import logging as log
import re
import yaml

try:
@@ -25,18 +27,49 @@
from yaml import Loader

from .utils import is_newstyle_id, deconstruct_anthology_id
from anthology.data import VENUE_FORMAT


class VenueIndex:
def __init__(self, srcdir=None):
self.venues, self.letters, self.joint_map = {}, {}, defaultdict(list)
self.acronyms_by_key = {}
self.venue_dict = None
if srcdir is not None:
self.load_from_dir(srcdir)

@staticmethod
def get_slug(acronym):
"""The acronym can contain a hyphen, whereas the slug must match VENUE_FORMAT."""
slug = slugify(acronym.replace("-", ""))
assert (
re.match(VENUE_FORMAT, slug) is not None
), f"Proposed slug '{slug}' of venue '{acronym}' doesn't match {VENUE_FORMAT}"
return slug

def add_venue(self, acronym, title, is_acl=False, url=None):
"""
Adds a new venue.
"""
slug = VenueIndex.get_slug(acronym)

self.venue_dict[slug] = {"acronym": acronym, "name": title}
if is_acl:
self.venue_dict[slug]["is_acl"] = True
if url is not None:
self.venue_dict[slug]["url"] = url

def dump(self, directory):
"""
Dumps the venue database to file.
"""
with open("{}/yaml/venues.yaml".format(directory), "wt") as f:
print(yaml.dump(self.venue_dict, allow_unicode=True), file=f)

def load_from_dir(self, directory):
with open("{}/yaml/venues.yaml".format(directory), "r") as f:
venue_dict = yaml.load(f, Loader=Loader)
self.venue_dict = yaml.load(f, Loader=Loader)
venue_dict = deepcopy(self.venue_dict)
for key, val in venue_dict.items():
if "acronym" not in val:
log.critical(f"Venues must have 'acronym' - none defined for '{key}'")
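
A minimal sketch of how the new VenueIndex methods fit together (the data path and the example venue are illustrative assumptions, not part of the commit):

    from anthology.venues import VenueIndex

    venues = VenueIndex(srcdir="data")              # loads data/yaml/venues.yaml
    slug = VenueIndex.get_slug("Blackbox-NLP")      # hyphen stripped -> "blackboxnlp"
    venues.add_venue(
        "BlackboxNLP",
        "Workshop on analyzing and interpreting neural networks for NLP",
        is_acl=True,
    )
    venues.dump("data")                             # rewrites data/yaml/venues.yaml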
6 changes: 4 additions & 2 deletions bin/change_authors.py
File mode changed: 100644 → 100755 (made executable)
@@ -1,8 +1,10 @@
#!/usr/bin/env python3

"""Apply changes to author names.
usage: change_authors.py <xml-file>+ -o <out-dir>
usage: change_authors.py -o <out-dir> <change-file>
Reads from stdin a list of changes (produced, e.g., by author_case.py)
Reads a list of changes (produced, e.g., by author_case.py)
in the following format:
paperid \t role \t oldfirst || oldlast \t newfirst || newlast
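
For reference, one line of the change file follows the tab-separated format documented above; a hypothetical entry (the paper ID is invented for illustration) would be:

    2020.acl-main.999	author	saghar || Hosseini	Saghar || Hosseini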
12 changes: 10 additions & 2 deletions bin/find_name_variants.py
@@ -4,6 +4,9 @@
"""Usage: find_name_variants.py [--importdir=DIR]
Heuristically try to find variants of names not yet covered by name_variants.yaml
Finds names that slugify to the same thing. Handles missing accents and
mistakes in first/last split, but not things like Tom/Thomas. Prints output
that can be pasted into name_variants.yaml.
Options:
--importdir=DIR Directory to import XML files from. [default: {scriptdir}/../data/]
@@ -45,7 +48,8 @@ def to_dict(pn):
def main(anthology):
variants = defaultdict(list)
slugs = {}
for name in anthology.people.names():
for person in anthology.people.personids():
name = anthology.people.get_canonical_name(person)
name_slug = slugify(repr(name))
if name_slug in slugs:
variants[slugs[name_slug]].append(repr(name))
@@ -69,7 +73,11 @@ def main(anthology):
}
)

print(yaml.dump(canonical_variants, allow_unicode=True))
canonical_variants.sort(
key=lambda x: (x["canonical"]["last"], x["canonical"]["first"])
)
# flow style to match format of file name_variants.yaml
print(yaml.dump(canonical_variants, allow_unicode=True, default_flow_style=None))


if __name__ == "__main__":
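
The core idea of the variant detection, as a standalone sketch (not part of the commit): spellings that differ only in accents or casing collapse to the same slug, which is how candidate variants get paired.

    from slugify import slugify

    # Accented and unaccented spellings of one name map to the same slug.
    assert slugify("José García") == slugify("Jose Garcia") == "jose-garcia"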
58 changes: 43 additions & 15 deletions bin/ingest.py
@@ -58,6 +58,8 @@
from itertools import chain
from typing import Dict, Any

from slugify import slugify


def log(text: str, fake: bool = False):
message = "[DRY RUN] " if fake else ""
@@ -106,7 +108,11 @@ def bib2xml(bibfilename, anthology_id):
'language',
]

collection_id, volume_name, paper_no = deconstruct_anthology_id(anthology_id)
try:
collection_id, volume_name, paper_no = deconstruct_anthology_id(anthology_id)
except ValueError:
print(f"Couldn't split {anthology_id}", file=sys.stderr)
sys.exit(1)
if paper_no == '':
return # skip the master bib file; we only process the individual files

@@ -166,25 +172,23 @@ def main(args):
volumes = {}

anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
venue_keys = [
venue["slug"].lower() for _, venue in VenueIndex(srcdir=anthology_datadir).items()
]
venue_index = VenueIndex(srcdir=anthology_datadir)
venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()]

# Build list of volumes, confirm uniqueness
unseen_venues = []
for proceedings in args.proceedings:
meta = read_meta(os.path.join(proceedings, "meta"))

venue_name = meta["abbrev"].lower()
venue_abbrev = meta["abbrev"]
venue_slug = venue_index.get_slug(venue_abbrev)

if venue_name not in venue_keys:
unseen_venues.append(meta["abbrev"])
if venue_slug not in venue_keys:
unseen_venues.append((venue_slug, venue_abbrev, meta["title"]))

meta["path"] = proceedings

meta["collection_id"] = collection_id = (
meta["year"] + "." + meta["abbrev"].lower()
)
meta["collection_id"] = collection_id = meta["year"] + "." + venue_slug
volume_name = meta["volume"].lower()
volume_full_id = f"{collection_id}-{volume_name}"

@@ -196,11 +200,11 @@

# Make sure all venues exist
if len(unseen_venues) > 0:
print("FATAL: The following venue(s) don't exist in venues.yaml")
for venue in unseen_venues:
print(f"- {venue}")
print("Please create entries for them and re-ingest.")
sys.exit(1)
slug, abbrev, title = venue
print(f"Creating venue '{abbrev}' ({title})")
venue_index.add_venue(abbrev, title)
venue_index.dump(directory=anthology_datadir)

# Copy over the PDFs and attachments
for volume, meta in volumes.items():
@@ -229,6 +233,10 @@ def main(args):
# copy the paper PDFs
pdf_src_dir = os.path.join(root_path, "pdf")
for pdf_file in os.listdir(pdf_src_dir):
# Skip . files
if os.path.basename(pdf_file).startswith("."):
continue

# names are {abbrev}{number}.pdf
match = re.match(rf".*\.(\d+)\.pdf", pdf_file)

@@ -262,6 +270,8 @@ def main(args):
if not os.path.exists(attachments_dest_dir):
os.makedirs(attachments_dest_dir)
for attachment_file in os.listdir(os.path.join(root_path, "additional")):
if os.path.basename(attachment_file).startswith("."):
continue
attachment_file_path = os.path.join(
root_path, "additional", attachment_file
)
@@ -290,6 +300,22 @@ def main(args):

people = AnthologyIndex(None, srcdir=anthology_datadir)

def correct_caps(person, name_node, anth_id):
"""
Many people submit their names in "ALL CAPS" or "all lowercase".
Correct this with heuristics.
"""
name = name_node.text
if name.islower() or name.isupper():
# capitalize all parts
corrected = " ".join(list(map(lambda x: x.capitalize(), name.split())))
choice = input(
f"({anth_id}): Author '{person}': Change '{name}' to '{corrected}'?\n(Return for yes, any text for no): "
)
if choice == "":
print(f"-> Correcting {name} to {corrected}")
name_node.text = corrected

def disambiguate_name(node, anth_id):
name = PersonName.from_element(node)
ids = people.get_ids(name)
@@ -339,7 +365,6 @@ def disambiguate_name(node, anth_id):
paper_id_full = paper["anthology_id"]
bibfile = paper["bib"]
paper_node = bib2xml(bibfile, paper_id_full)
# print(etree.tostring(paper_node, pretty_print=True))

if paper_node.attrib["id"] == "0":
# create metadata subtree
@@ -406,6 +431,9 @@ def disambiguate_name(node, anth_id):
paper_node.findall("./author"), paper_node.findall("./editor")
):
disambiguate_name(name_node, paper_id_full)
person = PersonName.from_element(name_node)
for name_part in name_node:
correct_caps(person, name_part, paper_id_full)

# Other data from the meta file
if "isbn" in meta:
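
The capitalization heuristic added to ingest.py, shown standalone on an invented input (no XML node or interactive prompt involved):

    name = "saghar hosseini"          # all-lowercase, as sometimes submitted
    if name.islower() or name.isupper():
        corrected = " ".join(part.capitalize() for part in name.split())
        # corrected == "Saghar Hosseini"; ingest.py asks for confirmation before applying it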
2 changes: 1 addition & 1 deletion data/xml/1993.eamt.xml
@@ -15,7 +15,7 @@
<title>Introduction</title>
<author><first>Petra</first><last>Steffens</last></author>
<pages>1-18</pages>
<abstract></abstract>
<abstract/>
</paper>
<paper id="2">
<title>Knowledge extraction from machine-readable dictionaries: an evaluation</title>
2 changes: 1 addition & 1 deletion data/xml/1995.iwpt.xml
@@ -201,7 +201,7 @@
</paper>
<paper id="23">
<title>A Formalism and a Parser for Lexicalised Dependency Grammars</title>
<author><first>Alexis</first><last>NASR</last></author>
<author><first>Alexis</first><last>Nasr</last></author>
<pages>186-195</pages>
<url hash="56f890e6">1995.iwpt-1.23</url>
<abstract/>
2 changes: 1 addition & 1 deletion data/xml/2010.eamt.xml
@@ -120,7 +120,7 @@
<title>Integration of statistical collocation segmentations in a phrase-based statistical machine translation system</title>
<author><first>Marta R.</first><last>Costa-jussa</last></author>
<author><first>Vidas</first><last>Daudaravicius</last></author>
<author><first>Rafael</first><last>E.Banchs</last></author>
<author><first>Rafael E.</first><last>Banchs</last></author>
<url hash="14e27cae">2010.eamt-1.17</url>
</paper>
<paper id="18">
2 changes: 1 addition & 1 deletion data/xml/2014.eamt.xml
@@ -247,7 +247,7 @@
<paper id="35">
<title>Collaborative web <fixed-case>UI</fixed-case> localization, or how to build feature-rich multilingual datasets</title>
<author><first>Vicent</first><last>Alabau</last></author>
<author><first>Luis</first><last>A.Leiva</last></author>
<author><first>Luis A.</first><last>Leiva</last></author>
<pages>151–154</pages>
<url hash="9e9af54e">2014.eamt-1.35</url>
</paper>
12 changes: 6 additions & 6 deletions data/xml/2020.acl.xml
@@ -2334,7 +2334,7 @@
<paper id="187">
<title>Speak to your Parser: Interactive Text-to-<fixed-case>SQL</fixed-case> with Natural Language Feedback</title>
<author><first>Ahmed</first><last>Elgohary</last></author>
<author><first>saghar</first><last>Hosseini</last></author>
<author><first>Saghar</first><last>Hosseini</last></author>
<author><first>Ahmed</first><last>Hassan Awadallah</last></author>
<pages>2065–2077</pages>
<abstract>We study the task of semantic parse correction with natural language feedback. Given a natural language utterance, most semantic parsing systems pose the problem as one-shot translation where the utterance is mapped to a corresponding logical form. In this paper, we investigate a more interactive scenario where humans can further interact with the system by providing free-form natural language feedback to correct the system when it generates an inaccurate interpretation of an initial utterance. We focus on natural language to SQL systems and construct, SPLASH, a dataset of utterances, incorrect SQL interpretations and the corresponding natural language feedback. We compare various reference models for the correction task and show that incorporating such a rich form of feedback can significantly improve the overall semantic parsing accuracy while retaining the flexibility of natural language interaction. While we estimated human correction accuracy is 81.5%, our best model achieves only 25.1%, which leaves a large gap for improvement in future research. SPLASH is publicly available at https://aka.ms/Splash_dataset.</abstract>
@@ -2439,7 +2439,7 @@
</paper>
<paper id="196">
<title>On Importance Sampling-Based Evaluation of Latent Language Models</title>
<author><first>Robert L</first><last>Logan IV</last></author>
<author><first>Robert L.</first><last>Logan IV</last></author>
<author><first>Matt</first><last>Gardner</last></author>
<author><first>Sameer</first><last>Singh</last></author>
<pages>2171–2176</pages>
@@ -3235,7 +3235,7 @@
<title>Gender Bias in Multilingual Embeddings and Cross-Lingual Transfer</title>
<author><first>Jieyu</first><last>Zhao</last></author>
<author><first>Subhabrata</first><last>Mukherjee</last></author>
<author><first>saghar</first><last>Hosseini</last></author>
<author><first>Saghar</first><last>Hosseini</last></author>
<author><first>Kai-Wei</first><last>Chang</last></author>
<author><first>Ahmed</first><last>Hassan Awadallah</last></author>
<pages>2896–2907</pages>
@@ -3968,7 +3968,7 @@
</paper>
<paper id="319">
<title>A Reinforced Generation of Adversarial Examples for Neural Machine Translation</title>
<author><first>wei</first><last>zou</last></author>
<author><first>Wei</first><last>Zou</last></author>
<author><first>Shujian</first><last>Huang</last></author>
<author><first>Jun</first><last>Xie</last></author>
<author><first>Xinyu</first><last>Dai</last></author>
@@ -4014,7 +4014,7 @@
<author><first>Tong</first><last>Xiao</last></author>
<author><first>Jingbo</first><last>Zhu</last></author>
<author><first>Tongran</first><last>Liu</last></author>
<author><first>changliang</first><last>Li</last></author>
<author><first>Changliang</first><last>Li</last></author>
<pages>3512–3518</pages>
<abstract>In encoder-decoder neural models, multiple encoders are in general used to represent the contextual information in addition to the individual sentence. In this paper, we investigate multi-encoder approaches in document-level neural machine translation (NMT). Surprisingly, we find that the context encoder does not only encode the surrounding sentences but also behaves as a noise generator. This makes us rethink the real benefits of multi-encoder in context-aware translation - some of the improvements come from robust training. We compare several methods that introduce noise and/or well-tuned dropout setup into the training of these encoders. Experimental results show that noisy training plays an important role in multi-encoder-based NMT, especially when the training data is small. Also, we establish a new state-of-the-art on IWSLT Fr-En task by careful use of noise generation and dropout methods.</abstract>
<url hash="49515b79">2020.acl-main.322</url>
@@ -9831,7 +9831,7 @@
<author><first>Mandy</first><last>Guo</last></author>
<author><first>Jax</first><last>Law</last></author>
<author><first>Noah</first><last>Constant</last></author>
<author><first>Gustavo Hernandez</first><last>Abrego</last></author>
<author><first>Gustavo</first><last>Hernandez Abrego</last></author>
<author><first>Steve</first><last>Yuan</last></author>
<author><first>Chris</first><last>Tar</last></author>
<author><first>Yun-hsuan</first><last>Sung</last></author>