Ingesting 2020 EMNLP, Findings, workshops, and CoNLL (#1045)
* Ingested EMNLP
* Ingested Findings:EMNLP 2020
* Ingested CoNLL and its shared task
* Added EMNLP workshops
* Improvements and changes to ingest.py
   * Corrected capitalization of names (via a fix to ingest.py)
   * Automatically add missing venues (with a note)
* Revision to W19-2111 (closes #1021)
* Normalized venue database; added add_venue.py
* find_name_variants.py
* author_case.py

Co-authored-by: David Chiang <[email protected]>
Co-authored-by: Daniel Gildea <gildea>
mjpost and davidweichiang authored Nov 9, 2020
1 parent 4aaef47 commit ab92b62
Showing 57 changed files with 18,254 additions and 176 deletions.
54 changes: 54 additions & 0 deletions bin/add_venue.py
@@ -0,0 +1,54 @@
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2020 Matt Post <[email protected]>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Adds a venue to the data/yaml/venues.yaml file.
Usage:
add_venue.py acronym "name" [--url URL] [--acl]
"""

import argparse
import os
import sys

from slugify import slugify

from anthology.venues import VenueIndex


def main(args):
datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
venues = VenueIndex(srcdir=datadir)

print(f"Adding '{args.acronym}' ({args.name})")
venues.add_venue(args.acronym, args.name, is_acl=args.acl, url=args.url)

venues.dump(datadir)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("acronym", help="Venue acronym (e.g., BlackboxNLP)")
parser.add_argument(
"name",
help="Venue name (e.g., Workshop on analyzing and interpreting neural networks for NLP)",
)
parser.add_argument("--acl", action="store_true", help="Venue is an ACL venue")
parser.add_argument("--url", help="Venue URL")
args = parser.parse_args()

main(args)
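
For context (not part of the diff): a hypothetical invocation of the new script, reusing the example venue from its own help text, would be

    bin/add_venue.py BlackboxNLP "Workshop on analyzing and interpreting neural networks for NLP" --acl

which records the venue in data/yaml/venues.yaml via VenueIndex.add_venue and VenueIndex.dump.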
3 changes: 3 additions & 0 deletions bin/anthology/data.py
@@ -45,6 +45,9 @@
# Default ingestion date (= unknown)
UNKNOWN_INGEST_DATE = "1900-01-01"

# The venue format must match this pattern
VENUE_FORMAT = r"^[A-Za-z\d]+$"


def get_journal_title(top_level_id, volume_title):
# TODO: consider moving this from code to data (perhaps
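
As a standalone illustration of the new constraint (not part of the commit): venue slugs must be purely alphanumeric, so hyphenated acronyms have to be collapsed before being used as keys.

    import re

    VENUE_FORMAT = r"^[A-Za-z\d]+$"

    assert re.match(VENUE_FORMAT, "blackboxnlp")             # alphanumeric slug: accepted
    assert re.match(VENUE_FORMAT, "conll2020")               # digits are fine
    assert re.match(VENUE_FORMAT, "blackbox-nlp") is None    # hyphens are rejected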
35 changes: 34 additions & 1 deletion bin/anthology/venues.py
@@ -15,8 +15,10 @@
# limitations under the License.

from collections import defaultdict
from copy import deepcopy
from slugify import slugify
import logging as log
import re
import yaml

try:
@@ -25,18 +27,49 @@
from yaml import Loader

from .utils import is_newstyle_id, deconstruct_anthology_id
from anthology.data import VENUE_FORMAT


class VenueIndex:
def __init__(self, srcdir=None):
self.venues, self.letters, self.joint_map = {}, {}, defaultdict(list)
self.acronyms_by_key = {}
self.venue_dict = None
if srcdir is not None:
self.load_from_dir(srcdir)

@staticmethod
def get_slug(acronym):
"""The acronym can contain a hyphen, whereas the slug must match VENUE_FORMAT."""
slug = slugify(acronym.replace("-", ""))
assert (
re.match(VENUE_FORMAT, slug) is not None
), f"Proposed slug '{slug}' of venue '{acronym}' doesn't match {VENUE_FORMAT}"
return slug

def add_venue(self, acronym, title, is_acl=False, url=None):
"""
Adds a new venue.
"""
slug = VenueIndex.get_slug(acronym)

self.venue_dict[slug] = {"acronym": acronym, "name": title}
if is_acl:
self.venue_dict[slug]["is_acl"] = True
if url is not None:
self.venue_dict[slug]["url"] = url

def dump(self, directory):
"""
Dumps the venue database to file.
"""
with open("{}/yaml/venues.yaml".format(directory), "wt") as f:
print(yaml.dump(self.venue_dict, allow_unicode=True), file=f)

def load_from_dir(self, directory):
with open("{}/yaml/venues.yaml".format(directory), "r") as f:
venue_dict = yaml.load(f, Loader=Loader)
self.venue_dict = yaml.load(f, Loader=Loader)
venue_dict = deepcopy(self.venue_dict)
for key, val in venue_dict.items():
if "acronym" not in val:
log.critical(f"Venues must have 'acronym' - none defined for '{key}'")
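
A minimal sketch of how the new VenueIndex methods fit together (the data path and the example venue are illustrative assumptions, not part of the commit):

    from anthology.venues import VenueIndex

    venues = VenueIndex(srcdir="data")              # loads data/yaml/venues.yaml
    slug = VenueIndex.get_slug("Blackbox-NLP")      # hyphen stripped -> "blackboxnlp"
    venues.add_venue(
        "BlackboxNLP",
        "Workshop on analyzing and interpreting neural networks for NLP",
        is_acl=True,
    )
    venues.dump("data")                             # rewrites data/yaml/venues.yaml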
6 changes: 4 additions & 2 deletions bin/change_authors.py
File mode changed: 100644 → 100755 (made executable)
@@ -1,8 +1,10 @@
#!/usr/bin/env python3

"""Apply changes to author names.
usage: change_authors.py <xml-file>+ -o <out-dir>
usage: change_authors.py -o <out-dir> <change-file>
Reads from stdin a list of changes (produced, e.g., by author_case.py)
Reads a list of changes (produced, e.g., by author_case.py)
in the following format:
paperid \t role \t oldfirst || oldlast \t newfirst || newlast
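
For reference, one line of the change file follows the tab-separated format documented above; a hypothetical entry (the paper ID is invented for illustration) would be:

    2020.acl-main.999	author	saghar || Hosseini	Saghar || Hosseini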
12 changes: 10 additions & 2 deletions bin/find_name_variants.py
@@ -4,6 +4,9 @@
"""Usage: find_name_variants.py [--importdir=DIR]
Heuristically try to find variants of names not yet covered by name_variants.yaml
Finds names that slugify to the same thing. Handles missing accents and
mistakes in first/last split, but not things like Tom/Thomas. Prints output
that can be pasted into name_variants.yaml.
Options:
--importdir=DIR Directory to import XML files from. [default: {scriptdir}/../data/]
@@ -45,7 +48,8 @@ def to_dict(pn):
def main(anthology):
variants = defaultdict(list)
slugs = {}
for name in anthology.people.names():
for person in anthology.people.personids():
name = anthology.people.get_canonical_name(person)
name_slug = slugify(repr(name))
if name_slug in slugs:
variants[slugs[name_slug]].append(repr(name))
@@ -69,7 +73,11 @@ def main(anthology):
}
)

print(yaml.dump(canonical_variants, allow_unicode=True))
canonical_variants.sort(
key=lambda x: (x["canonical"]["last"], x["canonical"]["first"])
)
# flow style to match format of file name_variants.yaml
print(yaml.dump(canonical_variants, allow_unicode=True, default_flow_style=None))


if __name__ == "__main__":
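
The core idea of the variant detection, as a standalone sketch (not part of the commit): spellings that differ only in accents or casing collapse to the same slug, which is how candidate variants get paired.

    from slugify import slugify

    # Accented and unaccented spellings of one name map to the same slug.
    assert slugify("José García") == slugify("Jose Garcia") == "jose-garcia"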
58 changes: 43 additions & 15 deletions bin/ingest.py
@@ -58,6 +58,8 @@
from itertools import chain
from typing import Dict, Any

from slugify import slugify


def log(text: str, fake: bool = False):
message = "[DRY RUN] " if fake else ""
@@ -106,7 +108,11 @@ def bib2xml(bibfilename, anthology_id):
'language',
]

collection_id, volume_name, paper_no = deconstruct_anthology_id(anthology_id)
try:
collection_id, volume_name, paper_no = deconstruct_anthology_id(anthology_id)
except ValueError:
print(f"Couldn't split {anthology_id}", file=sys.stderr)
sys.exit(1)
if paper_no == '':
return # skip the master bib file; we only process the individual files

@@ -166,25 +172,23 @@ def main(args):
volumes = {}

anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
venue_keys = [
venue["slug"].lower() for _, venue in VenueIndex(srcdir=anthology_datadir).items()
]
venue_index = VenueIndex(srcdir=anthology_datadir)
venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()]

# Build list of volumes, confirm uniqueness
unseen_venues = []
for proceedings in args.proceedings:
meta = read_meta(os.path.join(proceedings, "meta"))

venue_name = meta["abbrev"].lower()
venue_abbrev = meta["abbrev"]
venue_slug = venue_index.get_slug(venue_abbrev)

if venue_name not in venue_keys:
unseen_venues.append(meta["abbrev"])
if venue_slug not in venue_keys:
unseen_venues.append((venue_slug, venue_abbrev, meta["title"]))

meta["path"] = proceedings

meta["collection_id"] = collection_id = (
meta["year"] + "." + meta["abbrev"].lower()
)
meta["collection_id"] = collection_id = meta["year"] + "." + venue_slug
volume_name = meta["volume"].lower()
volume_full_id = f"{collection_id}-{volume_name}"

@@ -196,11 +200,11 @@

# Make sure all venues exist
if len(unseen_venues) > 0:
print("FATAL: The following venue(s) don't exist in venues.yaml")
for venue in unseen_venues:
print(f"- {venue}")
print("Please create entries for them and re-ingest.")
sys.exit(1)
slug, abbrev, title = venue
print(f"Creating venue '{abbrev}' ({title})")
venue_index.add_venue(abbrev, title)
venue_index.dump(directory=anthology_datadir)

# Copy over the PDFs and attachments
for volume, meta in volumes.items():
@@ -229,6 +233,10 @@ def main(args):
# copy the paper PDFs
pdf_src_dir = os.path.join(root_path, "pdf")
for pdf_file in os.listdir(pdf_src_dir):
# Skip . files
if os.path.basename(pdf_file).startswith("."):
continue

# names are {abbrev}{number}.pdf
match = re.match(rf".*\.(\d+)\.pdf", pdf_file)

@@ -262,6 +270,8 @@ def main(args):
if not os.path.exists(attachments_dest_dir):
os.makedirs(attachments_dest_dir)
for attachment_file in os.listdir(os.path.join(root_path, "additional")):
if os.path.basename(attachment_file).startswith("."):
continue
attachment_file_path = os.path.join(
root_path, "additional", attachment_file
)
@@ -290,6 +300,22 @@ def main(args):

people = AnthologyIndex(None, srcdir=anthology_datadir)

def correct_caps(person, name_node, anth_id):
"""
Many people submit their names in "ALL CAPS" or "all lowercase".
Correct this with heuristics.
"""
name = name_node.text
if name.islower() or name.isupper():
# capitalize all parts
corrected = " ".join(list(map(lambda x: x.capitalize(), name.split())))
choice = input(
f"({anth_id}): Author '{person}': Change '{name}' to '{corrected}'?\n(Return for yes, any text for no): "
)
if choice == "":
print(f"-> Correcting {name} to {corrected}")
name_node.text = corrected

def disambiguate_name(node, anth_id):
name = PersonName.from_element(node)
ids = people.get_ids(name)
@@ -339,7 +365,6 @@ def disambiguate_name(node, anth_id):
paper_id_full = paper["anthology_id"]
bibfile = paper["bib"]
paper_node = bib2xml(bibfile, paper_id_full)
# print(etree.tostring(paper_node, pretty_print=True))

if paper_node.attrib["id"] == "0":
# create metadata subtree
@@ -406,6 +431,9 @@ def disambiguate_name(node, anth_id):
paper_node.findall("./author"), paper_node.findall("./editor")
):
disambiguate_name(name_node, paper_id_full)
person = PersonName.from_element(name_node)
for name_part in name_node:
correct_caps(person, name_part, paper_id_full)

# Other data from the meta file
if "isbn" in meta:
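
The capitalization heuristic added to ingest.py, shown standalone on an invented input (no XML node or interactive prompt involved):

    name = "saghar hosseini"          # all-lowercase, as sometimes submitted
    if name.islower() or name.isupper():
        corrected = " ".join(part.capitalize() for part in name.split())
        # corrected == "Saghar Hosseini"; ingest.py asks for confirmation before applying it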
2 changes: 1 addition & 1 deletion data/xml/1993.eamt.xml
@@ -15,7 +15,7 @@
<title>Introduction</title>
<author><first>Petra</first><last>Steffens</last></author>
<pages>1-18</pages>
<abstract></abstract>
<abstract/>
</paper>
<paper id="2">
<title>Knowledge extraction from machine-readable dictionaries: an evaluation</title>
2 changes: 1 addition & 1 deletion data/xml/1995.iwpt.xml
@@ -201,7 +201,7 @@
</paper>
<paper id="23">
<title>A Formalism and a Parser for Lexicalised Dependency Grammars</title>
<author><first>Alexis</first><last>NASR</last></author>
<author><first>Alexis</first><last>Nasr</last></author>
<pages>186-195</pages>
<url hash="56f890e6">1995.iwpt-1.23</url>
<abstract/>
2 changes: 1 addition & 1 deletion data/xml/2010.eamt.xml
@@ -120,7 +120,7 @@
<title>Integration of statistical collocation segmentations in a phrase-based statistical machine translation system</title>
<author><first>Marta R.</first><last>Costa-jussa</last></author>
<author><first>Vidas</first><last>Daudaravicius</last></author>
<author><first>Rafael</first><last>E.Banchs</last></author>
<author><first>Rafael E.</first><last>Banchs</last></author>
<url hash="14e27cae">2010.eamt-1.17</url>
</paper>
<paper id="18">
2 changes: 1 addition & 1 deletion data/xml/2014.eamt.xml
@@ -247,7 +247,7 @@
<paper id="35">
<title>Collaborative web <fixed-case>UI</fixed-case> localization, or how to build feature-rich multilingual datasets</title>
<author><first>Vicent</first><last>Alabau</last></author>
<author><first>Luis</first><last>A.Leiva</last></author>
<author><first>Luis A.</first><last>Leiva</last></author>
<pages>151–154</pages>
<url hash="9e9af54e">2014.eamt-1.35</url>
</paper>
12 changes: 6 additions & 6 deletions data/xml/2020.acl.xml
@@ -2334,7 +2334,7 @@
<paper id="187">
<title>Speak to your Parser: Interactive Text-to-<fixed-case>SQL</fixed-case> with Natural Language Feedback</title>
<author><first>Ahmed</first><last>Elgohary</last></author>
<author><first>saghar</first><last>Hosseini</last></author>
<author><first>Saghar</first><last>Hosseini</last></author>
<author><first>Ahmed</first><last>Hassan Awadallah</last></author>
<pages>2065–2077</pages>
<abstract>We study the task of semantic parse correction with natural language feedback. Given a natural language utterance, most semantic parsing systems pose the problem as one-shot translation where the utterance is mapped to a corresponding logical form. In this paper, we investigate a more interactive scenario where humans can further interact with the system by providing free-form natural language feedback to correct the system when it generates an inaccurate interpretation of an initial utterance. We focus on natural language to SQL systems and construct, SPLASH, a dataset of utterances, incorrect SQL interpretations and the corresponding natural language feedback. We compare various reference models for the correction task and show that incorporating such a rich form of feedback can significantly improve the overall semantic parsing accuracy while retaining the flexibility of natural language interaction. While we estimated human correction accuracy is 81.5%, our best model achieves only 25.1%, which leaves a large gap for improvement in future research. SPLASH is publicly available at https://aka.ms/Splash_dataset.</abstract>
@@ -2439,7 +2439,7 @@
</paper>
<paper id="196">
<title>On Importance Sampling-Based Evaluation of Latent Language Models</title>
<author><first>Robert L</first><last>Logan IV</last></author>
<author><first>Robert L.</first><last>Logan IV</last></author>
<author><first>Matt</first><last>Gardner</last></author>
<author><first>Sameer</first><last>Singh</last></author>
<pages>2171–2176</pages>
@@ -3235,7 +3235,7 @@
<title>Gender Bias in Multilingual Embeddings and Cross-Lingual Transfer</title>
<author><first>Jieyu</first><last>Zhao</last></author>
<author><first>Subhabrata</first><last>Mukherjee</last></author>
<author><first>saghar</first><last>Hosseini</last></author>
<author><first>Saghar</first><last>Hosseini</last></author>
<author><first>Kai-Wei</first><last>Chang</last></author>
<author><first>Ahmed</first><last>Hassan Awadallah</last></author>
<pages>2896–2907</pages>
@@ -3968,7 +3968,7 @@
</paper>
<paper id="319">
<title>A Reinforced Generation of Adversarial Examples for Neural Machine Translation</title>
<author><first>wei</first><last>zou</last></author>
<author><first>Wei</first><last>Zou</last></author>
<author><first>Shujian</first><last>Huang</last></author>
<author><first>Jun</first><last>Xie</last></author>
<author><first>Xinyu</first><last>Dai</last></author>
@@ -4014,7 +4014,7 @@
<author><first>Tong</first><last>Xiao</last></author>
<author><first>Jingbo</first><last>Zhu</last></author>
<author><first>Tongran</first><last>Liu</last></author>
<author><first>changliang</first><last>Li</last></author>
<author><first>Changliang</first><last>Li</last></author>
<pages>3512–3518</pages>
<abstract>In encoder-decoder neural models, multiple encoders are in general used to represent the contextual information in addition to the individual sentence. In this paper, we investigate multi-encoder approaches in document-level neural machine translation (NMT). Surprisingly, we find that the context encoder does not only encode the surrounding sentences but also behaves as a noise generator. This makes us rethink the real benefits of multi-encoder in context-aware translation - some of the improvements come from robust training. We compare several methods that introduce noise and/or well-tuned dropout setup into the training of these encoders. Experimental results show that noisy training plays an important role in multi-encoder-based NMT, especially when the training data is small. Also, we establish a new state-of-the-art on IWSLT Fr-En task by careful use of noise generation and dropout methods.</abstract>
<url hash="49515b79">2020.acl-main.322</url>
@@ -9831,7 +9831,7 @@
<author><first>Mandy</first><last>Guo</last></author>
<author><first>Jax</first><last>Law</last></author>
<author><first>Noah</first><last>Constant</last></author>
<author><first>Gustavo Hernandez</first><last>Abrego</last></author>
<author><first>Gustavo</first><last>Hernandez Abrego</last></author>
<author><first>Steve</first><last>Yuan</last></author>
<author><first>Chris</first><last>Tar</last></author>
<author><first>Yun-hsuan</first><last>Sung</last></author>