diff --git a/bin/add_venue.py b/bin/add_venue.py new file mode 100755 index 0000000000..fdc4a991f4 --- /dev/null +++ b/bin/add_venue.py @@ -0,0 +1,54 @@ +#! /usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Copyright 2020 Matt Post +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Adds a venue to the data/yaml/venues.yaml file. +Usage: + + add_venue.py acronym "name" [--url URL] [--acl] +""" + +import argparse +import os +import sys + +from slugify import slugify + +from anthology.venues import VenueIndex + + +def main(args): + datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data") + venues = VenueIndex(srcdir=datadir) + + print(f"Adding '{args.acronym}' ({args.name})") + venues.add_venue(args.acronym, args.name, is_acl=args.acl, url=args.url) + + venues.dump(datadir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("acronym", help="Venue acronym (e.g., BlackboxNLP)") + parser.add_argument( + "name", + help="Venue name (e.g., Workshop on analyzing and interpreting neural networks for NLP)", + ) + parser.add_argument("--acl", action="store_true", help="Venue is an ACL venue") + parser.add_argument("--url", help="Venue URL") + args = parser.parse_args() + + main(args) diff --git a/bin/anthology/data.py b/bin/anthology/data.py index fe8607c9c3..9671bc0e81 100644 --- a/bin/anthology/data.py +++ b/bin/anthology/data.py @@ -45,6 +45,9 @@ # Default ingestion date (= unknown) UNKNOWN_INGEST_DATE = "1900-01-01" +# The 
venue format must match this pattern +VENUE_FORMAT = r"^[A-Za-z\d]+$" + def get_journal_title(top_level_id, volume_title): # TODO: consider moving this from code to data (perhaps diff --git a/bin/anthology/venues.py b/bin/anthology/venues.py index ed8f755240..59284bc821 100644 --- a/bin/anthology/venues.py +++ b/bin/anthology/venues.py @@ -15,8 +15,10 @@ # limitations under the License. from collections import defaultdict +from copy import deepcopy from slugify import slugify import logging as log +import re import yaml try: @@ -25,18 +27,49 @@ from yaml import Loader from .utils import is_newstyle_id, deconstruct_anthology_id +from anthology.data import VENUE_FORMAT class VenueIndex: def __init__(self, srcdir=None): self.venues, self.letters, self.joint_map = {}, {}, defaultdict(list) self.acronyms_by_key = {} + self.venue_dict = None if srcdir is not None: self.load_from_dir(srcdir) + @staticmethod + def get_slug(acronym): + """The acronym can contain a hyphen, whereas the slug must match VENUE_FORMAT.""" + slug = slugify(acronym.replace("-", "")) + assert ( + re.match(VENUE_FORMAT, slug) is not None + ), f"Proposed slug '{slug}' of venue '{acronym}' doesn't match {VENUE_FORMAT}" + return slug + + def add_venue(self, acronym, title, is_acl=False, url=None): + """ + Adds a new venue. + """ + slug = VenueIndex.get_slug(acronym) + + self.venue_dict[slug] = {"acronym": acronym, "name": title} + if is_acl: + self.venue_dict[slug]["is_acl"] = True + if url is not None: + self.venue_dict[slug]["url"] = url + + def dump(self, directory): + """ + Dumps the venue database to file. 
+ """ + with open("{}/yaml/venues.yaml".format(directory), "wt") as f: + print(yaml.dump(self.venue_dict, allow_unicode=True), file=f) + def load_from_dir(self, directory): with open("{}/yaml/venues.yaml".format(directory), "r") as f: - venue_dict = yaml.load(f, Loader=Loader) + self.venue_dict = yaml.load(f, Loader=Loader) + venue_dict = deepcopy(self.venue_dict) for key, val in venue_dict.items(): if "acronym" not in val: log.critical(f"Venues must have 'acronym' - none defined for '{key}'") diff --git a/bin/change_authors.py b/bin/change_authors.py old mode 100644 new mode 100755 index 9f7912df9c..25e9192d8f --- a/bin/change_authors.py +++ b/bin/change_authors.py @@ -1,8 +1,10 @@ +#!/usr/bin/env python3 + """Apply changes to author names. -usage: change_authors.py + -o +usage: change_authors.py -o -Reads from stdin a list of changes (produced, e.g., by author_case.py) +Reads a list of changes (produced, e.g., by author_case.py) in the following format: paperid \t role \t oldfirst || oldlast \t newfirst || newlast diff --git a/bin/find_name_variants.py b/bin/find_name_variants.py index 6a45e9b54b..cb832b0d3d 100755 --- a/bin/find_name_variants.py +++ b/bin/find_name_variants.py @@ -4,6 +4,9 @@ """Usage: find_name_variants.py [--importdir=DIR] Heuristically try to find variants of names not yet covered by name_variants.yaml +Finds names that slugify to the same thing. Handles missing accents and +mistakes in first/last split, but not things like Tom/Thomas. Prints output +that can be pasted into name_variants.yaml. Options: --importdir=DIR Directory to import XML files from. 
[default: {scriptdir}/../data/] @@ -45,7 +48,8 @@ def to_dict(pn): def main(anthology): variants = defaultdict(list) slugs = {} - for name in anthology.people.names(): + for person in anthology.people.personids(): + name = anthology.people.get_canonical_name(person) name_slug = slugify(repr(name)) if name_slug in slugs: variants[slugs[name_slug]].append(repr(name)) @@ -69,7 +73,11 @@ def main(anthology): } ) - print(yaml.dump(canonical_variants, allow_unicode=True)) + canonical_variants.sort( + key=lambda x: (x["canonical"]["last"], x["canonical"]["first"]) + ) + # flow style to match format of file name_variants.yaml + print(yaml.dump(canonical_variants, allow_unicode=True, default_flow_style=None)) if __name__ == "__main__": diff --git a/bin/ingest.py b/bin/ingest.py index 03ab8990aa..4db180c477 100755 --- a/bin/ingest.py +++ b/bin/ingest.py @@ -58,6 +58,8 @@ from itertools import chain from typing import Dict, Any +from slugify import slugify + def log(text: str, fake: bool = False): message = "[DRY RUN] " if fake else "" @@ -106,7 +108,11 @@ def bib2xml(bibfilename, anthology_id): 'language', ] - collection_id, volume_name, paper_no = deconstruct_anthology_id(anthology_id) + try: + collection_id, volume_name, paper_no = deconstruct_anthology_id(anthology_id) + except ValueError: + print(f"Couldn't split {anthology_id}", file=sys.stderr) + sys.exit(1) if paper_no == '': return # skip the master bib file; we only process the individual files @@ -166,25 +172,23 @@ def main(args): volumes = {} anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data") - venue_keys = [ - venue["slug"].lower() for _, venue in VenueIndex(srcdir=anthology_datadir).items() - ] + venue_index = VenueIndex(srcdir=anthology_datadir) + venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()] # Build list of volumes, confirm uniqueness unseen_venues = [] for proceedings in args.proceedings: meta = read_meta(os.path.join(proceedings, "meta")) - venue_name = 
meta["abbrev"].lower() + venue_abbrev = meta["abbrev"] + venue_slug = venue_index.get_slug(venue_abbrev) - if venue_name not in venue_keys: - unseen_venues.append(meta["abbrev"]) + if venue_slug not in venue_keys: + unseen_venues.append((venue_slug, venue_abbrev, meta["title"])) meta["path"] = proceedings - meta["collection_id"] = collection_id = ( - meta["year"] + "." + meta["abbrev"].lower() - ) + meta["collection_id"] = collection_id = meta["year"] + "." + venue_slug volume_name = meta["volume"].lower() volume_full_id = f"{collection_id}-{volume_name}" @@ -196,11 +200,11 @@ def main(args): # Make sure all venues exist if len(unseen_venues) > 0: - print("FATAL: The following venue(s) don't exist in venues.yaml") for venue in unseen_venues: - print(f"- {venue}") - print("Please create entries for them and re-ingest.") - sys.exit(1) + slug, abbrev, title = venue + print(f"Creating venue '{abbrev}' ({title})") + venue_index.add_venue(abbrev, title) + venue_index.dump(directory=anthology_datadir) # Copy over the PDFs and attachments for volume, meta in volumes.items(): @@ -229,6 +233,10 @@ def main(args): # copy the paper PDFs pdf_src_dir = os.path.join(root_path, "pdf") for pdf_file in os.listdir(pdf_src_dir): + # Skip . files + if os.path.basename(pdf_file).startswith("."): + continue + # names are {abbrev}{number}.pdf match = re.match(rf".*\.(\d+)\.pdf", pdf_file) @@ -262,6 +270,8 @@ def main(args): if not os.path.exists(attachments_dest_dir): os.makedirs(attachments_dest_dir) for attachment_file in os.listdir(os.path.join(root_path, "additional")): + if os.path.basename(attachment_file).startswith("."): + continue attachment_file_path = os.path.join( root_path, "additional", attachment_file ) @@ -290,6 +300,22 @@ def main(args): people = AnthologyIndex(None, srcdir=anthology_datadir) + def correct_caps(person, name_node, anth_id): + """ + Many people submit their names in "ALL CAPS" or "all lowercase". + Correct this with heuristics. 
+ """ + name = name_node.text + if name.islower() or name.isupper(): + # capitalize all parts + corrected = " ".join(list(map(lambda x: x.capitalize(), name.split()))) + choice = input( + f"({anth_id}): Author '{person}': Change '{name}' to '{corrected}'?\n(Return for yes, any text for no): " + ) + if choice == "": + print(f"-> Correcting {name} to {corrected}") + name_node.text = corrected + def disambiguate_name(node, anth_id): name = PersonName.from_element(node) ids = people.get_ids(name) @@ -339,7 +365,6 @@ def disambiguate_name(node, anth_id): paper_id_full = paper["anthology_id"] bibfile = paper["bib"] paper_node = bib2xml(bibfile, paper_id_full) - # print(etree.tostring(paper_node, pretty_print=True)) if paper_node.attrib["id"] == "0": # create metadata subtree @@ -406,6 +431,9 @@ def disambiguate_name(node, anth_id): paper_node.findall("./author"), paper_node.findall("./editor") ): disambiguate_name(name_node, paper_id_full) + person = PersonName.from_element(name_node) + for name_part in name_node: + correct_caps(person, name_part, paper_id_full) # Other data from the meta file if "isbn" in meta: diff --git a/data/xml/1993.eamt.xml b/data/xml/1993.eamt.xml index cd533a0ea2..aad8974865 100644 --- a/data/xml/1993.eamt.xml +++ b/data/xml/1993.eamt.xml @@ -15,7 +15,7 @@ Introduction PetraSteffens 1-18 - + Knowledge extraction from machine-readable dictionaries: an evaluation diff --git a/data/xml/1995.iwpt.xml b/data/xml/1995.iwpt.xml index 2d201d3ba0..f3c4bf1c89 100644 --- a/data/xml/1995.iwpt.xml +++ b/data/xml/1995.iwpt.xml @@ -201,7 +201,7 @@ A Formalism and a Parser for Lexicalised Dependency Grammars - AlexisNASR + AlexisNasr 186-195 1995.iwpt-1.23 diff --git a/data/xml/2010.eamt.xml b/data/xml/2010.eamt.xml index d17944c5d3..9976139fcf 100644 --- a/data/xml/2010.eamt.xml +++ b/data/xml/2010.eamt.xml @@ -120,7 +120,7 @@ Integration of statistical collocation segmentations in a phrase-based statistical machine translation system Marta R.Costa-jussa 
VidasDaudaravicius - RafaelE.Banchs + Rafael E.Banchs 2010.eamt-1.17 diff --git a/data/xml/2014.eamt.xml b/data/xml/2014.eamt.xml index 3d7a36fd51..55dcf053f7 100644 --- a/data/xml/2014.eamt.xml +++ b/data/xml/2014.eamt.xml @@ -247,7 +247,7 @@ Collaborative web <fixed-case>UI</fixed-case> localization, or how to build feature-rich multilingual datasets VicentAlabau - LuisA.Leiva + Luis A.Leiva 151–154 2014.eamt-1.35 diff --git a/data/xml/2020.acl.xml b/data/xml/2020.acl.xml index a0f9b25b67..a54fd325b4 100644 --- a/data/xml/2020.acl.xml +++ b/data/xml/2020.acl.xml @@ -2334,7 +2334,7 @@ Speak to your Parser: Interactive Text-to-<fixed-case>SQL</fixed-case> with Natural Language Feedback AhmedElgohary - sagharHosseini + SagharHosseini AhmedHassan Awadallah 2065–2077 We study the task of semantic parse correction with natural language feedback. Given a natural language utterance, most semantic parsing systems pose the problem as one-shot translation where the utterance is mapped to a corresponding logical form. In this paper, we investigate a more interactive scenario where humans can further interact with the system by providing free-form natural language feedback to correct the system when it generates an inaccurate interpretation of an initial utterance. We focus on natural language to SQL systems and construct, SPLASH, a dataset of utterances, incorrect SQL interpretations and the corresponding natural language feedback. We compare various reference models for the correction task and show that incorporating such a rich form of feedback can significantly improve the overall semantic parsing accuracy while retaining the flexibility of natural language interaction. While we estimated human correction accuracy is 81.5%, our best model achieves only 25.1%, which leaves a large gap for improvement in future research. SPLASH is publicly available at https://aka.ms/Splash_dataset. 
@@ -2439,7 +2439,7 @@ On Importance Sampling-Based Evaluation of Latent Language Models - Robert LLogan IV + Robert L.Logan IV MattGardner SameerSingh 2171–2176 @@ -3235,7 +3235,7 @@ Gender Bias in Multilingual Embeddings and Cross-Lingual Transfer JieyuZhao SubhabrataMukherjee - sagharHosseini + SagharHosseini Kai-WeiChang AhmedHassan Awadallah 2896–2907 @@ -3968,7 +3968,7 @@ A Reinforced Generation of Adversarial Examples for Neural Machine Translation - weizou + WeiZou ShujianHuang JunXie XinyuDai @@ -4014,7 +4014,7 @@ TongXiao JingboZhu TongranLiu - changliangLi + ChangliangLi 3512–3518 In encoder-decoder neural models, multiple encoders are in general used to represent the contextual information in addition to the individual sentence. In this paper, we investigate multi-encoder approaches in document-level neural machine translation (NMT). Surprisingly, we find that the context encoder does not only encode the surrounding sentences but also behaves as a noise generator. This makes us rethink the real benefits of multi-encoder in context-aware translation - some of the improvements come from robust training. We compare several methods that introduce noise and/or well-tuned dropout setup into the training of these encoders. Experimental results show that noisy training plays an important role in multi-encoder-based NMT, especially when the training data is small. Also, we establish a new state-of-the-art on IWSLT Fr-En task by careful use of noise generation and dropout methods. 
2020.acl-main.322 @@ -9831,7 +9831,7 @@ MandyGuo JaxLaw NoahConstant - Gustavo HernandezAbrego + GustavoHernandez Abrego SteveYuan ChrisTar Yun-hsuanSung diff --git a/data/xml/2020.alw.xml b/data/xml/2020.alw.xml new file mode 100644 index 0000000000..d9f6c4e118 --- /dev/null +++ b/data/xml/2020.alw.xml @@ -0,0 +1,244 @@ + + + + + Proceedings of the Fourth Workshop on Online Abuse and Harms + SeyiAkiwowo + BertieVidgen + VinodkumarPrabhakaran + ZeerakWaseem + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.alw-1.0 + + + Online Abuse and Human Rights: <fixed-case>WOAH</fixed-case> Satellite Session at <fixed-case>R</fixed-case>ights<fixed-case>C</fixed-case>on 2020 + VinodkumarPrabhakaran + ZeerakWaseem + SeyiAkiwowo + BertieVidgen + 1–6 + In 2020 The Workshop on Online Abuse and Harms (WOAH) held a satellite panel at RightsCons 2020, an international human rights conference. Our aim was to bridge the gap between human rights scholarship and Natural Language Processing (NLP) research communities in tackling online abuse. We report on the discussions that took place, and present an analysis of four key issues which emerged: Problems in tackling online abuse, Solutions, Meta concerns and the Ecosystem of content moderation and research. We argue there is a pressing need for NLP research communities to engage with human rights perspectives, and identify four key ways in which NLP research into online abuse could immediately be enhanced to create better and more ethical solutions. + 2020.alw-1.1 + + + A Novel Methodology for Developing Automatic Harassment Classifiers for <fixed-case>T</fixed-case>witter + IshaanArora + JuliaGuo + Sarah ItaLevitan + SusanMcGregor + JuliaHirschberg + 7–15 + Most efforts at identifying abusive speech online rely on public corpora that have been scraped from websites using keyword-based queries or released by site or platform owners for research purposes. These are typically labeled by crowd-sourced annotators – not the targets of the abuse themselves. While this method of data collection supports fast development of machine learning classifiers, the models built on them often fail in the context of real-world harassment and abuse, which contain nuances less easily identified by non-targets. 
Here, we present a mixed-methods approach to create classifiers for abuse and harassment which leverages direct engagement with the target group in order to achieve high quality and ecological validity of data sets and labels, and to generate deeper insights into the key tactics of bad actors. We use women journalists’ experience on Twitter as an initial community of focus. We identify several structural mechanisms of abuse that we believe will generalize to other target communities. + 2020.alw-1.2 + + + Using Transfer-based Language Models to Detect Hateful and Offensive Language Online + VebjørnIsaksen + BjörnGambäck + 16–27 + Distinguishing hate speech from non-hate offensive language is challenging, as hate speech not always includes offensive slurs and offensive language not always express hate. Here, four deep learners based on the Bidirectional Encoder Representations from Transformers (BERT), with either general or domain-specific language models, were tested against two datasets containing tweets labelled as either ‘Hateful’, ‘Normal’ or ‘Offensive’. The results indicate that the attention-based models profoundly confuse hate speech with offensive and normal language. However, the pre-trained models outperform state-of-the-art results in terms of accurately predicting the hateful instances. + 2020.alw-1.3 + + + Fine-tuning <fixed-case>BERT</fixed-case> for multi-domain and multi-label incivil language detection + Kadir BulutOzler + KateKenski + SteveRains + YotamShmargad + KevinCoe + StevenBethard + 28–33 + Incivility is a problem on social media, and it comes in many forms (name-calling, vulgarity, threats, etc.) and domains (microblog posts, online news comments, Wikipedia edits, etc.). Training machine learning models to detect such incivility must handle the multi-label and multi-domain nature of the problem. We present a BERT-based model for incivility detection and propose several approaches for training it for multi-label and multi-domain datasets. 
We find that individual binary classifiers outperform a joint multi-label classifier, and that simply combining multiple domains of training data outperforms other recently-proposed fine tuning strategies. We also establish new state-of-the-art performance on several incivility detection datasets. + 2020.alw-1.4 + + + <fixed-case>H</fixed-case>urt<fixed-case>BERT</fixed-case>: Incorporating Lexical Features with <fixed-case>BERT</fixed-case> for the Detection of Abusive Language + AnnaKoufakou + Endang WahyuPamungkas + ValerioBasile + VivianaPatti + 34–43 + The detection of abusive or offensive remarks in social texts has received significant attention in research. In several related shared tasks, BERT has been shown to be the state-of-the-art. In this paper, we propose to utilize lexical features derived from a hate lexicon towards improving the performance of BERT in such tasks. We explore different ways to utilize the lexical features in the form of lexicon-based encodings at the sentence level or embeddings at the word level. We provide an extensive dataset evaluation that addresses in-domain as well as cross-domain detection of abusive content to render a complete picture. Our results indicate that our proposed models combining BERT with lexical features help improve over a baseline BERT model in many of our in-domain and cross-domain experiments. + 2020.alw-1.5 + + + Abusive Language Detection using Syntactic Dependency Graphs + KanikaNarang + ChrisBrew + 44–53 + Automated detection of abusive language online has become imperative. Current sequential models (LSTM) do not work well for long and complex sentences while bi-transformer models (BERT) are not computationally efficient for the task. We show that classifiers based on syntactic structure of the text, dependency graphical convolutional networks (DepGCNs) can achieve state-of-the-art performance on abusive language datasets. 
The overall performance is at par with of strong baselines such as fine-tuned BERT. Further, our GCN-based approach is much more efficient than BERT at inference time making it suitable for real-time detection. + 2020.alw-1.6 + + + Impact of politically biased data on hate speech classification + MaximilianWich + JanBauer + GeorgGroh + 54–64 + One challenge that social media platforms are facing nowadays is hate speech. Hence, automatic hate speech detection has been increasingly researched in recent years - in particular with the rise of deep learning. A problem of these models is their vulnerability to undesirable bias in training data. We investigate the impact of political bias on hate speech classification by constructing three politically-biased data sets (left-wing, right-wing, politically neutral) and compare the performance of classifiers trained on them. We show that (1) political bias negatively impairs the performance of hate speech classifiers and (2) an explainable machine learning model can help to visualize such bias within the training data. The results show that political bias in training data has an impact on hate speech classification and can become a serious issue. + 2020.alw-1.7 + + + Reducing Unintended Identity Bias in <fixed-case>R</fixed-case>ussian Hate Speech Detection + NadezhdaZueva + MadinaKabirova + PavelKalaidin + 65–69 + Toxicity has become a grave problem for many online communities, and has been growing across many languages, including Russian. Hate speech creates an environment of intimidation, discrimination, and may even incite some real-world violence. Both researchers and social platforms have been focused on developing models to detect toxicity in online communication for a while now. A common problem of these models is the presence of bias towards some words (e.g. woman, black, jew or женщина, черный, еврей) that are not toxic, but serve as triggers for the classifier due to model caveats. 
In this paper, we describe our efforts towards classifying hate speech in Russian, and propose simple techniques of reducing unintended bias, such as generating training data with language models using terms and words related to protected identities as context and applying word dropout to such words. + 2020.alw-1.8 + + + Investigating Sampling Bias in Abusive Language Detection + DanteRazo + SandraKübler + 70–78 + Abusive language detection is becoming increasingly important, but we still understand little about the biases in our datasets for abusive language detection, and how these biases affect the quality of abusive language detection. In the work reported here, we reproduce the investigation of Wiegand et al. (2019) to determine differences between different sampling strategies. They compared boosted random sampling, where abusive posts are upsampled, and biased topic sampling, which focuses on topics that are known to cause abusive language. Instead of comparing individual datasets created using these sampling strategies, we use the sampling strategies on a single, large dataset, thus eliminating the textual source of the dataset as a potential confounding factor. We show that differences in the textual source can have more effect than the chosen sampling strategy. + 2020.alw-1.9 + 2020.alw-1.9.OptionalSupplementaryMaterial.zip + + + Attending the Emotions to Detect Online Abusive Language + NiloofarSafi Samghabadi + AfsheenHatami + MahsaShafaei + SudiptaKar + ThamarSolorio + 79–88 + In recent years, abusive behavior has become a serious issue in online social networks. In this paper, we present a new corpus for the task of abusive language detection that is collected from a semi-anonymous online platform, and unlike the majority of other available resources, is not created based on a specific list of bad words. We also develop computational models to incorporate emotions into textual cues to improve aggression identification. 
We evaluate our proposed methods on a set of corpora related to the task and show promising results with respect to abusive language detection. + 2020.alw-1.10 + + + Enhancing the Identification of Cyberbullying through Participant Roles + GathikaRathnayake + ThushariAtapattu + MahenHerath + GeorgiaZhang + KatrinaFalkner + 89–94 + Cyberbullying is a prevalent social problem that inflicts detrimental consequences to the health and safety of victims such as psychological distress, anti-social behaviour, and suicide. The automation of cyberbullying detection is a recent but widely researched problem, with current research having a strong focus on a binary classification of bullying versus non-bullying. This paper proposes a novel approach to enhancing cyberbullying detection through role modeling. We utilise a dataset from ASKfm to perform multi-class classification to detect participant roles (e.g. victim, harasser). Our preliminary results demonstrate promising performance including 0.83 and 0.76 of F1-score for cyberbullying and role classification respectively, outperforming baselines. + 2020.alw-1.11 + + + Developing a New Classifier for Automated Identification of Incivility in Social Media + SamDavidson + QiusiSun + MagdalenaWojcieszak + 95–101 + Incivility is not only prevalent on online social media platforms, but also has concrete effects on individual users, online groups, and the platforms themselves. Given the prevalence and effects of online incivility, and the challenges involved in human-based incivility detection, it is urgent to develop validated and versatile automatic approaches to identifying uncivil posts and comments. This project advances both a neural, BERT-based classifier as well as a logistic regression classifier to identify uncivil comments. The classifier is trained on a dataset of Reddit posts, which are annotated for incivility, and further expanded using a combination of labeled data from Reddit and Twitter. 
Our best performing model achieves an F1 of 0.802 on our Reddit test set. The final model is not only applicable across social media platforms and their distinct data structures, but also computationally versatile, and - as such - ready to be used on vast volumes of online data. All trained models and annotated data are made available to the research community. + 2020.alw-1.12 + + + Countering hate on social media: Large scale classification of hate and counter speech + JoshuaGarland + KeyanGhazi-Zahedi + Jean-GabrielYoung + LaurentHébert-Dufresne + MirtaGalesic + 102–112 + Hateful rhetoric is plaguing online discourse, fostering extreme societal movements and possibly giving rise to real-world violence. A potential solution to this growing global problem is citizen-generated counter speech where citizens actively engage with hate speech to restore civil non-polarized discourse. However, its actual effectiveness in curbing the spread of hatred is unknown and hard to quantify. One major obstacle to researching this question is a lack of large labeled data sets for training automated classifiers to identify counter speech. Here we use a unique situation in Germany where self-labeling groups engaged in organized online hate and counter speech. We use an ensemble learning algorithm which pairs a variety of paragraph embeddings with regularized logistic regression functions to classify both hate and counter speech in a corpus of millions of relevant tweets from these two groups. Our pipeline achieves macro F1 scores on out of sample balanced test sets ranging from 0.76 to 0.97—accuracy in line and even exceeding the state of the art. We then use the classifier to discover hate and counter speech in more than 135,000 fully-resolved Twitter conversations occurring from 2013 to 2018 and study their frequency and interaction. 
Altogether, our results highlight the potential of automated methods to evaluate the impact of coordinated counter speech in stabilizing conversations on social media. + 2020.alw-1.13 + 2020.alw-1.13.OptionalSupplementaryMaterial.pdf + + + Moderating Our (Dis)Content: Renewing the Regulatory Approach + ClairePershan + 113 + As online platforms become central to our democracies, the problem of toxic content threatens the free flow of information and the enjoyment of fundamental rights. But effective policy response to toxic content must grasp the idiosyncrasies and interconnectedness of content moderation across a fragmented online landscape. This report urges regulators and legislators to consider a range of platforms and moderation approaches in the regulation. In particular, it calls for a holistic, process-oriented regulatory approach that accounts for actors beyond the handful of dominant platforms that currently shape public debate. + 2020.alw-1.14 + + + Six Attributes of Unhealthy Conversations + IlanPrice + JordanGifford-Moore + JoryFlemming + SaulMusker + MaayanRoichman + GuillaumeSylvain + NithumThain + LucasDixon + JeffreySorensen + 114–124 + We present a new dataset of approximately 44000 comments labeled by crowdworkers. Each comment is labelled as either ‘healthy’ or ‘unhealthy’, in addition to binary labels for the presence of six potentially ‘unhealthy’ sub-attributes: (1) hostile; (2) antagonistic, insulting, provocative or trolling; (3) dismissive; (4) condescending or patronising; (5) sarcastic; and/or (6) an unfair generalisation. Each label also has an associated confidence score. We argue that there is a need for datasets which enable research based on a broad notion of ‘unhealthy online conversation’. We build this typology to encompass a substantial proportion of the individual comments which contribute to unhealthy online conversation. For some of these attributes, this is the first publicly available dataset of this scale. 
We explore the quality of the dataset, present some summary statistics and initial models to illustrate the utility of this data, and highlight limitations and directions for further research. + 2020.alw-1.15 + 2020.alw-1.15.OptionalSupplementaryMaterial.pdf + + + A Unified Taxonomy of Harmful Content + MicheleBanko + BrendonMacKeen + LaurieRay + 125–137 + The ability to recognize harmful content within online communities has come into focus for researchers, engineers and policy makers seeking to protect users from abuse. While the number of datasets aiming to capture forms of abuse has grown in recent years, the community has not standardized around how various harmful behaviors are defined, creating challenges for reliable moderation, modeling and evaluation. As a step towards attaining shared understanding of how online abuse may be modeled, we synthesize the most common types of abuse described by industry, policy, community and health experts into a unified typology of harmful content, with detailed criteria and exceptions for each type of abuse. + 2020.alw-1.16 + + + Towards a Comprehensive Taxonomy and Large-Scale Annotated Corpus for Online Slur Usage + JanaKurrek + Haji MohammadSaleem + DerekRuths + 138–149 + Abusive language classifiers have been shown to exhibit bias against women and racial minorities. Since these models are trained on data that is collected using keywords, they tend to exhibit a high sensitivity towards pejoratives. As a result, comments written by victims of abuse are frequently labelled as hateful, even if they discuss or reclaim slurs. Any attempt to address bias in keyword-based corpora requires a better understanding of pejorative language, as well as an equitable representation of targeted users in data collection. We make two main contributions to this end. First, we provide an annotation guide that outlines 4 main categories of online slur usage, which we further divide into a total of 12 sub-categories. 
Second, we present a publicly available corpus based on our taxonomy, with 39.8k human annotated comments extracted from Reddit. This corpus was annotated by a diverse cohort of coders, with Shannon equitability indices of 0.90, 0.92, and 0.87 across sexuality, ethnicity, and gender. Taken together, our taxonomy and corpus allow researchers to evaluate classifiers on a wider range of speech containing slurs. + 2020.alw-1.17 + 2020.alw-1.17.OptionalSupplementaryMaterial.pdf + + + In Data We Trust: A Critical Analysis of Hate Speech Detection Datasets + KosisochukwuMadukwe + XiaoyingGao + BingXue + 150–161 + Recently, a few studies have discussed the limitations of datasets collected for the task of detecting hate speech from different viewpoints. We intend to contribute to the conversation by providing a consolidated overview of these issues pertaining to the data that debilitate research in this area. Specifically, we discuss how the varying pre-processing steps and the format for making data publicly available result in highly varying datasets that make an objective comparison between studies difficult and unfair. There is currently no study (to the best of our knowledge) focused on comparing the attributes of existing datasets for hate speech detection, outlining their limitations and recommending approaches for future research. This work intends to fill that gap and become the one-stop shop for information regarding hate speech datasets. + 2020.alw-1.18 + + + Detecting <fixed-case>E</fixed-case>ast <fixed-case>A</fixed-case>sian Prejudice on Social Media + BertieVidgen + ScottHale + EllaGuest + HelenMargetts + DavidBroniatowski + ZeerakWaseem + AustinBotelho + MatthewHall + RebekahTromble + 162–172 + During COVID-19 concerns have heightened about the spread of aggressive and hateful language online, especially hostility directed against East Asia and East Asian people. 
We report on a new dataset and the creation of a machine learning classifier that categorizes social media posts from Twitter into four classes: Hostility against East Asia, Criticism of East Asia, Meta-discussions of East Asian prejudice, and a neutral class. The classifier achieves a macro-F1 score of 0.83. We then conduct an in-depth ground-up error analysis and show that the model struggles with edge cases and ambiguous content. We provide the 20,000 tweet training dataset (annotated by experienced analysts), which also contains several secondary categories and additional flags. We also provide the 40,000 original annotations (before adjudication), the full codebook, annotations for COVID-19 relevance and East Asian relevance and stance for 1,000 hashtags, and the final model. + 2020.alw-1.19 + 2020.alw-1.19.OptionalSupplementaryMaterial.zip + + + On Cross-Dataset Generalization in Automatic Detection of Online Abuse + IsarNejadgholi + SvetlanaKiritchenko + 173–183 + NLP research has attained high performances in abusive language detection as a supervised classification task. While in research settings, training and test datasets are usually obtained from similar data samples, in practice systems are often applied on data that are different from the training set in topic and class distributions. Also, the ambiguity in class definitions inherited in this task aggravates the discrepancies between source and target datasets. We explore the topic bias and the task formulation bias in cross-dataset generalization. We show that the benign examples in the Wikipedia Detox dataset are biased towards platform-specific topics. We identify these examples using unsupervised topic modeling and manual inspection of topics’ keywords. Removing these topics increases cross-dataset generalization, without reducing in-domain classification performance. 
For a robust dataset design, we suggest applying inexpensive unsupervised methods to inspect the collected data and downsize the non-generalizable content before manually annotating for class labels. + 2020.alw-1.20 + 2020.alw-1.20.OptionalSupplementaryMaterial.zip + + + Identifying and Measuring Annotator Bias Based on Annotators’ Demographic Characteristics + HalaAl Kuwatly + MaximilianWich + GeorgGroh + 184–190 + Machine learning is recently used to detect hate speech and other forms of abusive language in online platforms. However, a notable weakness of machine learning models is their vulnerability to bias, which can impair their performance and fairness. One type is annotator bias caused by the subjective perception of the annotators. In this work, we investigate annotator bias using classification models trained on data from demographically distinct annotator groups. To do so, we sample balanced subsets of data that are labeled by demographically distinct annotators. We then train classifiers on these subsets, analyze their performances on similarly grouped test sets, and compare them statistically. Our findings show that the proposed approach successfully identifies bias and that demographic features, such as first language, age, and education, correlate with significant performance differences. + 2020.alw-1.21 + 2020.alw-1.21.OptionalSupplementaryMaterial.zip + + + Investigating Annotator Bias with a Graph-Based Approach + MaximilianWich + HalaAl Kuwatly + GeorgGroh + 191–199 + A challenge that many online platforms face is hate speech or any other form of online abuse. To cope with this, hate speech detection systems are developed based on machine learning to reduce manual work for monitoring these platforms. Unfortunately, machine learning is vulnerable to unintended bias in training data, which could have severe consequences, such as a decrease in classification performance or unfair behavior (e.g., discriminating minorities). 
In the scope of this study, we want to investigate annotator bias — a form of bias that annotators cause due to different knowledge in regards to the task and their subjective perception. Our goal is to identify annotation bias based on similarities in the annotation behavior from annotators. To do so, we build a graph based on the annotations from the different annotators, apply a community detection algorithm to group the annotators, and train for each group classifiers whose performances we compare. By doing so, we are able to identify annotator bias within a data set. The proposed method and collected insights can contribute to developing fairer and more reliable hate speech classification models. + 2020.alw-1.22 + 2020.alw-1.22.OptionalSupplementaryMaterial.zip + +
+
diff --git a/data/xml/2020.blackboxnlp.xml b/data/xml/2020.blackboxnlp.xml new file mode 100644 index 0000000000..dc355e7d49 --- /dev/null +++ b/data/xml/2020.blackboxnlp.xml @@ -0,0 +1,318 @@ + + + + + Proceedings of the Third BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP + AfraAlishahi + YonatanBelinkov + GrzegorzChrupała + DieuwkeHupkes + YuvalPinter + HassanSajjad + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.blackboxnlp-1.0 + + + <fixed-case>BERT</fixed-case>ering <fixed-case>RAMS</fixed-case>: What and How Much does <fixed-case>BERT</fixed-case> Already Know About Event Arguments? - A Study on the <fixed-case>RAMS</fixed-case> Dataset + VarunGangal + EduardHovy + 1–10 + Using the attention map based probing framework from (Clark et al., 2019), we observe that, on the RAMS dataset (Ebner et al., 2020), BERT’s attention heads have modest but well above-chance ability to spot event arguments sans any training or domain finetuning, varying from a low of 17.77% for Place to a high of 51.61% for Artifact. Next, we find that linear combinations of these heads, estimated with approx. 11% of available total event argument detection supervision, can push performance well higher for some roles — highest two being Victim (68.29% Accuracy) and Artifact (58.82% Accuracy). Furthermore, we investigate how well our methods do for cross-sentence event arguments. We propose a procedure to isolate “best heads” for cross-sentence argument detection separately of those for intra-sentence arguments. The heads thus estimated have superior cross-sentence performance compared to their jointly estimated equivalents, albeit only under the unrealistic assumption that we already know the argument is present in another sentence. Lastly, we seek to isolate to what extent our numbers stem from lexical frequency based associations between gold arguments and roles. We propose NONCE, a scheme to create adversarial test examples by replacing gold arguments with randomly generated “nonce” words. We find that learnt linear combinations are robust to NONCE, though individual best heads can be more sensitive. 
+ 2020.blackboxnlp-1.1 + 2020.blackboxnlp-1.1.OptionalSupplementaryMaterial.zip + + + Emergent Language Generalization and Acquisition Speed are not tied to Compositionality + EugeneKharitonov + MarcoBaroni + 11–15 + Studies of discrete languages emerging when neural agents communicate to solve a joint task often look for evidence of compositional structure. This stems from the expectation that such a structure would allow languages to be acquired faster by the agents and enable them to generalize better. We argue that these beneficial properties are only loosely connected to compositionality. In two experiments, we demonstrate that, depending on the task, non-compositional languages might show equal, or better, generalization performance and acquisition speed than compositional ones. Further research in the area should be clearer about what benefits are expected from compositionality, and how the latter would lead to them. + 2020.blackboxnlp-1.2 + + + Examining the rhetorical capacities of neural language models + ZiningZhu + ChuerPan + MohamedAbdalla + FrankRudzicz + 16–32 + Recently, neural language models (LMs) have demonstrated impressive abilities in generating high-quality discourse. While many recent papers have analyzed the syntactic aspects encoded in LMs, there has been no analysis to date of the inter-sentential, rhetorical knowledge. In this paper, we propose a method that quantitatively evaluates the rhetorical capacities of neural LMs. We examine the capacities of neural LMs understanding the rhetoric of discourse by evaluating their abilities to encode a set of linguistic features derived from Rhetorical Structure Theory (RST). Our experiments show that BERT-based LMs outperform other Transformer LMs, revealing the richer discourse knowledge in their intermediate layer representations. In addition, GPT-2 and XLNet apparently encode less rhetorical knowledge, and we suggest an explanation drawing from linguistic philosophy. 
Our method shows an avenue towards quantifying the rhetorical capacities of neural LMs. + 2020.blackboxnlp-1.3 + + + What Happens To <fixed-case>BERT</fixed-case> Embeddings During Fine-tuning? + AmilMerchant + ElaheRahimtoroghi + ElliePavlick + IanTenney + 33–44 + While much recent work has examined how linguistic information is encoded in pre-trained sentence representations, comparatively little is understood about how these models change when adapted to solve downstream tasks. Using a suite of analysis techniques—supervised probing, unsupervised similarity analysis, and layer-based ablations—we investigate how fine-tuning affects the representations of the BERT model. We find that while fine-tuning necessarily makes some significant changes, there is no catastrophic forgetting of linguistic phenomena. We instead find that fine-tuning is a conservative process that primarily affects the top layers of BERT, albeit with noteworthy variation across tasks. In particular, dependency parsing reconfigures most of the model, whereas SQuAD and MNLI involve much shallower processing. Finally, we also find that fine-tuning has a weaker effect on representations of out-of-domain sentences, suggesting room for improvement in model generalization. + 2020.blackboxnlp-1.4 + 2020.blackboxnlp-1.4.OptionalSupplementaryMaterial.zip + + + It’s not <fixed-case>G</fixed-case>reek to m<fixed-case>BERT</fixed-case>: Inducing Word-Level Translations from Multilingual <fixed-case>BERT</fixed-case> + HilaGonen + ShauliRavfogel + YanaiElazar + YoavGoldberg + 45–56 + Recent works have demonstrated that multilingual BERT (mBERT) learns rich cross-lingual representations, that allow for transfer across languages. We study the word-level translation information embedded in mBERT and present two simple methods that expose remarkable translation capabilities with no fine-tuning. 
The results suggest that most of this information is encoded in a non-linear way, while some of it can also be recovered with purely linear tools. As part of our analysis, we test the hypothesis that mBERT learns representations which contain both a language-encoding component and an abstract, cross-lingual component, and explicitly identify an empirical language-identity subspace within mBERT representations. + 2020.blackboxnlp-1.5 + + + Leveraging Extracted Model Adversaries for Improved Black Box Attacks + Naveen JaferNizar + AriKobren + 57–67 + We present a method for adversarial input generation against black box models for reading comprehension based question answering. Our approach is composed of two steps. First, we approximate a victim black box model via model extraction. Second, we use our own white box method to generate input perturbations that cause the approximate model to fail. These perturbed inputs are used against the victim. In experiments we find that our method improves on the efficacy of the ADDANY—a white box attack—performed on the approximate model by 25% F1, and the ADDSENT attack—a black box attack—by 11% F1. + 2020.blackboxnlp-1.6 + + + On the Interplay Between Fine-tuning and Sentence-Level Probing for Linguistic Knowledge in Pre-Trained Transformers + MariusMosbach + AnnaKhokhlova + Michael A.Hedderich + DietrichKlakow + 68–82 + Fine-tuning pre-trained contextualized embedding models has become an integral part of the NLP pipeline. At the same time, probing has emerged as a way to investigate the linguistic knowledge captured by pre-trained models. Very little is, however, understood about how fine-tuning affects the representations of pre-trained models and thereby the linguistic knowledge they encode. This paper contributes towards closing this gap. We study three different pre-trained models: BERT, RoBERTa, and ALBERT, and investigate through sentence-level probing how fine-tuning affects their representations. 
We find that for some probing tasks fine-tuning leads to substantial changes in accuracy, possibly suggesting that fine-tuning introduces or even removes linguistic knowledge from a pre-trained model. These changes, however, vary greatly across different models, fine-tuning and probing tasks. Our analysis reveals that while fine-tuning indeed changes the representations of a pre-trained model and these changes are typically larger for higher layers, only in very few cases, fine-tuning has a positive effect on probing accuracy that is larger than just using the pre-trained model with a strong pooling method. Based on our findings, we argue that both positive and negative effects of fine-tuning on probing require a careful interpretation. + 2020.blackboxnlp-1.7 + + + Unsupervised Evaluation for Question Answering with Transformers + LukasMuttenthaler + IsabelleAugenstein + JohannesBjerva + 83–90 + It is challenging to automatically evaluate the answer of a QA model at inference time. Although many models provide confidence scores, and simple heuristics can go a long way towards indicating answer correctness, such measures are heavily dataset-dependent and are unlikely to generalise. In this work, we begin by investigating the hidden representations of questions, answers, and contexts in transformer-based QA architectures. We observe a consistent pattern in the answer representations, which we show can be used to automatically evaluate whether or not a predicted answer span is correct. Our method does not require any labelled data and outperforms strong heuristic baselines, across 2 datasets and 7 domains. We are able to predict whether or not a model’s answer is correct with 91.37% accuracy on SQuAD, and 80.7% accuracy on SubjQA. We expect that this method will have broad applications, e.g., in semi-automatic development of QA datasets. 
+ 2020.blackboxnlp-1.8 + + + Unsupervised Distillation of Syntactic Information from Contextualized Word Representations + ShauliRavfogel + YanaiElazar + JacobGoldberger + YoavGoldberg + 91–106 + Contextualized word representations, such as ELMo and BERT, were shown to perform well on various semantic and syntactic tasks. In this work, we tackle the task of unsupervised disentanglement between semantics and structure in neural language representations: we aim to learn a transformation of the contextualized vectors, that discards the lexical semantics, but keeps the structural information. To this end, we automatically generate groups of sentences which are structurally similar but semantically different, and use a metric-learning approach to learn a transformation that emphasizes the structural component that is encoded in the vectors. We demonstrate that our transformation clusters vectors in space by structural properties, rather than by lexical semantics. Finally, we demonstrate the utility of our distilled representations by showing that they outperform the original contextualized representations in a few-shot parsing setting. + 2020.blackboxnlp-1.9 + + + The Explanation Game: Towards Prediction Explainability through Sparse Communication + MarcosTreviso + André F. T.Martins + 107–118 + Explainability is a topic of growing importance in NLP. In this work, we provide a unified perspective of explainability as a communication problem between an explainer and a layperson about a classifier’s decision. We use this framework to compare several explainers, including gradient methods, erasure, and attention mechanisms, in terms of their communication success. In addition, we reinterpret these methods in the light of classical feature selection, and use this as inspiration for new embedded explainers, through the use of selective, sparse attention. 
Experiments in text classification and natural language inference, using different configurations of explainers and laypeople (including both machines and humans), reveal an advantage of attention-based explainers over gradient and erasure methods, and show that selective attention is a simpler alternative to stochastic rationalizers. Human experiments show strong results on text classification with post-hoc explainers trained to optimize communication success. + 2020.blackboxnlp-1.10 + 2020.blackboxnlp-1.10.OptionalSupplementaryMaterial.zip + + + Latent Tree Learning with Ordered Neurons: What Parses Does It Produce? + YianZhang + 119–125 + Recent latent tree learning models can learn constituency parsing without any exposure to human-annotated tree structures. One such model is ON-LSTM (Shen et al., 2019), which is trained on language modelling and has near-state-of-the-art performance on unsupervised parsing. In order to better understand the performance and consistency of the model as well as how the parses it generates are different from gold-standard PTB parses, we replicate the model with different restarts and examine their parses. We find that (1) the model has reasonably consistent parsing behaviors across different restarts, (2) the model struggles with the internal structures of complex noun phrases, (3) the model has a tendency to overestimate the height of the split points right before verbs. We speculate that both problems could potentially be solved by adopting a different training task other than unidirectional language modelling. 
+ 2020.blackboxnlp-1.11 + + + Linguistically-Informed Transformations (<fixed-case>LIT</fixed-case>): A Method for Automatically Generating Contrast Sets + ChuanrongLi + LinShengshuo + ZeyuLiu + XinyiWu + XuhuiZhou + ShaneSteinert-Threlkeld + 126–135 + Although large-scale pretrained language models, such as BERT and RoBERTa, have achieved superhuman performance on in-distribution test sets, their performance suffers on out-of-distribution test sets (e.g., on contrast sets). Building contrast sets often requires human-expert annotation, which is expensive and hard to create on a large scale. In this work, we propose a Linguistically-Informed Transformation (LIT) method to automatically generate contrast sets, which enables practitioners to explore linguistic phenomena of interest as well as compose different phenomena. Experimenting with our method on SNLI and MNLI shows that current pretrained language models, although being claimed to contain sufficient linguistic knowledge, struggle on our automatically generated contrast sets. Furthermore, we improve models’ performance on the contrast sets by applying LIT to augment the training data, without affecting performance on the original data. + 2020.blackboxnlp-1.12 + 2020.blackboxnlp-1.12.OptionalSupplementaryMaterial.pdf + + + Controlling the Imprint of Passivization and Negation in Contextualized Representations + HandeCelikkanat + SamiVirpioja + JörgTiedemann + MariannaApidianaki + 136–148 + Contextualized word representations encode rich information about syntax and semantics, alongside specificities of each context of use. While contextual variation does not always reflect actual meaning shifts, it can still reduce the similarity of embeddings for word instances having the same meaning. 
We explore the imprint of two specific linguistic alternations, namely passivization and negation, on the representations generated by neural models trained with two different objectives: masked language modeling and translation. Our exploration methodology is inspired by an approach previously proposed for removing societal biases from word vectors. We show that passivization and negation leave their traces on the representations, and that neutralizing this information leads to more similar embeddings for words that should preserve their meaning in the transformation. We also find clear differences in how the respective features generalize across datasets. + 2020.blackboxnlp-1.13 + + + The elephant in the interpretability room: Why use attention as explanation when we have saliency methods? + JasmijnBastings + KatjaFilippova + 149–155 + There is a recent surge of interest in using attention as explanation of model predictions, with mixed evidence on whether attention can be used as such. While attention conveniently gives us one weight per input token and is easily extracted, it is often unclear toward what goal it is used as explanation. We find that often that goal, whether explicitly stated or not, is to find out what input tokens are the most relevant to a prediction, and that the implied user for the explanation is a model developer. For this goal and user, we argue that input saliency methods are better suited, and that there are no compelling reasons to use attention, despite the coincidence that it provides a weight for each input. With this position paper, we hope to shift some of the recent focus on attention to saliency methods, and for authors to clearly state the goal and user for their explanations. + 2020.blackboxnlp-1.14 + + + How does <fixed-case>BERT</fixed-case> capture semantics? 
A closer look at polysemous words + DavidYenicelik + FlorianSchmidt + YannicKilcher + 156–162 + The recent paradigm shift to contextual word embeddings has seen tremendous success across a wide range of down-stream tasks. However, little is known on how the emergent relation of context and semantics manifests geometrically. We investigate polysemous words as one particularly prominent instance of semantic organization. Our rigorous quantitative analysis of linear separability and cluster organization in embedding vectors produced by BERT shows that semantics do not surface as isolated clusters but form seamless structures, tightly coupled with sentiment and syntax. + 2020.blackboxnlp-1.15 + 2020.blackboxnlp-1.15.OptionalSupplementaryMaterial.zip + + + Neural Natural Language Inference Models Partially Embed Theories of Lexical Entailment and Negation + AtticusGeiger + KyleRichardson + ChristopherPotts + 163–173 + We address whether neural models for Natural Language Inference (NLI) can learn the compositional interactions between lexical entailment and negation, using four methods: the behavioral evaluation methods of (1) challenge test sets and (2) systematic generalization tasks, and the structural evaluation methods of (3) probes and (4) interventions. To facilitate this holistic evaluation, we present Monotonicity NLI (MoNLI), a new naturalistic dataset focused on lexical entailment and negation. In our behavioral evaluations, we find that models trained on general-purpose NLI datasets fail systematically on MoNLI examples containing negation, but that MoNLI fine-tuning addresses this failure. In our structural evaluations, we look for evidence that our top-performing BERT-based model has learned to implement the monotonicity algorithm behind MoNLI. Probes yield evidence consistent with this conclusion, and our intervention experiments bolster this, showing that the causal dynamics of the model mirror the causal dynamics of this algorithm on subsets of MoNLI. 
This suggests that the BERT model at least partially embeds a theory of lexical entailment and negation at an algorithmic level. + 2020.blackboxnlp-1.16 + 2020.blackboxnlp-1.16.OptionalSupplementaryMaterial.zip + + + <fixed-case>BERT</fixed-case>nesia: Investigating the capture and forgetting of knowledge in <fixed-case>BERT</fixed-case> + JaspreetSingh + JonasWallat + AvishekAnand + 174–183 + Probing complex language models has recently revealed several insights into linguistic and semantic patterns found in the learned representations. In this paper, we probe BERT specifically to understand and measure the relational knowledge it captures. We utilize knowledge base completion tasks to probe every layer of pre-trained as well as fine-tuned BERT (ranking, question answering, NER). Our findings show that knowledge is not just contained in BERT’s final layers. Intermediate layers contribute a significant amount (17-60%) to the total knowledge found. Probing intermediate layers also reveals how different types of knowledge emerge at varying rates. When BERT is fine-tuned, relational knowledge is forgotten but the extent of forgetting is impacted by the fine-tuning objective but not the size of the dataset. We found that ranking models forget the least and retain more knowledge in their final layer. + 2020.blackboxnlp-1.17 + 2020.blackboxnlp-1.17.OptionalSupplementaryMaterial.zip + + + Probing for Multilingual Numerical Understanding in Transformer-Based Language Models + DevinJohnson + DeniseMak + AndrewBarker + LexiLoessberg-Zahl + 184–192 + Natural language numbers are an example of compositional structures, where larger numbers are composed of operations on smaller numbers. Given that compositional reasoning is a key to natural language understanding, we propose novel multilingual probing tasks tested on DistilBERT, XLM, and BERT to investigate for evidence of compositional reasoning over numerical data in various natural language number systems. 
By using both grammaticality judgment and value comparison classification tasks in English, Japanese, Danish, and French, we find evidence that the information encoded in these pretrained models’ embeddings is sufficient for grammaticality judgments but generally not for value comparisons. We analyze possible reasons for this and discuss how our tasks could be extended in further studies. + 2020.blackboxnlp-1.18 + + + Dissecting Lottery Ticket Transformers: Structural and Behavioral Study of Sparse Neural Machine Translation + RajivMovva + JasonZhao + 193–203 + Recent work on the lottery ticket hypothesis has produced highly sparse Transformers for NMT while maintaining BLEU. However, it is unclear how such pruning techniques affect a model’s learned representations. By probing Transformers with more and more low-magnitude weights pruned away, we find that complex semantic information is first to be degraded. Analysis of internal activations reveals that higher layers diverge most over the course of pruning, gradually becoming less complex than their dense counterparts. Meanwhile, early layers of sparse models begin to perform more encoding. Attention mechanisms remain remarkably consistent as sparsity increases. + 2020.blackboxnlp-1.19 + 2020.blackboxnlp-1.19.OptionalSupplementaryMaterial.pdf + + + Exploring Neural Entity Representations for Semantic Information + AndrewRunge + EduardHovy + 204–216 + Neural methods for embedding entities are typically extrinsically evaluated on downstream tasks and, more recently, intrinsically using probing tasks. Downstream task-based comparisons are often difficult to interpret due to differences in task structure, while probing task evaluations often look at only a few attributes and models. 
We address both of these issues by evaluating a diverse set of eight neural entity embedding methods on a set of simple probing tasks, demonstrating which methods are able to remember words used to describe entities, learn type, relationship and factual information, and identify how frequently an entity is mentioned. We also compare these methods in a unified framework on two entity linking tasks and discuss how they generalize to different model architectures and datasets. + 2020.blackboxnlp-1.20 + + + <fixed-case>BERT</fixed-case>s of a feather do not generalize together: Large variability in generalization across models with similar test set performance + R. ThomasMcCoy + JunghyunMin + TalLinzen + 217–227 + If the same neural network architecture is trained multiple times on the same dataset, will it make similar linguistic generalizations across runs? To study this question, we fine-tuned 100 instances of BERT on the Multi-genre Natural Language Inference (MNLI) dataset and evaluated them on the HANS dataset, which evaluates syntactic generalization in natural language inference. On the MNLI development set, the behavior of all instances was remarkably consistent, with accuracy ranging between 83.6% and 84.8%. In stark contrast, the same models varied widely in their generalization performance. For example, on the simple case of subject-object swap (e.g., determining that “the doctor visited the lawyer” does not entail “the lawyer visited the doctor”), accuracy ranged from 0.0% to 66.2%. Such variation is likely due to the presence of many local minima in the loss surface that are equally attractive to a low-bias learner such as a neural network; decreasing the variability may therefore require models with stronger inductive biases. 
+ 2020.blackboxnlp-1.21 + + + Attacking Semantic Similarity: Generating Second-Order <fixed-case>NLP</fixed-case> Adversarial Examples + JohnMorris + 228–237 + Adversarial example generation methods in NLP rely on models like language models or sentence encoders to determine if potential adversarial examples are valid. In these methods, a valid adversarial example fools the model being attacked, and is determined to be semantically or syntactically valid by a second model. Research to date has counted all such examples as errors by the attacked model. We contend that these adversarial examples may not be flaws in the attacked model, but flaws in the model that determines validity. We term such invalid inputs second-order adversarial examples. We propose the constraint robustness curve, and associated metric ACCS, as tools for evaluating the robustness of a constraint to second-order adversarial examples. To generate this curve, we design an adversarial attack to run directly on the semantic similarity models. We test on two constraints, the Universal Sentence Encoder (USE) and BERTScore. Our findings indicate that such second-order examples exist, but are typically less common than first-order adversarial examples in state-of-the-art models. They also indicate that USE is effective as a constraint on NLP adversarial examples, while BERTScore is nearly ineffectual. Code for running the experiments in this paper is available here. + 2020.blackboxnlp-1.22 + 2020.blackboxnlp-1.22.OptionalSupplementaryMaterial.zip + + + Discovering the Compositional Structure of Vector Representations with Role Learning Networks + PaulSoulos + R. ThomasMcCoy + TalLinzen + PaulSmolensky + 238–254 + How can neural networks perform so well on compositional tasks even though they lack explicit compositional representations? 
We use a novel analysis technique called ROLE to show that recurrent neural networks perform well on such tasks by converging to solutions which implicitly represent symbolic structure. This method uncovers a symbolic structure which, when properly embedded in vector space, closely approximates the encodings of a standard seq2seq network trained to perform the compositional SCAN task. We verify the causal importance of the discovered symbolic structure by showing that, when we systematically manipulate hidden embeddings based on this symbolic structure, the model’s output is changed in the way predicted by our analysis. + 2020.blackboxnlp-1.23 + + + Structured Self-Attention Weights Encodes Semantics in Sentiment Analysis + ZhengxuanWu + Thanh-SonNguyen + DesmondOng + 255–264 + Neural attention, especially the self-attention made popular by the Transformer, has become the workhorse of state-of-the-art natural language processing (NLP) models. Very recent work suggests that the self-attention in the Transformer encodes syntactic information; Here, we show that self-attention scores encode semantics by considering sentiment analysis tasks. In contrast to gradient-based feature attribution methods, we propose a simple and effective Layer-wise Attention Tracing (LAT) method to analyze structured attention weights. We apply our method to Transformer models trained on two tasks that have surface dissimilarities, but share common semantics—sentiment analysis of movie reviews and time-series valence prediction in life story narratives. Across both tasks, words with high aggregated attention weights were rich in emotional semantics, as quantitatively validated by an emotion lexicon labeled by human annotators. Our results show that structured attention weights encode rich semantics in sentiment analysis, and match human interpretations of semantics. 
+ 2020.blackboxnlp-1.24 + 2020.blackboxnlp-1.24.OptionalSupplementaryMaterial.zip + + + Investigating Novel Verb Learning in <fixed-case>BERT</fixed-case>: Selectional Preference Classes and Alternation-Based Syntactic Generalization + TristanThrush + EthanWilcox + RogerLevy + 265–275 + Previous studies investigating the syntactic abilities of deep learning models have not targeted the relationship between the strength of the grammatical generalization and the amount of evidence to which the model is exposed during training. We address this issue by deploying a novel word-learning paradigm to test BERT’s few-shot learning capabilities for two aspects of English verbs: alternations and classes of selectional preferences. For the former, we fine-tune BERT on a single frame in a verbal-alternation pair and ask whether the model expects the novel verb to occur in its sister frame. For the latter, we fine-tune BERT on an incomplete selectional network of verbal objects and ask whether it expects unattested but plausible verb/object pairs. We find that BERT makes robust grammatical generalizations after just one or two instances of a novel word in fine-tuning. For the verbal alternation tests, we find that the model displays behavior that is consistent with a transitivity bias: verbs seen few times are expected to take direct objects, but verbs seen with direct objects are not expected to occur intransitively. + 2020.blackboxnlp-1.25 + + + The <fixed-case>EOS</fixed-case> Decision and Length Extrapolation + BenjaminNewman + JohnHewitt + PercyLiang + Christopher D.Manning + 276–291 + Extrapolation to unseen sequence lengths is a challenge for neural generative models of language. In this work, we characterize the effect on length extrapolation of a modeling decision often overlooked: predicting the end of the generative process through the use of a special end-of-sequence (EOS) vocabulary item. 
We study an oracle setting - forcing models to generate to the correct sequence length at test time - to compare the length-extrapolative behavior of networks trained to predict EOS (+EOS) with networks not trained to (-EOS). We find that -EOS substantially outperforms +EOS, for example extrapolating well to lengths 10 times longer than those seen at training time in a bracket closing task, as well as achieving a 40% improvement over +EOS in the difficult SCAN dataset length generalization task. By comparing the hidden states and dynamics of -EOS and +EOS models, we observe that +EOS models fail to generalize because they (1) unnecessarily stratify their hidden states by their linear position in a sequence (structures we call length manifolds) or (2) get stuck in clusters (which we refer to as length attractors) once the EOS token is the highest-probability prediction. + 2020.blackboxnlp-1.26 + + + Do Language Embeddings capture Scales? + XikunZhang + DeepakRamachandran + IanTenney + YanaiElazar + DanRoth + 292–299 + Pretrained Language Models (LMs) have been shown to possess significant linguistic, common sense and factual knowledge. One form of knowledge that has not been studied yet in this context is information about the scalar magnitudes of objects. We show that pretrained language models capture a significant amount of this information but are short of the capability required for general common-sense reasoning. We identify contextual information in pre-training and numeracy as two key factors affecting their performance, and show that a simple method of canonicalizing numbers can have a significant effect on the results. + 2020.blackboxnlp-1.27 + + + Evaluating Attribution Methods using White-Box <fixed-case>LSTM</fixed-case>s + YidingHao + 300–313 + Interpretability methods for neural networks are difficult to evaluate because we do not understand the black-box models typically used to test them. 
This paper proposes a framework in which interpretability methods are evaluated using manually constructed networks, which we call white-box networks, whose behavior is understood a priori. We evaluate five methods for producing attribution heatmaps by applying them to white-box LSTM classifiers for tasks based on formal languages. Although our white-box classifiers solve their tasks perfectly and transparently, we find that all five attribution methods fail to produce the expected model explanations. + 2020.blackboxnlp-1.28 + + + Defining Explanation in an <fixed-case>AI</fixed-case> Context + TejaswaniVerma + ChristophLingenfelder + DietrichKlakow + 314–322 + With the increase in the use of AI systems, a need for explanation systems arises. Building an explanation system requires a definition of explanation. However, the natural language term explanation is difficult to define formally as it includes multiple perspectives from different domains such as psychology, philosophy, and cognitive sciences. We study multiple perspectives and aspects of explainability of recommendations or predictions made by AI systems, and provide a generic definition of explanation. The proposed definition is ambitious and challenging to apply. With the intention to bridge the gap between theory and application, we also propose a possible architecture of an automated explanation system based on our definition of explanation. + 2020.blackboxnlp-1.29 + + + Searching for a Search Method: Benchmarking Search Algorithms for Generating <fixed-case>NLP</fixed-case> Adversarial Examples + Jin YongYoo + JohnMorris + EliLifland + YanjunQi + 323–332 + We study the behavior of several black-box search algorithms used for generating adversarial examples for natural language processing (NLP) tasks. We perform a fine-grained analysis of three elements relevant to search: search algorithm, search space, and search budget. 
When new search algorithms are proposed in past work, the attack search space is often modified alongside the search algorithm. Without ablation studies benchmarking the search algorithm change with the search space held constant, one cannot tell if an increase in attack success rate is a result of an improved search algorithm or a less restrictive search space. Additionally, many previous studies fail to properly consider the search algorithms’ run-time cost, which is essential for downstream tasks like adversarial training. Our experiments provide a reproducible benchmark of search algorithms across a variety of search spaces and query budgets to guide future research in adversarial NLP. Based on our experiments, we recommend greedy attacks with word importance ranking when under a time constraint or attacking long inputs, and either beam search or particle swarm optimization otherwise. + 2020.blackboxnlp-1.30 + 2020.blackboxnlp-1.30.OptionalSupplementaryMaterial.pdf + + + This is a <fixed-case>BERT</fixed-case>. Now there are several of them. Can they generalize to novel words? + ColemanHaley + 333–341 + Recently, large-scale pre-trained neural network models such as BERT have achieved many state-of-the-art results in natural language processing. Recent work has explored the linguistic capacities of these models. However, no work has focused on the ability of these models to generalize these capacities to novel words. This type of generalization is exhibited by humans, and is intimately related to morphology—humans are in many cases able to identify inflections of novel words in the appropriate context. This type of morphological capacity has not been previously tested in BERT models, and is important for morphologically-rich languages, which are under-studied in the literature regarding BERT’s linguistic capacities. 
In this work, we investigate this by considering monolingual and multilingual BERT models’ abilities to agree in number with novel plural words in English, French, German, Spanish, and Dutch. We find that many models are not able to reliably determine plurality of novel words, suggesting potential deficiencies in the morphological capacities of BERT models. + 2020.blackboxnlp-1.31 + + + diag<fixed-case>NN</fixed-case>ose: A Library for Neural Activation Analysis + JaapJumelet + 342–350 + In this paper we introduce diagNNose, an open source library for analysing the activations of deep neural networks. diagNNose contains a wide array of interpretability techniques that provide fundamental insights into the inner workings of neural networks. We demonstrate the functionality of diagNNose with a case study on subject-verb agreement within language models. diagNNose is available at https://github.com/i-machine-think/diagnnose. + 2020.blackboxnlp-1.32 + +
+
diff --git a/data/xml/2020.challengehml.xml b/data/xml/2020.challengehml.xml index 668fc4742b..6b0cab16a2 100644 --- a/data/xml/2020.challengehml.xml +++ b/data/xml/2020.challengehml.xml @@ -55,7 +55,7 @@ Low Rank Fusion based Transformers for Multimodal Sequences SauravSahay EdaOkur - shachiH Kumar + ShachiH Kumar LamaNachman 29–34 Our senses individually work in a coordinated fashion to express our emotional intentions. In this work, we experiment with modeling modality-specific sensory signals to attend to our latent multimodal emotional intentions and vice versa expressed via low-rank multimodal fusion and multimodal transformers. The low-rank factorization of multimodal fusion amongst the modalities helps represent approximate multiplicative latent signal interactions. Motivated by the work of~(CITATION) and~(CITATION), we present our transformer-based cross-fusion architecture without any over-parameterization of the model. The low-rank fusion helps represent the latent signal interactions while the modality-specific attention helps focus on relevant parts of the signal. We present two methods for the Multimodal Sentiment and Emotion Recognition results on CMU-MOSEI, CMU-MOSI, and IEMOCAP datasets and show that our models have lesser parameters, train faster and perform comparably to many larger fusion-based architectures. @@ -88,7 +88,7 @@ Audio-Visual Understanding of Passenger Intents for In-Cabin Conversational Agents EdaOkur - shachiH Kumar + ShachiH Kumar SauravSahay LamaNachman 55–59 @@ -100,7 +100,7 @@ <fixed-case>AI</fixed-case> <fixed-case>S</fixed-case>ensing for Robotics using Deep Learning based Visual and Language Modeling - yuvaramsingh + YuvaramSingh Kameshwar RaoJV 60–63 An artificial intelligence(AI) system should be capable of processing the sensory inputs to extract both task-specific and general information about its environment. However, most of the existing algorithms extract only task specific information. 
In this work, an innovative approach to address the problem of processing visual sensory data is presented by utilizing convolutional neural network (CNN). It recognizes and represents the physical and semantic nature of the surrounding in both human readable and machine processable format. This work utilizes the image captioning model to capture the semantics of the input image and a modular design to generate a probability distribution for semantic topics. It gives any autonomous system the ability to process visual information in a human-like way and generates more insights which are hardly possible with a conventional algorithm. Here a model and data collection method are proposed. diff --git a/data/xml/2020.clinicalnlp.xml b/data/xml/2020.clinicalnlp.xml new file mode 100644 index 0000000000..367f2d6d66 --- /dev/null +++ b/data/xml/2020.clinicalnlp.xml @@ -0,0 +1,385 @@ + + + + + Proceedings of the 3rd Clinical Natural Language Processing Workshop + AnnaRumshisky + KirkRoberts + StevenBethard + TristanNaumann + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.clinicalnlp-1.0 + + + Various Approaches for Predicting Stroke Prognosis using Magnetic Resonance Imaging Text Records + Tak-SungHeo + ChulhoKim + Jeong-MyeongChoi + Yeong-SeokJeong + Yu-SeopKim + 1–6 + Stroke is one of the leading causes of death and disability worldwide. Stroke is treatable, but it is prone to disability after treatment and must be prevented. To grasp the degree of disability caused by stroke, we use magnetic resonance imaging text records to predict stroke and measure the performance according to the document-level and sentence-level representation. As a result of the experiment, the document-level representation shows better performance. + 2020.clinicalnlp-1.1 + + + Multiple Sclerosis Severity Classification From Clinical Text + AlisterD’Costa + StefanDenkovski + MichalMalyska + Sae YoungMoon + BrandonRufino + ZhenYang + TaylorKillian + MarzyehGhassemi + 7–23 + Multiple Sclerosis (MS) is a chronic, inflammatory and degenerative neurological disease, which is monitored by a specialist using the Expanded Disability Status Scale (EDSS) and recorded in unstructured text in the form of a neurology consult note. An EDSS measurement contains an overall ‘EDSS’ score and several functional subscores. Typically, expert knowledge is required to interpret consult notes and generate these scores. Previous approaches used limited context length Word2Vec embeddings and keyword searches to predict scores given a consult note, but often failed when scores were not explicitly stated. In this work, we present MS-BERT, the first publicly available transformer model trained on real clinical data other than MIMIC. Next, we present MSBC, a classifier that applies MS-BERT to generate embeddings and predict EDSS and functional subscores. Lastly, we explore combining MSBC with other models through the use of Snorkel to generate scores for unlabelled consult notes. 
MSBC achieves state-of-the-art performance on all metrics and prediction tasks and outperforms the models generated from the Snorkel ensemble. We improve Macro-F1 by 0.12 (to 0.88) for predicting EDSS and on average by 0.29 (to 0.63) for predicting functional subscores over previous Word2Vec CNN and rule-based approaches. + 2020.clinicalnlp-1.2 + + + <fixed-case>BERT</fixed-case>-<fixed-case>XML</fixed-case>: Large Scale Automated <fixed-case>ICD</fixed-case> Coding Using <fixed-case>BERT</fixed-case> Pretraining + ZachariahZhang + JingshuLiu + NargesRazavian + 24–34 + ICD coding is the task of classifying and coding all diagnoses, symptoms and procedures associated with a patient’s visit. The process is often manual, extremely time-consuming and expensive for hospitals as clinical interactions are usually recorded in free text medical notes. In this paper, we propose a machine learning model, BERT-XML, for large scale automated ICD coding of EHR notes, utilizing recently developed unsupervised pretraining that have achieved state of the art performance on a variety of NLP tasks. We train a BERT model from scratch on EHR notes, learning with vocabulary better suited for EHR tasks and thus outperform off-the-shelf models. We further adapt the BERT architecture for ICD coding with multi-label attention. We demonstrate the effectiveness of BERT-based models on the large scale ICD code classification task using millions of EHR notes to predict thousands of unique codes. + 2020.clinicalnlp-1.3 + + + Incorporating Risk Factor Embeddings in Pre-trained Transformers Improves Sentiment Prediction in Psychiatric Discharge Summaries + XiyuDing + Mei-HuaHall + TimothyMiller + 35–40 + Reducing rates of early hospital readmission has been recognized and identified as a key to improve quality of care and reduce costs. 
There are a number of risk factors that have been hypothesized to be important for understanding re-admission risk, including such factors as problems with substance abuse, ability to maintain work, relations with family. In this work, we develop Roberta-based models to predict the sentiment of sentences describing readmission risk factors in discharge summaries of patients with psychosis. We improve substantially on previous results by a scheme that shares information across risk factors while also allowing the model to learn risk factor-specific information. + 2020.clinicalnlp-1.4 + + + Information Extraction from <fixed-case>S</fixed-case>wedish Medical Prescriptions with Sig-Transformer Encoder + JohnPougué Biyong + BoWang + TerryLyons + AlejoNevado-Holgado + 41–54 + Relying on large pretrained language models such as Bidirectional Encoder Representations from Transformers (BERT) for encoding and adding a simple prediction layer has led to impressive performance in many clinical natural language processing (NLP) tasks. In this work, we present a novel extension to the Transformer architecture, by incorporating signature transform with the self-attention model. This architecture is added between embedding and prediction layers. Experiments on a new Swedish prescription data show the proposed architecture to be superior in two of the three information extraction tasks, comparing to baseline models. Finally, we evaluate two different embedding approaches between applying Multilingual BERT and translating the Swedish text to English then encode with a BERT model pretrained on clinical notes. 
+ 2020.clinicalnlp-1.5 + 2020.clinicalnlp-1.5.OptionalSupplementaryMaterial.zip + + + Evaluation of Transfer Learning for Adverse Drug Event (<fixed-case>ADE</fixed-case>) and Medication Entity Extraction + SankaranNarayanan + KaivalyaMannam + Sreeranga PRajan + P VenkatRangan + 55–64 + We evaluate several biomedical contextual embeddings (based on BERT, ELMo, and Flair) for the detection of medication entities such as Drugs and Adverse Drug Events (ADE) from Electronic Health Records (EHR) using the 2018 ADE and Medication Extraction (Track 2) n2c2 data-set. We identify best practices for transfer learning, such as language-model fine-tuning and scalar mix. Our transfer learning models achieve strong performance in the overall task (F1=92.91%) as well as in ADE identification (F1=53.08%). Flair-based embeddings out-perform in the identification of context-dependent entities such as ADE. BERT-based embeddings out-perform in recognizing clinical terminology such as Drug and Form entities. ELMo-based embeddings deliver competitive performance in all entities. We develop a sentence-augmentation method for enhanced ADE identification benefiting BERT-based and ELMo-based models by up to 3.13% in F1 gains. Finally, we show that a simple ensemble of these models out-paces most current methods in ADE extraction (F1=55.77%). + 2020.clinicalnlp-1.6 + + + <fixed-case>B</fixed-case>io<fixed-case>BERT</fixed-case>pt - A <fixed-case>P</fixed-case>ortuguese Neural Language Model for Clinical Named Entity Recognition + Elisa Terumi RubelSchneider + João Vitor Andriolide Souza + JulienKnafou + Lucas Emanuel Silva eOliveira + JennyCopara + Yohan BonesckiGumiel + Lucas Ferro Antunes deOliveira + Emerson CabreraParaiso + DouglasTeodoro + Cláudia Maria Cabral MoroBarra + 65–72 + With the growing number of electronic health record data, clinical NLP tasks have become increasingly relevant to unlock valuable information from unstructured clinical text. 
Although the performance of downstream NLP tasks, such as named-entity recognition (NER), in English corpus has recently improved by contextualised language models, less research is available for clinical texts in low resource languages. Our goal is to assess a deep contextual embedding model for Portuguese, so called BioBERTpt, to support clinical and biomedical NER. We transfer learned information encoded in a multilingual-BERT model to a corpora of clinical narratives and biomedical-scientific papers in Brazilian Portuguese. To evaluate the performance of BioBERTpt, we ran NER experiments on two annotated corpora containing clinical narratives and compared the results with existing BERT models. Our in-domain model outperformed the baseline model in F1-score by 2.72%, achieving higher performance in 11 out of 13 assessed entities. We demonstrate that enriching contextual embedding models with domain literature can play an important role in improving performance for specific NLP tasks. The transfer learning process enhanced the Portuguese biomedical NER model by reducing the necessity of labeled data and the demand for retraining a whole new model. + 2020.clinicalnlp-1.7 + + + Dilated Convolutional Attention Network for Medical Code Assignment from Clinical Text + ShaoxiongJi + ErikCambria + PekkaMarttinen + 73–78 + Medical code assignment, which predicts medical codes from clinical texts, is a fundamental task of intelligent medical information systems. The emergence of deep models in natural language processing has boosted the development of automatic assignment methods. However, recent advanced neural architectures with flat convolutions or multi-channel feature concatenation ignore the sequential causal constraint within a text sequence and may not learn meaningful clinical text representations, especially for lengthy clinical notes with long-term sequential dependency. 
This paper proposes a Dilated Convolutional Attention Network (DCAN), integrating dilated convolutions, residual connections, and label attention, for medical code assignment. It adopts dilated convolutions to capture complex medical patterns with a receptive field which increases exponentially with dilation size. Experiments on a real-world clinical dataset empirically show that our model improves the state of the art. + 2020.clinicalnlp-1.8 + + + Classification of Syncope Cases in <fixed-case>N</fixed-case>orwegian Medical Records + IldikoPilan + Pål H.Brekke + Fredrik A.Dahl + ToreGundersen + HaldorHusby + ØysteinNytrø + LiljaØvrelid + 79–84 + Loss of consciousness, so-called syncope, is a commonly occurring symptom associated with worse prognosis for a number of heart-related diseases. We present a comparison of methods for a diagnosis classification task in Norwegian clinical notes, targeting syncope, i.e. fainting cases. We find that an often neglected baseline with keyword matching constitutes a rather strong basis, but more advanced methods do offer some improvement in classification performance, especially a convolutional neural network model. The developed pipeline is planned to be used for quantifying unregistered syncope cases in Norway. + 2020.clinicalnlp-1.9 + + + Comparison of Machine Learning Methods for Multi-label Classification of Nursing Education and Licensure Exam Questions + JohnLangton + KrishnaSrihasam + JunlinJiang + 85–93 + In this paper, we evaluate several machine learning methods for multi-label classification of text questions. Every nursing student in the United States must pass the National Council Licensure Examination (NCLEX) to begin professional practice. NCLEX defines a number of competencies on which students are evaluated. By labeling test questions with NCLEX competencies, we can score students according to their performance in each competency. 
This information helps instructors measure how prepared students are for the NCLEX, as well as which competencies they may need help with. A key challenge is that questions may be related to more than one competency. Labeling questions with NCLEX competencies, therefore, equates to a multi-label, text classification problem where each competency is a label. Here we present an evaluation of several methods to support this use case along with a proposed approach. While our work is grounded in the nursing education domain, the methods described here can be used for any multi-label, text classification use case. + 2020.clinicalnlp-1.10 + + + Clinical <fixed-case>XLN</fixed-case>et: Modeling Sequential Clinical Notes and Predicting Prolonged Mechanical Ventilation + KexinHuang + AbhishekSingh + SitongChen + EdwardMoseley + Chih-YingDeng + NaomiGeorge + CharolottaLindvall + 94–100 + Clinical notes contain rich information, which is relatively unexploited in predictive modeling compared to structured data. In this work, we developed a new clinical text representation Clinical XLNet that leverages the temporal information of the sequence of the notes. We evaluated our models on prolonged mechanical ventilation prediction problem and our experiments demonstrated that Clinical XLNet outperforms the best baselines consistently. The models and scripts are made publicly available. + 2020.clinicalnlp-1.11 + 2020.clinicalnlp-1.11.OptionalSupplementaryMaterial.pdf + + + Automatic recognition of abdominal lymph nodes from clinical text + YifanPeng + SungwonLee + Daniel C.Elton + ThomasShen + Yu-xingTang + QingyuChen + ShuaiWang + YingyingZhu + RonaldSummers + ZhiyongLu + 101–110 + Lymph node status plays a pivotal role in the treatment of cancer. The extraction of lymph nodes from radiology text reports enables large-scale training of lymph node detection on MRI. In this work, we first propose an ontology of 41 types of abdominal lymph nodes with a hierarchical relationship. 
We then introduce an end-to-end approach based on the combination of rules and transformer-based methods to detect these abdominal lymph node mentions and classify their types from the MRI radiology reports. We demonstrate the superior performance of a model fine-tuned on MRI reports using BlueBERT, called MriBERT. We find that MriBERT outperforms the rule-based labeler (0.957 vs 0.644 in micro weighted F1-score) as well as other BERT-based variations (0.913 - 0.928). We make the code and MriBERT publicly available at https://github.com/ncbi-nlp/bluebert, with the hope that this method can facilitate the development of medical report annotators to produce labels from scratch at scale. + 2020.clinicalnlp-1.12 + + + How You Ask Matters: The Effect of Paraphrastic Questions to <fixed-case>BERT</fixed-case> Performance on a Clinical <fixed-case>SQ</fixed-case>u<fixed-case>AD</fixed-case> Dataset + Sungrim (Riea)Moon + JungweiFan + 111–116 + Reading comprehension style question-answering (QA) based on patient-specific documents represents a growing area in clinical NLP with plentiful applications. Bidirectional Encoder Representations from Transformers (BERT) and its derivatives lead the state-of-the-art accuracy on the task, but most evaluation has treated the data as a pre-mixture without systematically looking into the potential effect of imperfect train/test questions. The current study seeks to address this gap by experimenting with full versus partial train/test data consisting of paraphrastic questions. Our key findings include 1) training with all pooled question variants yielded best accuracy, 2) the accuracy varied widely, from 0.74 to 0.80, when trained with each single question variant, and 3) questions of similar lexical/syntactic structure tended to induce identical answers. The results suggest that how you ask questions matters in BERT-based QA, especially at the training stage. 
+ 2020.clinicalnlp-1.13 + + + Relative and Incomplete Time Expression Anchoring for Clinical Text + LouiseDupuis + NicolBergou + HeglerTissot + SumithraVelupillai + 117–129 + Extracting and modeling temporal information in clinical text is an important element for developing timelines and disease trajectories. Time information in written text varies in preciseness and explicitness, posing challenges for NLP approaches that aim to accurately anchor temporal information on a timeline. Relative and incomplete time expressions (RI-Timexes) are expressions that require additional information for their temporal anchor to be resolved, but few studies have addressed this challenge specifically. In this study, we aimed to reproduce and verify a classification approach for identifying anchor dates and relations in clinical text, and propose a novel relation classification approach for this task. + 2020.clinicalnlp-1.14 + + + <fixed-case>M</fixed-case>e<fixed-case>DAL</fixed-case>: Medical Abbreviation Disambiguation Dataset for Natural Language Understanding Pretraining + ZhiWen + Xing HanLu + SivaReddy + 130–135 + One of the biggest challenges that prohibit the use of many current NLP methods in clinical settings is the availability of public datasets. In this work, we present MeDAL, a large medical text dataset curated for abbreviation disambiguation, designed for natural language understanding pre-training in the medical domain. We pre-trained several models of common architectures on this dataset and empirically showed that such pre-training leads to improved performance and convergence speed when fine-tuning on downstream medical tasks. + 2020.clinicalnlp-1.15 + 2020.clinicalnlp-1.15.OptionalSupplementaryMaterial.zip + + + Knowledge Grounded Conversational Symptom Detection with Graph Memory Networks + HongyinLuo + Shang-WenLi + JamesGlass + 136–145 + In this work, we propose a novel goal-oriented dialog task, automatic symptom detection. 
We build a system that can interact with patients through dialog to detect and collect clinical symptoms automatically, which can save a doctor’s time interviewing the patient. Given a set of explicit symptoms provided by the patient to initiate a dialog for diagnosing, the system is trained to collect implicit symptoms by asking questions, in order to collect more information for making an accurate diagnosis. After getting the reply from the patient for each question, the system also decides whether current information is enough for a human doctor to make a diagnosis. To achieve this goal, we propose two neural models and a training pipeline for the multi-step reasoning task. We also build a knowledge graph as additional inputs to further improve model performance. Experiments show that our model significantly outperforms the baseline by 4%, discovering 67% of implicit symptoms on average with a limited number of questions. + 2020.clinicalnlp-1.16 + + + Pretrained Language Models for Biomedical and Clinical Tasks: Understanding and Extending the State-of-the-Art + PatrickLewis + MyleOtt + JingfeiDu + VeselinStoyanov + 146–157 + A large array of pretrained models are available to the biomedical NLP (BioNLP) community. Finding the best model for a particular task can be difficult and time-consuming. For many applications in the biomedical and clinical domains, it is crucial that models can be built quickly and are highly accurate. We present a large-scale study across 18 established biomedical and clinical NLP tasks to determine which of several popular open-source biomedical and clinical NLP models work well in different settings. Furthermore, we apply recent advances in pretraining to train new biomedical language models, and carefully investigate the effect of various design choices on downstream performance. Our best models perform well in all of our benchmarks, and set new State-of-the-Art in 9 tasks. 
We release these models in the hope that they can help the community to speed up and increase the accuracy of BioNLP and text mining applications. + 2020.clinicalnlp-1.17 + 2020.clinicalnlp-1.17.OptionalSupplementaryMaterial.pdf + + + Assessment of <fixed-case>D</fixed-case>istil<fixed-case>BERT</fixed-case> performance on Named Entity Recognition task for the detection of Protected Health Information and medical concepts + MacariousAbadeer + 158–167 + Bidirectional Encoder Representations from Transformers (BERT) models achieve state-of-the-art performance on a number of Natural Language Processing tasks. However, their model size on disk often exceeds 1 GB and the process of fine-tuning them and using them to run inference consumes significant hardware resources and runtime. This makes them hard to deploy to production environments. This paper fine-tunes DistilBERT, a lightweight deep learning model, on medical text for the named entity recognition task of Protected Health Information (PHI) and medical concepts. This work provides a full assessment of the performance of DistilBERT in comparison with BERT models that were pre-trained on medical text. For Named Entity Recognition task of PHI, DistilBERT achieved almost the same results as medical versions of BERT in terms of F1 score at almost half the runtime and consuming approximately half the disk space. On the other hand, for the detection of medical concepts, DistilBERT’s F1 score was lower by 4 points on average than medical BERT variants. 
+ 2020.clinicalnlp-1.18 + + + Distinguishing between Dementia with Lewy bodies (<fixed-case>DLB</fixed-case>) and <fixed-case>A</fixed-case>lzheimer’s Disease (<fixed-case>AD</fixed-case>) using Mental Health Records: a Classification Approach + ZixuWang + JuliaIve + SineadMoylett + ChristophMueller + RudolfCardinal + SumithraVelupillai + JohnO’Brien + RobertStewart + 168–177 + While Dementia with Lewy Bodies (DLB) is the second most common type of neurodegenerative dementia following Alzheimer’s Disease (AD), it is difficult to distinguish from AD. We propose a method for DLB detection by using mental health record (MHR) documents from a (3-month) period before a patient has been diagnosed with DLB or AD. Our objective is to develop a model that could be clinically useful to differentiate between DLB and AD across datasets from different healthcare institutions. We cast this as a classification task using Convolutional Neural Network (CNN), an efficient neural model for text classification. We experiment with different representation models, and explore the features that contribute to model performances. In addition, we apply temperature scaling, a simple but efficient model calibration method, to produce more reliable predictions. We believe the proposed method has important potential for clinical applications using routine healthcare records, and for generalising to other relevant clinical record datasets. To the best of our knowledge, this is the first attempt to distinguish DLB from AD using mental health records, and to improve the reliability of DLB predictions. + 2020.clinicalnlp-1.19 + + + Weakly Supervised Medication Regimen Extraction from Medical Conversations + DhruveshPatel + SandeepKonam + SaiPrabhakar + 178–193 + Automated Medication Regimen (MR) extraction from medical conversations can not only improve recall and help patients follow through with their care plan, but also reduce the documentation burden for doctors. 
In this paper, we focus on extracting spans for frequency, route and change, corresponding to medications discussed in the conversation. We first describe a unique dataset of annotated doctor-patient conversations and then present a weakly supervised model architecture that can perform span extraction using noisy classification data. The model utilizes an attention bottleneck inside a classification model to perform the extraction. We experiment with several variants of attention scoring and projection functions and propose a novel transformer-based attention scoring function (TAScore). The proposed combination of TAScore and Fusedmax projection achieves a 10 point increase in Longest Common Substring F1 compared to the baseline of additive scoring plus softmax projection. + 2020.clinicalnlp-1.20 + + + Extracting Relations between Radiotherapy Treatment Details + DanielleBitterman + TimothyMiller + DavidHarris + ChenLin + SeanFinan + JeremyWarner + RaymondMak + GuerganaSavova + 194–200 + We present work on extraction of radiotherapy treatment information from the clinical narrative in the electronic medical records. Radiotherapy is a central component of the treatment of most solid cancers. Its details are described in non-standardized fashions using jargon not found in other medical specialties, complicating the already difficult task of manual data extraction. We examine the performance of several state-of-the-art neural methods for relation extraction of radiotherapy treatment details, with a goal of automating detailed information extraction. The neural systems perform at 0.82-0.88 macro-average F1, which approximates or in some cases exceeds the inter-annotator agreement. To the best of our knowledge, this is the first effort to develop models for radiotherapy relation extraction and one of the few efforts for relation extraction to describe cancer treatment in general. 
+ 2020.clinicalnlp-1.21 + + + Cancer Registry Information Extraction via Transfer Learning + Yan-JieLin + Hong-JieDai + You-ChenZhang + Chung-YangWu + Yu-ChengChang + Pin-JouLu + Chih-JenHuang + Yu-TsangWang + Hui-MinHsieh + Kun-SanChao + Tsang-WuLiu + I-ShouChang + Yi-Hsin ConnieYang + Ti-HaoWang + Ko-JiunnLiu + Li-TzongChen + Sheau-FangYang + 201–208 + A cancer registry is a critical and massive database for which various types of domain knowledge are needed and whose maintenance requires labor-intensive data curation. In order to facilitate the curation process for building a high-quality and integrated cancer registry database, we compiled a cross-hospital corpus and applied neural network methods to develop a natural language processing system for extracting cancer registry variables buried in unstructured pathology reports. The performance of the developed networks was compared with various baselines using standard micro-precision, recall and F-measure. Furthermore, we conducted experiments to study the feasibility of applying transfer learning to rapidly develop a well-performing system for processing reports from different sources that might be presented in different writing styles and formats. The results demonstrate that the transfer learning method enables us to develop a satisfactory system for a new hospital with only a few annotations and suggest more opportunities to reduce the burden of cancer registry curation. + 2020.clinicalnlp-1.22 + + + <fixed-case>PHICON</fixed-case>: Improving Generalization of Clinical Text De-identification Models via Data Augmentation + XiangYue + ShuangZhou + 209–214 + De-identification is the task of identifying protected health information (PHI) in the clinical text. Existing neural de-identification models often fail to generalize to a new dataset. We propose a simple yet effective data augmentation method PHICON to alleviate the generalization issue. 
PHICON consists of PHI augmentation and Context augmentation, which creates augmented training corpora by replacing PHI entities with named-entities sampled from external sources, and by changing background context with synonym replacement or random word insertion, respectively. Experimental results on the i2b2 2006 and 2014 de-identification challenge datasets show that PHICON can help three selected de-identification models boost F1-score (by at most 8.6%) on cross-dataset test setting. We also discuss how much augmentation to use and how each augmentation method influences the performance. + 2020.clinicalnlp-1.23 + + + Where’s the Question? A Multi-channel Deep Convolutional Neural Network for Question Identification in Textual Data + GeorgeMichalopoulos + HelenChen + AlexanderWong + 215–226 + In most clinical practice settings, there is no rigorous reviewing of the clinical documentation, resulting in inaccurate information captured in the patient medical records. The gold standard in clinical data capturing is achieved via “expert-review”, where clinicians can have a dialogue with a domain expert (reviewers) and ask them questions about data entry rules. Automatically identifying “real questions” in these dialogues could uncover ambiguities or common problems in data capturing in a given clinical setting. In this study, we proposed a novel multi-channel deep convolutional neural network architecture, namely Quest-CNN, for the purpose of separating real questions that expect an answer (information or help) about an issue from sentences that are not questions, as well as from questions referring to an issue mentioned in a nearby sentence (e.g., can you clarify this?), which we will refer as “c-questions”. We conducted a comprehensive performance comparison analysis of the proposed multi-channel deep convolutional neural network against other deep neural networks. 
Furthermore, we evaluated the performance of traditional rule-based and learning-based methods for detecting question sentences. The proposed Quest-CNN achieved the best F1 score both on a dataset of data entry-review dialogue in a dialysis care setting, and on a general domain dataset. + 2020.clinicalnlp-1.24 + + + Learning from Unlabelled Data for Clinical Semantic Textual Similarity + YuxiaWang + KarinVerspoor + TimothyBaldwin + 227–233 + Domain pretraining followed by task fine-tuning has become the standard paradigm for NLP tasks, but requires in-domain labelled data for task fine-tuning. To overcome this, we propose to utilise domain unlabelled data by assigning pseudo labels from a general model. We evaluate the approach on two clinical STS datasets, and achieve r= 0.80 on N2C2-STS. Further investigation reveals that if the data distribution of unlabelled sentence pairs is closer to the test data, we can obtain better performance. By leveraging a large general-purpose STS dataset and small-scale in-domain training data, we obtain further improvements to r= 0.90, a new SOTA. + 2020.clinicalnlp-1.25 + + + Joint Learning with Pre-trained Transformer on Named Entity Recognition and Relation Extraction Tasks for Clinical Analytics + MiaoChen + GanhuiLan + FangDu + VictorLobanov + 234–242 + In drug development, protocols define how clinical trials are conducted, and are therefore of paramount importance. They contain key patient-, investigator-, medication-, and study-related information, often elaborated in different sections in the protocol texts. Granular-level parsing on large quantity of existing protocols can accelerate clinical trial design and provide actionable insights into trial optimization. Here, we report our progresses in using deep learning NLP algorithms to enable automated protocol analytics. In particular, we combined a pre-trained BERT transformer model with joint-learning strategies to simultaneously identify clinically relevant entities (i.e. 
Named Entity Recognition) and extract the syntactic relations between these entities (i.e. Relation Extraction) from the eligibility criteria section in protocol texts. When comparing to standalone NER and RE models, our joint-learning strategy can effectively improve the performance of RE task while retaining similarly high NER performance, likely due to the synergy of optimizing toward both tasks’ objectives via shared parameters. The derived NLP model provides an end-to-end solution to convert unstructured protocol texts into structured data source, which will be embedded into a comprehensive clinical analytics workflow for downstream trial design missions such like patient population extraction, patient enrollment rate estimation, and protocol amendment prediction. + 2020.clinicalnlp-1.26 + + + Extracting Semantic Aspects for Structured Representation of Clinical Trial Eligibility Criteria + TirthankarDasgupta + IshaniMondal + AbirNaskar + LipikaDey + 243–248 + Eligibility criteria in the clinical trials specify the characteristics that a patient must or must not possess in order to be treated according to a standard clinical care guideline. As the process of manual eligibility determination is time-consuming, automatic structuring of the eligibility criteria into various semantic categories or aspects is the need of the hour. Existing methods use hand-crafted rules and feature-based statistical machine learning methods to dynamically induce semantic aspects. However, in order to deal with paucity of aspect-annotated clinical trials data, we propose a novel weakly-supervised co-training based method which can exploit a large pool of unlabeled criteria sentences to augment the limited supervised training data, and consequently enhance the performance. Experiments with 0.2M criteria sentences show that the proposed approach outperforms the competitive supervised baselines by 12% in terms of micro-averaged F1 score for all the aspects. 
Probing deeper into analysis, we observe domain-specific information boosts up the performance by a significant margin. + 2020.clinicalnlp-1.27 + + + An Ensemble Approach to Automatic Structuring of Radiology Reports + MortezaPourreza Shahri + AmirTahmasebi + BingyangYe + HenghuiZhu + JavedAslam + TimothyFerris + 249–258 + Automatic structuring of electronic medical records is of high demand for clinical workflow solutions to facilitate extraction, storage, and querying of patient care information. However, developing a scalable solution is extremely challenging, specifically for radiology reports, as most healthcare institutes use either no template or department/institute specific templates. Moreover, radiologists’ reporting style varies from one to another as sentences are written in a telegraphic format and do not follow general English grammar rules. In this work, we present an ensemble method that consolidates the predictions of three models, capturing various attributes of textual information for automatic labeling of sentences with section labels. These three models are: 1) Focus Sentence model, capturing context of the target sentence; 2) Surrounding Context model, capturing the neighboring context of the target sentence; and finally, 3) Formatting/Layout model, aimed at learning report formatting cues. We utilize Bi-directional LSTMs, followed by sentence encoders, to acquire the context. Furthermore, we define several features that incorporate the structure of reports. We compare our proposed approach against multiple baselines and state-of-the-art approaches on a proprietary dataset as well as 100 manually annotated radiology notes from the MIMIC-III dataset, which we are making publicly available. Our proposed approach significantly outperforms other approaches by achieving 97.1% accuracy. 
+ 2020.clinicalnlp-1.28 + + + Utilizing Multimodal Feature Consistency to Detect Adversarial Examples on Clinical Summaries + WenjieWang + YoungjaPark + TaesungLee + IanMolloy + PengfeiTang + LiXiong + 259–268 + Recent studies have shown that adversarial examples can be generated by applying small perturbations to the inputs such that the well-trained deep learning models will misclassify. With the increasing number of safety and security-sensitive applications of deep learning models, the robustness of deep learning models has become a crucial topic. The robustness of deep learning models for healthcare applications is especially critical because the unique characteristics and the high financial interests of the medical domain make it more sensitive to adversarial attacks. Among the modalities of medical data, the clinical summaries have higher risks to be attacked because they are generated by third-party companies. As few works studied adversarial threats on clinical summaries, in this work we first apply adversarial attack to clinical summaries of electronic health records (EHR) to show the text-based deep learning systems are vulnerable to adversarial examples. Secondly, benefiting from the multi-modality of the EHR dataset, we propose a novel defense method, MATCH (Multimodal feATure Consistency cHeck), which leverages the consistency between multiple modalities in the data to defend against adversarial examples on a single modality. Our experiments demonstrate the effectiveness of MATCH on a hospital readmission prediction task comparing with baseline methods. + 2020.clinicalnlp-1.29 + + + Advancing Seq2seq with Joint Paraphrase Learning + So YeonMin + PreethiRaghavan + PeterSzolovits + 269–279 + We address the problem of model generalization for sequence to sequence (seq2seq) architectures. 
We propose going beyond data augmentation via paraphrase-optimized multi-task learning and observe that it is useful in correctly handling unseen sentential paraphrases as inputs. Our models greatly outperform SOTA seq2seq models for semantic parsing on diverse domains (Overnight - up to 3.2% and emrQA - 7%) and Nematus, the winning solution for WMT 2017, for Czech to English translation (CzENG 1.6 - 1.5 BLEU). + 2020.clinicalnlp-1.30 + + + On the diminishing return of labeling clinical reports + Jean-BaptisteLamare + OloruntobilobaOlatunji + LiYao + 280–290 + Ample evidence suggests that better machine learning models may be steadily obtained by training on increasingly larger datasets on natural language processing (NLP) problems from non-medical domains. Whether the same holds true for medical NLP has by far not been thoroughly investigated. This work shows that this is indeed not always the case. We reveal the somehow counter-intuitive observation that performant medical NLP models may be obtained with small amount of labeled data, quite the opposite to the common belief, most likely due to the domain specificity of the problem. We show quantitatively the effect of training data size on a fixed test set composed of two of the largest public chest x-ray radiology report datasets on the task of abnormality classification. The trained models not only make use of the training data efficiently, but also outperform the current state-of-the-art rule-based systems by a significant margin. + 2020.clinicalnlp-1.31 + + + The <fixed-case>C</fixed-case>hilean Waiting List Corpus: a new resource for clinical Named Entity Recognition in <fixed-case>S</fixed-case>panish + PabloBáez + FabiánVillena + MatíasRojas + ManuelDurán + JocelynDunstan + 291–300 + In this work we describe the Waiting List Corpus consisting of de-identified referrals for several specialty consultations from the waiting list in Chilean public hospitals. 
A subset of 900 referrals was manually annotated with 9,029 entities, 385 attributes, and 284 pairs of relations with clinical relevance. A trained medical doctor annotated these referrals, and then together with other three researchers, consolidated each of the annotations. The annotated corpus has nested entities, with 32.2% of entities embedded in other entities. We use this annotated corpus to obtain preliminary results for Named Entity Recognition (NER). The best results were achieved by using a biLSTM-CRF architecture using word embeddings trained over Spanish Wikipedia together with clinical embeddings computed by the group. NER models applied to this corpus can leverage statistics of diseases and pending procedures within this waiting list. This work constitutes the first annotated corpus using clinical narratives from Chile, and one of the few for the Spanish language. The annotated corpus, the clinical word embeddings, and the annotation guidelines are freely released to the research community. + 2020.clinicalnlp-1.32 + + + Analyzing Text Specific vs Blackbox Fairness Algorithms in Multimodal Clinical <fixed-case>NLP</fixed-case> + JohnChen + IanBerlot-Attwell + XindiWang + SafwanHossain + FrankRudzicz + 301–312 + Clinical machine learning is increasingly multimodal, collected in both structured tabular formats and unstructured forms such as free text. We propose a novel task of exploring fairness on a multimodal clinical dataset, adopting equalized odds for the downstream medical prediction tasks. To this end, we investigate a modality-agnostic fairness algorithm - equalized odds post processing - and compare it to a text-specific fairness algorithm: debiased clinical word embeddings. Despite the fact that debiased word embeddings do not explicitly address equalized odds of protected groups, we show that a text-specific approach to fairness may simultaneously achieve a good balance of performance classical notions of fairness. 
Our work opens the door for future work at the critical intersection of clinical NLP and fairness. + 2020.clinicalnlp-1.33 + +
+
diff --git a/data/xml/2020.cmcl.xml b/data/xml/2020.cmcl.xml new file mode 100644 index 0000000000..bef0d65986 --- /dev/null +++ b/data/xml/2020.cmcl.xml @@ -0,0 +1,108 @@ + + + + + Proceedings of the Workshop on Cognitive Modeling and Computational Linguistics + EmmanueleChersoni + CassandraJacobs + YoheiOseki + LaurentPrévot + EnricoSantus + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.cmcl-1.0 + + + What Determines the Order of Verbal Dependents in <fixed-case>H</fixed-case>indi? Effects of Efficiency in Comprehension and Production + KartikSharma + RichardFutrell + SamarHusain + 1–10 + Word order flexibility is one of the distinctive features of SOV languages. In this work, we investigate whether the order and relative distance of preverbal dependents in Hindi, an SOV language, is affected by factors motivated by efficiency considerations during comprehension/production. We investigate the influence of Head–Dependent Mutual Information (HDMI), similarity-based interference, accessibility and case-marking. Results show that preverbal dependents remain close to the verbal head when the HDMI between the verb and its dependent is high. This demonstrates the influence of locality constraints on dependency distance and word order in an SOV language. Additionally, dependency distance were found to be longer when the dependent was animate, when it was case-marked and when it was semantically similar to other preverbal dependents. Together the results highlight the crosslinguistic generalizability of these factors and provide evidence for a functionally motivated account of word order in SOV languages such as Hindi. + 2020.cmcl-1.1 + + + Images and Imagination: Automated Analysis of Priming Effects Related to Autism Spectrum Disorder and Developmental Language Disorder + MichaelaRegneri + DianeKing + FahreenWalji + OlympiaPalikara + 11–27 + Different aspects of language processing have been shown to be sensitive to priming but the findings of studies examining priming effects in adolescents with Autism Spectrum Disorder (ASD) and Developmental Language Disorder (DLD) have been inconclusive. We present a study analysing visual and implicit semantic priming in adolescents with ASD and DLD. 
Based on a dataset of fictional and script-like narratives, we evaluate how often and how extensively, content of two different priming sources is used by the participants. The first priming source was visual, consisting of images shown to the participants to assist them with their storytelling. The second priming source originated from commonsense knowledge, using crowdsourced data containing prototypical script elements. Our results show that individuals with ASD are less sensitive to both types of priming, but show typical usage of primed cues when they use them at all. In contrast, children with DLD show mostly average priming sensitivity, but exhibit an over-proportional use of the priming cues. + 2020.cmcl-1.2 + + + Production-based Cognitive Models as a Test Suite for Reinforcement Learning Algorithms + AdrianBrasoveanu + JakubDotlacil + 28–37 + We introduce a framework in which production-rule based computational cognitive modeling and Reinforcement Learning can systematically interact and inform each other. We focus on linguistic applications because the sophisticated rule-based cognitive models needed to capture linguistic behavioral data promise to provide a stringent test suite for RL algorithms, connecting RL algorithms to both accuracy and reaction-time experimental data. Thus, we open a path towards assembling an experimentally rigorous and cognitively realistic benchmark for RL algorithms. We extend our previous work on lexical decision tasks and tabular RL algorithms (Brasoveanu and Dotlačil, 2020b) with a discussion of neural-network based approaches, and a discussion of how parsing can be formalized as an RL problem. 
+ 2020.cmcl-1.3 + + + Evaluating Word Embeddings for Language Acquisition + Raquel G.Alhama + CarolineRowland + EvanKidd + 38–42 + Continuous vector word representations (or word embeddings) have shown success in capturing semantic relations between words, as evidenced with evaluation against behavioral data of adult performance on semantic tasks (Pereira et al. 2016). Adult semantic knowledge is the endpoint of a language acquisition process; thus, a relevant question is whether these models can also capture emerging word representations of young language learners. However, the data of semantic knowledge of children is scarce or non-existent for some age groups. In this paper, we propose to bridge this gap by using Age of Acquisition norms to evaluate word embeddings learnt from child-directed input. We present two methods that evaluate word embeddings in terms of (a) the semantic neighbourhood density of learnt words, and (b) the convergence to adult word associations. We apply our methods to bag-of-words models, and we find that (1) children acquire words with fewer semantic neighbours earlier, and (2) young learners only attend to very local context. These findings provide converging evidence for validity of our methods in understanding the prerequisite features for a distributional model of word learning. + 2020.cmcl-1.4 + + + Guessing the Age of Acquisition of <fixed-case>I</fixed-case>talian Lemmas through Linear Regression + IreneRusso + 43–48 + The age of acquisition of a word is a psycholinguistic variable concerning the age at which a word is typically learned. It correlates with other psycholinguistic variables such as familiarity, concreteness, and imageability. Existing datasets for multiple languages also include linguistic variables such as the length and the frequency of lemmas in different corpora. There are substantial sets of normative values for English, but for other languages, such as Italian, the coverage is scarce. 
In this paper, a set of regression experiments investigates whether it is possible to guess the age of acquisition of Italian lemmas that have not been previously rated by humans. An intrinsic evaluation is proposed, correlating estimated Italian lemmas’ AoA with English lemmas’ AoA. An extrinsic evaluation - using AoA values as features for the classification of literary excerpts labeled by age appropriateness - shows how essential is lexical coverage for this task. + 2020.cmcl-1.5 + + + Word Co-occurrence in Child-directed Speech Predicts Children’s Free Word Associations + AbdellahFourtassi + 49–53 + The free association task has been very influential both in cognitive science and in computational linguistics. However, little research has been done to study how free associations develop in childhood. The current work focuses on the developmental hypothesis according to which free word associations emerge by mirroring the co-occurrence distribution of children’s linguistic environment. I trained a distributional semantic model on a large corpus of child language and I tested if it could predict children’s responses. The results largely supported the hypothesis: Co-occurrence-based similarity was a strong predictor of children’s associative behavior even controlling for other possible predictors such as phonological similarity, word frequency, and word length. I discuss the findings in the light of theories of conceptual development. + 2020.cmcl-1.6 + + + Development of Multi-level Linguistic Alignment in Child-adult Conversations + ThomasMisiek + BenoitFavre + AbdellahFourtassi + 54–58 + Interactive alignment is a major mechanism of linguistic coordination. Here we study the way this mechanism emerges in development across the lexical, syntactic, and conceptual levels. We leverage NLP tools to analyze a large-scale corpus of child-adult conversations between 2 and 5 years old. 
We found that, across development, children align consistently to adults above chance and that adults align consistently more to children than vice versa (even controlling for language production abilities). Besides these consistencies, we found a diversity of developmental trajectories across linguistic levels. These corpus-based findings provide strong support for an early onset of multi-level linguistic alignment in children and invites new experimental work. + 2020.cmcl-1.7 + + + Conditioning, but on Which Distribution? Grammatical Gender in <fixed-case>G</fixed-case>erman Plural Inflection + KateMcCurdy + AdamLopez + SharonGoldwater + 59–65 + Grammatical gender is a consistent and informative cue to the plural class of German nouns. We find that neural encoder-decoder models learn to rely on this cue to predict plural class, but adult speakers are relatively insensitive to it. This suggests that the neural models are not an effective cognitive model of German plural formation. + 2020.cmcl-1.8 + + + Learning Pronoun Case from Distributional Cues: Flexible Frames for Case Acquisition + XiaomengMa + MartinChodorow + VirginiaValian + 66–74 + Case is an abstract grammatical feature that indicates argument relationship in a sentence. In English, cases are expressed on pronouns, as nominative case (e.g. I, he), accusative case (e.g. me, him) and genitive case (e.g. my, his). Children correctly use cased pronouns at a very young age. How do they acquire abstract case in the first place, when different cases are not associated with different meanings? This paper proposes that the distributional patterns in parents’ input could be used to distinguish grammatical cases in English. 
+ 2020.cmcl-1.9 + + + Probabilistic Predictions of People Perusing: Evaluating Metrics of Language Model Performance for Psycholinguistic Modeling + YidingHao + SimonMendelsohn + RachelSterneck + RandiMartinez + RobertFrank + 75–86 + By positing a relationship between naturalistic reading times and information-theoretic surprisal, surprisal theory (Hale, 2001; Levy, 2008) provides a natural interface between language models and psycholinguistic models. This paper re-evaluates a claim due to Goodkind and Bicknell (2018) that a language model’s ability to model reading times is a linear function of its perplexity. By extending Goodkind and Bicknell’s analysis to modern neural architectures, we show that the proposed relation does not always hold for Long Short-Term Memory networks, Transformers, and pre-trained models. We introduce an alternate measure of language modeling performance called predictability norm correlation based on Cloze probabilities measured from human subjects. Our new metric yields a more robust relationship between language model quality and psycholinguistic modeling performance that allows for comparison between models with different training configurations. + 2020.cmcl-1.10 + +
+
diff --git a/data/xml/2020.codi.xml b/data/xml/2020.codi.xml new file mode 100644 index 0000000000..3a467bb641 --- /dev/null +++ b/data/xml/2020.codi.xml @@ -0,0 +1,165 @@ + + + + + Proceedings of the First Workshop on Computational Approaches to Discourse + ChloéBraud + ChristianHardmeier + Junyi JessyLi + AnnieLouis + MichaelStrube + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.codi-1.0 + + + How does discourse affect <fixed-case>S</fixed-case>panish-<fixed-case>C</fixed-case>hinese Translation? A case study based on a <fixed-case>S</fixed-case>panish-<fixed-case>C</fixed-case>hinese parallel corpus + ShuyuanCao + 1–10 + With their huge speaking populations in the world, Spanish and Chinese occupy important positions in linguistic studies. Since the two languages come from different language systems, the translation between Spanish and Chinese is complicated. A comparative study for the language pair can discover the discourse differences between Spanish and Chinese, and can benefit the Spanish-Chinese translation. In this work, based on a Spanish-Chinese parallel corpus annotated with discourse information, we compare the annotation results between the language pair and analyze how discourse affects Spanish-Chinese translation. The research results in our study can help human translators who work with the language pair. + 2020.codi-1.1 + + + Beyond Adjacency Pairs: Extracting Longer Regularities in Human-Machine Dialogues + MaitreyeeMaitreyee + 11–19 + This work proposes a framework to predict sequences in dialogues, using turn based syntactic features and dialogue control functions. Syntactic features were extracted using dependency parsing, while dialogue control functions were manually labelled. These features were transformed using tf-idf and word embedding; feature selection was done using Principal Component Analysis (PCA). We ran experiments on six combinations of features to predict sequences with Hierarchical Agglomerative Clustering. An analysis of the clustering results indicate that using word embeddings and syntactic features, significantly improved the results. + 2020.codi-1.2 + + + Using Type Information to Improve Entity Coreference Resolution + SopanKhosla + CarolynRose + 20–31 + Coreference resolution (CR) is an essential part of discourse analysis. 
Most recently, neural approaches have been proposed to improve over SOTA models from earlier paradigms. So far none of the published neural models leverage external semantic knowledge such as type information. This paper offers the first such model and evaluation, demonstrating modest gains in accuracy by introducing either gold standard or predicted types. In the proposed approach, type information serves both to (1) improve mention representation and (2) create a soft type consistency check between coreference candidate mentions. Our evaluation covers two different grain sizes of types over four different benchmark corpora. + 2020.codi-1.3 + + + Exploring Span Representations in Neural Coreference Resolution + PatrickKahardipraja + OlenaVyshnevska + SharidLoáiciga + 32–41 + In coreference resolution, span representations play a key role to predict coreference links accurately. We present a thorough examination of the span representation derived by applying BERT on coreference resolution (Joshi et al., 2019) using a probing model. Our results show that the span representation is able to encode a significant amount of coreference information. In addition, we find that the head-finding attention mechanism involved in creating the spans is crucial in encoding coreference knowledge. Last, our analysis shows that the span representation cannot capture non-local coreference as efficiently as local coreference. + 2020.codi-1.4 + + + Supporting Comedy Writers: Predicting Audience’s Response from Sketch Comedy and Crosstalk Scripts + MaolinLi + 42–52 + Sketch comedy and crosstalk are two popular types of comedy. They can relieve people’s stress and thus benefit their mental health, especially when performances and scripts are high-quality. However, writing a script is time-consuming and its quality is difficult to achieve. 
In order to minimise the time and effort needed for producing an excellent script, we explore ways of predicting the audience’s response from the comedy scripts. For this task, we present a corpus of annotated scripts from popular television entertainment programmes in recent years. Annotations include a) text classification labels, indicating which actor’s lines made the studio audience laugh; b) information extraction labels, i.e. the text spans that made the audience laugh immediately after the performers said them. The corpus will also be useful for dialogue systems and discourse analysis, since our annotations are based on entire scripts. In addition, we evaluate different baseline algorithms. Experimental results demonstrate that BERT models can achieve the best predictions among all the baseline methods. Furthermore, we conduct an error analysis and investigate predictions across scripts with different styles. + 2020.codi-1.5 + + + Exploring Coreference Features in Heterogeneous Data with Text Classification + EkaterinaLapshinova-Koltunski + KerstinKunz + 53–64 + The present paper focuses on variation phenomena in coreference chains. We address the hypothesis that the degree of structural variation between chain elements depends on language-specific constraints and preferences and, even more, on the communicative situation of language production. We define coreference features that also include reference to abstract entities and events. These features are inspired through several sources – cognitive parameters, pragmatic factors and typological status. We pay attention to the distributions of these features in a dataset containing English and German texts of spoken and written discourse mode, which can be classified into seven different registers. We apply text classification and feature selection to find out how these variational dimensions (language, mode and register) impact on coreference features. 
Knowledge on the variation under analysis is valuable for contrastive linguistics, translation studies and multilingual natural language processing (NLP), e.g. machine translation or cross-lingual coreference resolution. + 2020.codi-1.6 + + + Contextualized Embeddings for Connective Disambiguation in Shallow Discourse Parsing + RenéKnaebel + ManfredStede + 65–75 + This paper studies a novel model that simplifies the disambiguation of connectives for explicit discourse relations. We use a neural approach that integrates contextualized word embeddings and predicts whether a connective candidate is part of a discourse relation or not. We study the influence of those context-specific embeddings. Further, we show the benefit of training the tasks of connective disambiguation and sense classification together at the same time. The success of our approach is supported by state-of-the-art results. + 2020.codi-1.7 + + + <fixed-case>DSNDM</fixed-case>: Deep <fixed-case>S</fixed-case>iamese Neural Discourse Model with Attention for Text Pairs Categorization and Ranking + AlexanderChernyavskiy + DmitryIlvovsky + 76–85 + In this paper, the utility and advantages of the discourse analysis for text pairs categorization and ranking are investigated. We consider two tasks in which discourse structure seems useful and important: automatic verification of political statements, and ranking in question answering systems. We propose a neural network based approach to learn the match between pairs of discourse tree structures. To this end, the neural TreeLSTM model is modified to effectively encode discourse trees and DSNDM model based on it is suggested to analyze pairs of texts. In addition, the integration of the attention mechanism in the model is proposed. Moreover, different ranking approaches are investigated for the second task. In the paper, the comparison with state-of-the-art methods is given. 
Experiments illustrate that combination of neural networks and discourse structure in DSNDM is effective since it reaches top results in the assigned tasks. The evaluation also demonstrates that discourse analysis improves quality for the processing of longer texts. + 2020.codi-1.8 + + + Do sentence embeddings capture discourse properties of sentences from Scientific Abstracts ? + LaurineHuber + ChakerMemmadi + MathildeDargnat + YannickToussaint + 86–95 + We introduce four tasks designed to determine which sentence encoders best capture discourse properties of sentences from scientific abstracts, namely coherence and cohesion between clauses of a sentence, and discourse relations within sentences. We show that even if contextual encoders such as BERT or SciBERT encodes the coherence in discourse units, they do not help to predict three discourse relations commonly used in scientific abstracts. We discuss what these results underline, namely that these discourse relations are based on particular phrasing that allow non-contextual encoders to perform well. + 2020.codi-1.9 + + + Joint Modeling of Arguments for Event Understanding + YunmoChen + TongfeiChen + BenjaminVan Durme + 96–101 + We recognize the task of event argument linking in documents as similar to that of intent slot resolution in dialogue, providing a Transformer-based model that extends from a recently proposed solution to resolve references to slots. The approach allows for joint consideration of argument candidates given a detected event, which we illustrate leads to state-of-the-art performance in multi-sentence argument linking. + 2020.codi-1.10 + + + Analyzing Neural Discourse Coherence Models + YoumnaFarag + JosefValvoda + HelenYannakoudakis + TedBriscoe + 102–112 + In this work, we systematically investigate how well current models of coherence can capture aspects of text implicated in discourse organisation. 
We devise two datasets of various linguistic alterations that undermine coherence and test model sensitivity to changes in syntax and semantics. We furthermore probe discourse embedding space and examine the knowledge that is encoded in representations of coherence. We hope this study shall provide further insight into how to frame the task and improve models of coherence assessment further. Finally, we make our datasets publicly available as a resource for researchers to use to test discourse coherence models. + 2020.codi-1.11 + + + Computational Interpretation of Recency for the Choice of Referring Expressions in Discourse + FahimeSame + Keesvan Deemter + 113–123 + First, we discuss the most common linguistic perspectives on the concept of recency and propose a taxonomy of recency metrics employed in Machine Learning studies for choosing the form of referring expressions in discourse context. We then report on a Multi-Layer Perceptron study and a Sequential Forward Search experiment, followed by Bayes Factor analysis of the outcomes. The results suggest that recency metrics counting paragraphs and sentences contribute to referential choice prediction more than other recency-related metrics. Based on the results of our analysis, we argue that, sensitivity to discourse structure is important for recency metrics used in determining referring expression forms. + 2020.codi-1.12 + + + Do We Really Need That Many Parameters In Transformer For Extractive Summarization? Discourse Can Help ! + WenXiao + PatrickHuber + GiuseppeCarenini + 124–134 + The multi-head self-attention of popular transformer models is widely used within Natural Language Processing (NLP), including for the task of extractive summarization. With the goal of analyzing and pruning the parameter-heavy self-attention mechanism, there are multiple approaches proposing more parameter-light self-attention alternatives. 
In this paper, we present a novel parameter-lean self-attention mechanism using discourse priors. Our new tree self-attention is based on document-level discourse information, extending the recently proposed “Synthesizer” framework with another lightweight alternative. We show empirical results that our tree self-attention approach achieves competitive ROUGE-scores on the task of extractive summarization. When compared to the original single-head transformer model, the tree attention approach reaches similar performance on both, EDU and sentence level, despite the significant reduction of parameters in the attention component. We further significantly outperform the 8-head transformer model on sentence level when applying a more balanced hyper-parameter setting, requiring an order of magnitude less parameters. + 2020.codi-1.13 + + + Extending Implicit Discourse Relation Recognition to the <fixed-case>PDTB</fixed-case>-3 + LiLiang + ZhengZhao + BonnieWebber + 135–147 + The PDTB-3 contains many more Implicit discourse relations than the previous PDTB-2. This is in part because implicit relations have now been annotated within sentences as well as between them. In addition, some now co-occur with explicit discourse relations, instead of standing on their own. Here we show that while this can complicate the problem of identifying the location of implicit discourse relations, it can in turn simplify the problem of identifying their senses. We present data to support this claim, as well as methods that can serve as a non-trivial baseline for future state-of-the-art recognizers for implicit discourse relations. 
+ 2020.codi-1.14 + + + <fixed-case>TED</fixed-case>-<fixed-case>MDB</fixed-case> Lexicons: <fixed-case>T</fixed-case>r<fixed-case>E</fixed-case>n<fixed-case>C</fixed-case>onn<fixed-case>L</fixed-case>ex, <fixed-case>P</fixed-case>t<fixed-case>E</fixed-case>n<fixed-case>C</fixed-case>onn<fixed-case>L</fixed-case>ex + MurathanKurfalı + SibelOzer + DenizZeyrek + AmáliaMendes + 148–153 + In this work, we present two new bilingual discourse connective lexicons, namely, for Turkish-English and European Portuguese-English created automatically using the existing discourse relation-aligned TED-MDB corpus. In their current form, the Pt-En lexicon includes 95 entries, whereas the Tr-En lexicon contains 133 entries. The lexicons constitute the first step of a larger project of developing a multilingual discourse connective lexicon. + 2020.codi-1.15 + + + Evaluation of Coreference Resolution Systems Under Adversarial Attacks + HaixiaChai + WeiZhao + SteffenEger + MichaelStrube + 154–159 + A substantial overlap of coreferent mentions in the CoNLL dataset magnifies the recent progress on coreference resolution. This is because the CoNLL benchmark fails to evaluate the ability of coreference resolvers that requires linking novel mentions unseen at train time. In this work, we create a new dataset based on CoNLL, which largely decreases mention overlaps in the entire dataset and exposes the limitations of published resolvers on two aspects—lexical inference ability and understanding of low-level orthographic noise. Our findings show (1) the requirements for embeddings, used in resolvers, and for coreference resolutions are, by design, in conflict and (2) adversarial approaches are sometimes not legitimate to mitigate the obstacles, as they may falsely introduce mention overlaps in adversarial training and test sets, thus giving an inflated impression for the improvements. 
+ 2020.codi-1.16 + + + Coreference for Discourse Parsing: A Neural Approach + GrigoriiGuz + GiuseppeCarenini + 160–167 + We present preliminary results on investigating the benefits of coreference resolution features for neural RST discourse parsing by considering different levels of coupling of the discourse parser with the coreference resolver. In particular, starting with a strong baseline neural parser unaware of any coreference information, we compare a parser which utilizes only the output of a neural coreference resolver, with a more sophisticated model, where discourse parsing and coreference resolution are jointly learned in a neural multitask fashion. Results indicate that these initial attempts to incorporate coreference information do not boost the performance of discourse parsing in a statistically significant way. + 2020.codi-1.17 + +
+
diff --git a/data/xml/2020.computerm.xml b/data/xml/2020.computerm.xml index 6b52a189cd..4ffa70f6bb 100644 --- a/data/xml/2020.computerm.xml +++ b/data/xml/2020.computerm.xml @@ -78,7 +78,7 @@
A study of semantic projection from single word terms to multi-word terms in the environment domain - YizheWANG + YizheWang BeatriceDaille NabilHathout 50–54 diff --git a/data/xml/2020.conll.xml b/data/xml/2020.conll.xml new file mode 100644 index 0000000000..8874b43f92 --- /dev/null +++ b/data/xml/2020.conll.xml @@ -0,0 +1,627 @@ + + + + + Proceedings of the 24th Conference on Computational Natural Language Learning + RaquelFernández + TalLinzen + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.conll-1.0 + + + Enriching Word Embeddings with Temporal and Spatial Information + HongyuGong + SumaBhat + PramodViswanath + 1–11 + The meaning of a word is closely linked to sociocultural factors that can change over time and location, resulting in corresponding meaning changes. Taking a global view of words and their meanings in a widely used language, such as English, may require us to capture more refined semantics for use in time-specific or location-aware situations, such as the study of cultural trends or language use. However, popular vector representations for words do not adequately include temporal or spatial information. In this work, we present a model for learning word representation conditioned on time and location. In addition to capturing meaning changes over time and location, we require that the resulting word embeddings retain salient semantic and geometric properties. We train our model on time- and location-stamped corpora, and show using both quantitative and qualitative evaluations that it can capture semantics across time and locations. We note that our model compares favorably with the state-of-the-art for time-specific embedding, and serves as a new benchmark for location-specific embeddings. + 2020.conll-1.1 + 2020.conll-1.1.OptionalSupplementaryMaterial.zip + + + Interpreting Attention Models with Human Visual Attention in Machine Reading Comprehension + EktaSood + SimonTannert + DiegoFrassinelli + AndreasBulling + Ngoc ThangVu + 12–25 + While neural networks with attention mechanisms have achieved superior performance on many natural language processing tasks, it remains unclear to which extent learned attention resembles human visual attention. In this paper, we propose a new method that leverages eye-tracking data to investigate the relationship between human visual attention and neural attention in machine reading comprehension. 
To this end, we introduce a novel 23 participant eye tracking dataset - MQA-RC, in which participants read movie plots and answered pre-defined questions. We compare state of the art networks based on long short-term memory (LSTM), convolutional neural models (CNN) and XLNet Transformer architectures. We find that higher similarity to human attention and performance significantly correlates to the LSTM and CNN models. However, we show this relationship does not hold true for the XLNet models – despite the fact that the XLNet performs best on this challenging task. Our results suggest that different architectures seem to learn rather different neural attention strategies and similarity of neural to human attention does not guarantee best performance. + 2020.conll-1.2 + + + Neural Proof Nets + KonstantinosKogkalidis + MichaelMoortgat + RichardMoot + 26–40 + Linear logic and the linear λ-calculus have a long standing tradition in the study of natural language form and meaning. Among the proof calculi of linear logic, proof nets are of particular interest, offering an attractive geometric representation of derivations that is unburdened by the bureaucratic complications of conventional proof-theoretic formats. Building on recent advances in set-theoretic learning, we propose a neural variant of proof nets based on Sinkhorn networks, which allows us to translate parsing as the problem of extracting syntactic primitives and permuting them into alignment. Our methodology induces a batch-efficient, end-to-end differentiable architecture that actualizes a formally grounded yet highly efficient neuro-symbolic parser. We test our approach on ÆThel, a dataset of type-logical derivations for written Dutch, where it manages to correctly transcribe raw text sentences into proofs and terms of the linear λ-calculus with an accuracy of as high as 70%. 
+ 2020.conll-1.3 + + + <fixed-case>T</fixed-case>axi<fixed-case>NLI</fixed-case>: Taking a Ride up the <fixed-case>NLU</fixed-case> Hill + PratikJoshi + SomakAditya + AalokSathe + MonojitChoudhury + 41–55 + Pre-trained Transformer-based neural architectures have consistently achieved state-of-the-art performance in the Natural Language Inference (NLI) task. Since NLI examples encompass a variety of linguistic, logical, and reasoning phenomena, it remains unclear as to which specific concepts are learnt by the trained systems and where they can achieve strong generalization. To investigate this question, we propose a taxonomic hierarchy of categories that are relevant for the NLI task. We introduce TaxiNLI, a new dataset, that has 10k examples from the MNLI dataset with these taxonomic labels. Through various experiments on TaxiNLI, we observe that whereas for certain taxonomic categories SOTA neural models have achieved near perfect accuracies—a large jump over the previous models—some categories still remain difficult. Our work adds to the growing body of literature that shows the gaps in the current NLI systems and datasets through a systematic presentation and analysis of reasoning categories. + 2020.conll-1.4 + + + Modeling Subjective Assessments of Guilt in Newspaper Crime Narratives + ElisaKreiss + ZijianWang + ChristopherPotts + 56–68 + Crime reporting is a prevalent form of journalism with the power to shape public perceptions and social policies. How does the language of these reports act on readers? We seek to address this question with the SuspectGuilt Corpus of annotated crime stories from English-language newspapers in the U.S. For SuspectGuilt, annotators read short crime articles and provided text-level ratings concerning the guilt of the main suspect as well as span-level annotations indicating which parts of the story they felt most influenced their ratings. 
SuspectGuilt thus provides a rich picture of how linguistic choices affect subjective guilt judgments. We use SuspectGuilt to train and assess predictive models which validate the usefulness of the corpus, and show that these models benefit from genre pretraining and joint supervision from the text-level ratings and span-level annotations. Such models might be used as tools for understanding the societal effects of crime reporting. + 2020.conll-1.5 + 2020.conll-1.5.OptionalSupplementaryMaterial.zip + + + On the Frailty of Universal <fixed-case>POS</fixed-case> Tags for Neural <fixed-case>UD</fixed-case> Parsers + MarkAnderson + CarlosGómez-Rodríguez + 69–96 + We present an analysis on the effect UPOS accuracy has on parsing performance. Results suggest that leveraging UPOS tags as features for neural parsers requires a prohibitively high tagging accuracy and that the use of gold tags offers a non-linear increase in performance, suggesting some sort of exceptionality. We also investigate what aspects of predicted UPOS tags impact parsing accuracy the most, highlighting some potentially meaningful linguistic facets of the problem. + 2020.conll-1.6 + + + Classifying Syntactic Errors in Learner Language + LeshemChoshen + DmitryNikolaev + YevgeniBerzak + OmriAbend + 97–107 + We present a method for classifying syntactic errors in learner language, namely errors whose correction alters the morphosyntactic structure of a sentence. The methodology builds on the established Universal Dependencies syntactic representation scheme, and provides complementary information to other error-classification systems. Unlike existing error classification methods, our method is applicable across languages, which we showcase by producing a detailed picture of syntactic errors in learner English and learner Russian. We further demonstrate the utility of the methodology for analyzing the outputs of leading Grammatical Error Correction (GEC) systems. 
+ 2020.conll-1.7 + 2020.conll-1.7.OptionalSupplementaryMaterial.pdf + + + How to Probe Sentence Embeddings in Low-Resource Languages: On Structural Design Choices for Probing Task Evaluation + SteffenEger + JohannesDaxenberger + IrynaGurevych + 108–118 + Sentence encoders map sentences to real valued vectors for use in downstream applications. To peek into these representations—e.g., to increase interpretability of their results—probing tasks have been designed which query them for linguistic knowledge. However, designing probing tasks for lesser-resourced languages is tricky, because these often lack largescale annotated data or (high-quality) dependency parsers as a prerequisite of probing task design in English. To investigate how to probe sentence embeddings in such cases, we investigate sensitivity of probing task results to structural design choices, conducting the first such large scale study. We show that design choices like size of the annotated probing dataset and type of classifier used for evaluation do (sometimes substantially) influence probing outcomes. We then probe embeddings in a multilingual setup with design choices that lie in a ‘stable region’, as we identify for English, and find that results on English do not transfer to other languages. Fairer and more comprehensive sentence-level probing evaluation should thus be carried out on multiple languages in the future. + 2020.conll-1.8 + 2020.conll-1.8.OptionalSupplementaryMaterial.pdf + + + Understanding the Source of Semantic Regularities in Word Embeddings + Hsiao-YuChiang + JoseCamacho-Collados + ZacharyPardos + 119–131 + Semantic relations are core to how humans understand and express concepts in the real world using language. Recently, there has been a thread of research aimed at modeling these relations by learning vector representations from text corpora. Most of these approaches focus strictly on leveraging the co-occurrences of relationship word pairs within sentences. 
In this paper, we investigate the hypothesis that examples of a lexical relation in a corpus are fundamental to a neural word embedding’s ability to complete analogies involving the relation. Our experiments, in which we remove all known examples of a relation from training corpora, show only marginal degradation in analogy completion performance involving the removed relation. This finding enhances our understanding of neural word embeddings, showing that co-occurrence information of a particular semantic relation is not the main source of their structural regularity. + 2020.conll-1.9 + + + Finding The Right One and Resolving it + PayalKhullar + ArghyaBhattacharya + ManishShrivastava + 132–141 + One-anaphora has figured prominently in theoretical linguistic literature, but computational linguistics research on the phenomenon is sparse. Not only that, the long standing linguistic controversy between the determinative and the nominal anaphoric element one has propagated in the limited body of computational work on one-anaphora resolution, making this task harder than it is. In the present paper, we resolve this by drawing from an adequate linguistic analysis of the word one in different syntactic environments - once again highlighting the significance of linguistic theory in Natural Language Processing (NLP) tasks. We prepare an annotated corpus marking actual instances of one-anaphora with their textual antecedents, and use the annotations to experiment with state-of-the-art neural models for one-anaphora resolution. Apart from presenting a strong neural baseline for this task, we contribute a gold-standard corpus, which is, to the best of our knowledge, the biggest resource on one-anaphora till date. 
+ 2020.conll-1.10 + + + Bridging Information-Seeking Human Gaze and Machine Reading Comprehension + JonathanMalmaud + RogerLevy + YevgeniBerzak + 142–152 + In this work, we analyze how human gaze during reading comprehension is conditioned on the given reading comprehension question, and whether this signal can be beneficial for machine reading comprehension. To this end, we collect a new eye-tracking dataset with a large number of participants engaging in a multiple choice reading comprehension task. Our analysis of this data reveals increased fixation times over parts of the text that are most relevant for answering the question. Motivated by this finding, we propose making automated reading comprehension more human-like by mimicking human information-seeking reading behavior during reading comprehension. We demonstrate that this approach leads to performance gains on multiple choice question answering in English for a state-of-the-art reading comprehension model. + 2020.conll-1.11 + + + A Corpus of Very Short Scientific Summaries + YifanChen + TamaraPolajnar + ColinBatchelor + SimoneTeufel + 153–164 + We present a new summarisation task, taking scientific articles and producing journal table-of-contents entries in the chemistry domain. These are one- or two-sentence author-written summaries that present the key findings of a paper. This is a first look at this summarisation task with an open access publication corpus consisting of titles and abstracts, as input texts, and short author-written advertising blurbs, as the ground truth. We introduce the dataset and evaluate it with state-of-the-art summarisation methods. + 2020.conll-1.12 + + + Recurrent babbling: evaluating the acquisition of grammar from limited input data + LudovicaPannitto + AurélieHerbelot + 165–176 + Recurrent Neural Networks (RNNs) have been shown to capture various aspects of syntax from raw linguistic input. 
In most previous experiments, however, learning happens over unrealistic corpora, which do not reflect the type and amount of data a child would be exposed to. This paper remedies this state of affairs by training an LSTM over a realistically sized subset of child-directed input. The behaviour of the network is analysed over time using a novel methodology which consists in quantifying the level of grammatical abstraction in the model’s generated output (its ‘babbling’), compared to the language it has been exposed to. We show that the LSTM indeed abstracts new structures as learning proceeds. + 2020.conll-1.13 + 2020.conll-1.13.OptionalSupplementaryMaterial.zip + + + Explaining the efficiency of communication: How communicators can reduce their computational burden through interaction + Jacquelinevan Arkel + MariekeWoensdregt + MarkDingemanse + MarkBlokpoel + 177–194 + How can people communicate successfully while keeping resource costs low in the face of ambiguity? We present a principled theoretical analysis comparing two strategies for disambiguation in communication: (i) pragmatic reasoning, where communicators reason about each other, and (ii) other-initiated repair, where communicators signal and resolve trouble interactively. Using agent-based simulations and computational complexity analyses, we compare the efficiency of these strategies in terms of communicative success, computation cost and interaction cost. We show that agents with a simple repair mechanism can increase efficiency, compared to pragmatic agents, by reducing their computational burden at the cost of longer interactions. We also find that efficiency is highly contingent on the mechanism, highlighting the importance of explicit formalisation and computational rigour. 
+ 2020.conll-1.14 + + + Acquiring language from speech by learning to remember and predict + CoryShain + MichaElsner + 195–214 + Classical accounts of child language learning invoke memory limits as a pressure to discover sparse, language-like representations of speech, while more recent proposals stress the importance of prediction for language learning. In this study, we propose a broad-coverage unsupervised neural network model to test memory and prediction as sources of signal by which children might acquire language directly from the perceptual stream. Our model embodies several likely properties of real-time human cognition: it is strictly incremental, it encodes speech into hierarchically organized labeled segments, it allows interactive top-down and bottom-up information flow, it attempts to model its own sequence of latent representations, and its objective function only recruits local signals that are plausibly supported by human working memory capacity. We show that much phonemic structure is learnable from unlabeled speech on the basis of these local signals. We further show that remembering the past and predicting the future both contribute to the linguistic content of acquired representations, and that these contributions are at least partially complementary. + 2020.conll-1.15 + + + Identifying Incorrect Labels in the <fixed-case>C</fixed-case>o<fixed-case>NLL</fixed-case>-2003 Corpus + FrederickReiss + HongXu + BryanCutler + KarthikMuthuraman + ZacharyEichenberger + 215–226 + The CoNLL-2003 corpus for English-language named entity recognition (NER) is one of the most influential corpora for NER model research. A large number of publications, including many landmark works, have used this corpus as a source of ground truth for NER tasks. In this paper, we examine this corpus and identify over 1300 incorrect labels (out of 35089 in the corpus). 
In particular, the number of incorrect labels in the test fold is comparable to the number of errors that state-of-the-art models make when running inference over this corpus. We describe the process by which we identified these incorrect labels, using novel variants of techniques from semi-supervised learning. We also summarize the types of errors that we found, and we revisit several recent results in NER in light of the corrected data. Finally, we show experimentally that our corrections to the corpus have a positive impact on three state-of-the-art models. + 2020.conll-1.16 + + + When is a bishop not like a rook? When it’s like a rabbi! Multi-prototype <fixed-case>BERT</fixed-case> embeddings for estimating semantic relationships + GabriellaChronis + KatrinErk + 227–244 + This paper investigates contextual language models, which produce token representations, as a resource for lexical semantics at the word or type level. We construct multi-prototype word embeddings from bert-base-uncased (Devlin et al., 2018). These embeddings retain contextual knowledge that is critical for some type-level tasks, while being less cumbersome and less subject to outlier effects than exemplar models. Similarity and relatedness estimation, both type-level tasks, benefit from this contextual knowledge, indicating the context-sensitivity of these processes. BERT’s token level knowledge also allows the testing of a type-level hypothesis about lexical abstractness, demonstrating the relationship between token-level phenomena and type-level concreteness ratings. Our findings provide important insight into the interpretability of BERT: layer 7 approximates semantic similarity, while the final layer (11) approximates relatedness. 
+ 2020.conll-1.17 + 2020.conll-1.17.OptionalSupplementaryMaterial.zip + + + Processing effort is a poor predictor of cross-linguistic word order frequency + BrennanGonering + EmilyMorgan + 245–255 + Some have argued that word orders which are more difficult to process should be rarer cross-linguistically. Our current study fails to replicate the results of Maurits, Navarro, and Perfors (2010), who used an entropy-based Uniform Information Density (UID) measure to moderately predict the Greenbergian typology of transitive word orders. We additionally report an inability of three measures of processing difficulty — entropy-based UID, surprisal-based UID, and pointwise mutual information — to correctly predict the correct typological distribution, using transitive constructions from 20 languages in the Universal Dependencies project (version 2.5). However, our conclusions are limited by data sparsity. + 2020.conll-1.18 + + + Relations between comprehensibility and adequacy errors in machine translation output + MajaPopović + 256–264 + This work presents a detailed analysis of translation errors perceived by readers as comprehensibility and/or adequacy issues. The main finding is that good comprehensibility, similarly to good fluency, can mask a number of adequacy errors. Of all major adequacy errors, 30% were fully comprehensible, thus fully misleading the reader to accept the incorrect information. Another 25% of major adequacy errors were perceived as almost comprehensible, thus being potentially misleading. Also, a vast majority of omissions (about 70%) is hidden by comprehensibility. Further analysis of misleading translations revealed that the most frequent error types are ambiguity, mistranslation, noun phrase error, word-by-word translation, untranslated word, subject-verb agreement, and spelling error in the source text. 
However, none of these error types appears exclusively in misleading translations, but are also frequent in fully incorrect (incomprehensible inadequate) and discarded correct (incomprehensible adequate) translations. Deeper analysis is needed to potentially detect underlying phenomena specifically related to misleading translations. + 2020.conll-1.19 + + + Cross-lingual Embeddings Reveal Universal and Lineage-Specific Patterns in Grammatical Gender Assignment + HartgerVeeman + MarcAllassonnière-Tang + AleksandrsBerdicevskis + AliBasirat + 265–275 + Grammatical gender is assigned to nouns differently in different languages. Are all factors that influence gender assignment idiosyncratic to languages or are there any that are universal? Using cross-lingual aligned word embeddings, we perform two experiments to address these questions about language typology and human cognition. In both experiments, we predict the gender of nouns in language X using a classifier trained on the nouns of language Y, and take the classifier’s accuracy as a measure of transferability of gender systems. First, we show that for 22 Indo-European languages the transferability decreases as the phylogenetic distance increases. This correlation supports the claim that some gender assignment factors are idiosyncratic, and as the languages diverge, the proportion of shared inherited idiosyncrasies diminishes. Second, we show that when the classifier is trained on two Afro-Asiatic languages and tested on the same 22 Indo-European languages (or vice versa), its performance is still significantly above the chance baseline, thus showing that universal factors exist and, moreover, can be captured by word embeddings. When the classifier is tested across families and on inanimate nouns only, the performance is still above baseline, indicating that the universal factors are not limited to biological sex. 
+ 2020.conll-1.20 + + + Modelling Lexical Ambiguity with Density Matrices + FrancoisMeyer + MarthaLewis + 276–290 + Words can have multiple senses. Compositional distributional models of meaning have been argued to deal well with finer shades of meaning variation known as polysemy, but are not so well equipped to handle word senses that are etymologically unrelated, or homonymy. Moving from vectors to density matrices allows us to encode a probability distribution over different senses of a word, and can also be accommodated within a compositional distributional model of meaning. In this paper we present three new neural models for learning density matrices from a corpus, and test their ability to discriminate between word senses on a range of compositional datasets. When paired with a particular composition method, our best model outperforms existing vector-based compositional models as well as strong sentence encoders. + 2020.conll-1.21 + 2020.conll-1.21.OptionalSupplementaryMaterial.zip + + + Catplayinginthesnow: Impact of Prior Segmentation on a Model of Visually Grounded Speech + WilliamHavard + LaurentBesacier + Jean-PierreChevrot + 291–301 + The language acquisition literature shows that children do not build their lexicon by segmenting the spoken input into phonemes and then building up words from them, but rather adopt a top-down approach and start by segmenting word-like units and then break them down into smaller units. This suggests that the ideal way of learning a language is by starting from full semantic units. In this paper, we investigate if this is also the case for a neural model of Visually Grounded Speech trained on a speech-image retrieval task. We evaluated how well such a network is able to learn a reliable speech-to-image mapping when provided with phone, syllable, or word boundary information. We present a simple way to introduce such information into an RNN-based model and investigate which type of boundary is the most efficient. 
We also explore at which level of the network’s architecture such information should be introduced so as to maximise its performances. Finally, we show that using multiple boundary types at once in a hierarchical structure, by which low-level segments are used to recompose high-level segments, is beneficial and yields better results than using low-level or high-level segments in isolation. + 2020.conll-1.22 + 2020.conll-1.22.OptionalSupplementaryMaterial.zip + + + Learning to ground medical text in a 3<fixed-case>D</fixed-case> human atlas + DusanGrujicic + GorjanRadevski + TinneTuytelaars + MatthewBlaschko + 302–312 + In this paper, we develop a method for grounding medical text into a physically meaningful and interpretable space corresponding to a human atlas. We build on text embedding architectures such as Bert and introduce a loss function that allows us to reason about the semantic and spatial relatedness of medical texts by learning a projection of the embedding into a 3D space representing the human body. We quantitatively and qualitatively demonstrate that our proposed method learns a context sensitive and spatially aware mapping, in both the inter-organ and intra-organ sense, using a large scale medical text dataset from the “Large-scale online biomedical semantic indexing” track of the 2020 BioASQ challenge. We extend our approach to a self-supervised setting, and find it to be competitive with a classification based method, and a fully supervised variant of approach. + 2020.conll-1.23 + 2020.conll-1.23.OptionalSupplementaryMaterial.zip + + + Representation Learning for Type-Driven Composition + GijsWijnholds + MehrnooshSadrzadeh + StephenClark + 313–324 + This paper is about learning word representations using grammatical type information. We use the syntactic types of Combinatory Categorial Grammar to develop multilinear representations, i.e. maps with n arguments, for words with different functional types. 
The multilinear maps of words compose with each other to form sentence representations. We extend the skipgram algorithm from vectors to multilinear maps to learn these representations and instantiate it on unary and binary maps for transitive verbs. These are evaluated on verb and sentence similarity and disambiguation tasks and a subset of the SICK relatedness dataset. Our model performs better than previous type-driven models and is competitive with state of the art representation learning methods such as BERT and neural sentence encoders. + 2020.conll-1.24 + + + Word Representations Concentrate and This is Good News! + RomainCouillet + Yagmur GizemCinar + EricGaussier + MuhammadImran + 325–334 + This article establishes that, unlike the legacy tf*idf representation, recent natural language representations (word embedding vectors) tend to exhibit a so-called concentration of measure phenomenon, in the sense that, as the representation size p and database size n are both large, their behavior is similar to that of large dimensional Gaussian random vectors. This phenomenon may have important consequences as machine learning algorithms for natural language data could be amenable to improvement, thereby providing new theoretical insights into the field of natural language processing. + 2020.conll-1.25 + + + “<fixed-case>L</fixed-case>az<fixed-case>I</fixed-case>mpa”: Lazy and Impatient neural agents learn to communicate efficiently + MathieuRita + RahmaChaabouni + EmmanuelDupoux + 335–343 + Previous work has shown that artificial neural agents naturally develop surprisingly non-efficient codes. This is illustrated by the fact that in a referential game involving a speaker and a listener neural networks optimizing accurate transmission over a discrete channel, the emergent messages fail to achieve an optimal length.
Furthermore, frequent messages tend to be longer than infrequent ones, a pattern contrary to the Zipf Law of Abbreviation (ZLA) observed in all natural languages. Here, we show that near-optimal and ZLA-compatible messages can emerge, but only if both the speaker and the listener are modified. We hence introduce a new communication system, “LazImpa”, where the speaker is made increasingly lazy, i.e., avoids long messages, and the listener impatient, i.e., seeks to guess the intended content as soon as possible. + 2020.conll-1.26 + 2020.conll-1.26.OptionalSupplementaryMaterial.zip + + + Re-solve it: simulating the acquisition of core semantic competences from small data + AurélieHerbelot + 344–354 + Many tasks are considered to be ‘solved’ in the computational linguistics literature, but the corresponding algorithms operate in ways which are radically different from human cognition. I illustrate this by coming back to the notion of semantic competence, which includes basic linguistic skills encompassing both referential phenomena and generic knowledge, in particular a) the ability to denote, b) the mastery of the lexicon, or c) the ability to model one’s language use on others. Even though each of those faculties has been extensively tested individually, there is still no computational model that would account for their joint acquisition under the conditions experienced by a human. In this paper, I focus on one particular aspect of this problem: the amount of linguistic data available to the child or machine. I show that given the first competence mentioned above (a denotation function), the other two can in fact be learned from very limited data (2.8M token), reaching state-of-the-art performance. I argue that both the nature of the data and the way it is presented to the system matter to acquisition. 
+ 2020.conll-1.27 + + + In Media Res: A Corpus for Evaluating Named Entity Linking with Creative Works + Adrian M.P.Brasoveanu + AlbertWeichselbraun + LyndonNixon + 355–364 + Annotation styles express guidelines that direct human annotators in what rules to follow when creating gold standard annotations of text corpora. These guidelines not only shape the gold standards they help create, but also influence the training and evaluation of Named Entity Linking (NEL) tools, since different annotation styles correspond to divergent views on the entities present in the same texts. Such divergence is particularly present in texts from the media domain that contain references to creative works. In this work we present a corpus of 1000 annotated documents selected from the media domain. Each document is presented with multiple gold standard annotations representing various annotation styles. This corpus is used to evaluate a series of Named Entity Linking tools in order to understand the impact of the differences in annotation styles on the reported accuracy when processing highly ambiguous entities such as names of creative works. Relaxed annotation guidelines that include overlap styles lead to better results across all tools. + 2020.conll-1.28 + + + Analogies minus analogy test: measuring regularities in word embeddings + LouisFournier + EmmanuelDupoux + EwanDunbar + 365–375 + Vector space models of words have long been claimed to capture linguistic regularities as simple vector translations, but problems have been raised with this claim. We decompose and empirically analyze the classic arithmetic word analogy test, to motivate two new metrics that address the issues with the standard test, and which distinguish between class-wise offset concentration (similar directions between pairs of words drawn from different broad classes, such as France-London, China-Ottawa,...) 
and pairing consistency (the existence of a regular transformation between correctly-matched pairs such as France:Paris::China:Beijing). We show that, while the standard analogy test is flawed, several popular word embeddings do nevertheless encode linguistic regularities. + 2020.conll-1.29 + + + Word associations and the distance properties of context-aware word embeddings + MariaA. Rodriguez + PaolaMerlo + 376–385 + What do people know when they know the meaning of words? Word associations have been widely used to tap into lexical representations and their structure, as a way of probing semantic knowledge in humans. We investigate whether current word embedding spaces (contextualized and uncontextualized) can be considered good models of human lexical knowledge by studying whether they have comparable characteristics to human association spaces. We study the three properties of association rank, asymmetry of similarity and triangle inequality. We find that word embeddings are good models of some word associations properties. They replicate well human associations between words, and, like humans, their context-aware variants show violations of the triangle inequality. While they do show asymmetry of similarities, their asymmetries do not map those of human association norms. + 2020.conll-1.30 + 2020.conll-1.30.OptionalSupplementaryMaterial.zip + + + <fixed-case>T</fixed-case>r<fixed-case>C</fixed-case>laim-19: The First Collection for <fixed-case>T</fixed-case>urkish Check-Worthy Claim Detection with Annotator Rationales + Yavuz SelimKartal + MucahidKutlu + 386–395 + Massive misinformation spread over Internet has many negative impacts on our lives. While spreading a claim is easy, investigating its veracity is hard and time consuming. Therefore, we urgently need systems to help human fact-checkers. However, available data resources to develop effective systems are limited and the vast majority of them is for English.
In this work, we introduce TrClaim-19, which is the very first labeled dataset for Turkish check-worthy claims. TrClaim-19 consists of labeled 2287 Turkish tweets with annotator rationales, enabling us to better understand the characteristics of check-worthy claims. The rationales we collected suggest that claims’ topics and their possible negative impacts are the main factors affecting their check-worthiness. + 2020.conll-1.31 + + + Discourse structure interacts with reference but not syntax in neural language models + ForrestDavis + Martenvan Schijndel + 396–407 + Language models (LMs) trained on large quantities of text have been claimed to acquire abstract linguistic representations. Our work tests the robustness of these abstractions by focusing on the ability of LMs to learn interactions between different linguistic representations. In particular, we utilized stimuli from psycholinguistic studies showing that humans can condition reference (i.e. coreference resolution) and syntactic processing on the same discourse structure (implicit causality). We compared both transformer and long short-term memory LMs to find that, contrary to humans, implicit causality only influences LM behavior for reference, not syntax, despite model representations that encode the necessary discourse information. Our results further suggest that LM behavior can contradict not only learned representations of discourse but also syntactic agreement, pointing to shortcomings of standard language modeling. + 2020.conll-1.32 + 2020.conll-1.32.OptionalSupplementaryMaterial.pdf + + + Continual Adaptation for Efficient Machine Communication + RobertHawkins + MinaeKwon + DorsaSadigh + NoahGoodman + 408–419 + To communicate with new partners in new contexts, humans rapidly form new linguistic conventions. 
Recent neural language models are able to comprehend and produce the existing conventions present in their training data, but are not able to flexibly and interactively adapt those conventions on the fly as humans do. We introduce an interactive repeated reference task as a benchmark for models of adaptation in communication and propose a regularized continual learning framework that allows an artificial agent initialized with a generic language model to more accurately and efficiently communicate with a partner over time. We evaluate this framework through simulations on COCO and in real-time reference game experiments with human partners. + 2020.conll-1.33 + 2020.conll-1.33.OptionalSupplementaryMaterial.pdf + + + Diverse and Relevant Visual Storytelling with Scene Graph Embeddings + XudongHong + RakshithShetty + AsadSayeed + KhushbooMehra + VeraDemberg + BerntSchiele + 420–430 + A problem in automatically generated stories for image sequences is that they use overly generic vocabulary and phrase structure and fail to match the distributional characteristics of human-generated text. We address this problem by introducing explicit representations for objects and their relations by extracting scene graphs from the images. Utilizing an embedding of this scene graph enables our model to more explicitly reason over objects and their relations during story generation, compared to the global features from an object classifier used in previous work. We apply metrics that account for the diversity of words and phrases of generated stories as well as for reference to narratively-salient image features and show that our approach outperforms previous systems. Our experiments also indicate that our models obtain competitive results on reference-based metrics. 
+ 2020.conll-1.34 + + + Alleviating Digitization Errors in Named Entity Recognition for Historical Documents + EmanuelaBoros + AhmedHamdi + ElvysLinhares Pontes + Luis AdriánCabrera-Diego + Jose G.Moreno + NicolasSidere + AntoineDoucet + 431–441 + This paper tackles the task of named entity recognition (NER) applied to digitized historical texts obtained from processing digital images of newspapers using optical character recognition (OCR) techniques. We argue that the main challenge for this task is that the OCR process leads to misspellings and linguistic errors in the output text. Moreover, historical variations can be present in aged documents, which can impact the performance of the NER process. We conduct a comparative evaluation on two historical datasets in German and French against previous state-of-the-art models, and we propose a model based on a hierarchical stack of Transformers to approach the NER task for historical data. Our findings show that the proposed model clearly improves the results on both historical datasets, and does not degrade the results for modern datasets. + 2020.conll-1.35 + + + Analysing Word Representation from the Input and Output Embeddings in Neural Network Language Models + StevenDerby + PaulMiller + BarryDevereux + 442–454 + Researchers have recently demonstrated that tying the neural weights between the input look-up table and the output classification layer can improve training and lower perplexity on sequence learning tasks such as language modelling. Such a procedure is possible due to the design of the softmax classification layer, which previous work has shown to comprise a viable set of semantic representations for the model vocabulary, and these output embeddings are known to perform well on word similarity benchmarks.
In this paper, we make meaningful comparisons between the input and output embeddings and other SOTA distributional models to gain a better understanding of the types of information they represent. We also construct a new set of word embeddings using the output embeddings to create locally-optimal approximations for the intermediate representations from the language model. These locally-optimal embeddings demonstrate excellent performance across all our evaluations. + 2020.conll-1.36 + 2020.conll-1.36.OptionalSupplementaryMaterial.pdf + + + On the Computational Power of Transformers and Its Implications in Sequence Modeling + SatwikBhattamishra + ArkilPatel + NavinGoyal + 455–475 + Transformers are being used extensively across several sequence modeling tasks. Significant research effort has been devoted to experimentally probe the inner workings of Transformers. However, our conceptual and theoretical understanding of their power and inherent limitations is still nascent. In particular, the roles of various components in Transformers such as positional encodings, attention heads, residual connections, and feedforward networks, are not clear. In this paper, we take a step towards answering these questions. We analyze the computational power as captured by Turing-completeness. We first provide an alternate and simpler proof to show that vanilla Transformers are Turing-complete and then we prove that Transformers with only positional masking and without any positional encoding are also Turing-complete. We further analyze the necessity of each component for the Turing-completeness of the network; interestingly, we find that a particular type of residual connection is necessary. We demonstrate the practical implications of our results via experiments on machine translation and synthetic tasks. 
+ 2020.conll-1.37 + 2020.conll-1.37.OptionalSupplementaryMaterial.zip + + + An Expectation Maximisation Algorithm for Automated Cognate Detection + RoddyMacSween + AndrewCaines + 476–485 + In historical linguistics, cognate detection is the task of determining whether sets of words have common etymological roots. Inspired by the comparative method used by human linguists, we develop a system for automated cognate detection that frames the task as an inference problem for a general statistical model consisting of observed data (potentially cognate pairs of words), latent variables (the cognacy status of pairs) and unknown global parameters (which sounds correspond between languages). We then give a specific instance of such a model along with an expectation-maximisation algorithm to infer its parameters. We evaluate our system on a corpus of 8140 cognate sets, finding the performance of our method to be comparable to the state of the art. We additionally carry out qualitative analysis demonstrating advantages it has over existing systems. We also suggest several ways our work could be extended within the general theoretical framework we propose. + 2020.conll-1.38 + + + Filler-gaps that neural networks fail to generalize + DebasmitaBhattacharya + Martenvan Schijndel + 486–495 + It can be difficult to separate abstract linguistic knowledge in recurrent neural networks (RNNs) from surface heuristics. In this work, we probe for highly abstract syntactic constraints that have been claimed to govern the behavior of filler-gap dependencies across different surface constructions. For models to generalize abstract patterns in expected ways to unseen data, they must share representational features in predictable ways. We use cumulative priming to test for representational overlap between disparate filler-gap constructions in English and find evidence that the models learn a general representation for the existence of filler-gap dependencies. 
However, we find no evidence that the models learn any of the shared underlying grammatical constraints we tested. Our work raises questions about the degree to which RNN language models learn abstract linguistic representations. + 2020.conll-1.39 + 2020.conll-1.39.OptionalSupplementaryMaterial.zip + + + Don’t Parse, Insert: Multilingual Semantic Parsing with Insertion Based Decoding + QileZhu + HaidarKhan + SalehSoltan + StephenRawls + WaelHamza + 496–506 + Semantic parsing is one of the key components of natural language understanding systems. A successful parse transforms an input utterance to an action that is easily understood by the system. Many algorithms have been proposed to solve this problem, from conventional rule-based or statistical slot-filling systems to shift-reduce based neural parsers. For complex parsing tasks, the state-of-the-art method is based on an autoregressive sequence to sequence model that generates the parse directly. This model is slow at inference time, generating parses in O(n) decoding steps (n is the length of the target sequence). In addition, we demonstrate that this method performs poorly in zero-shot cross-lingual transfer learning settings. In this paper, we propose a non-autoregressive parser which is based on the insertion transformer to overcome these two issues. Our approach 1) speeds up decoding by 3x while outperforming the autoregressive model and 2) significantly improves cross-lingual transfer in the low-resource setting by 37% compared to autoregressive baseline. We test our approach on three well-known monolingual datasets: ATIS, SNIPS and TOP. For cross-lingual semantic parsing, we use the MultiATIS++ and the multilingual TOP datasets.
+ 2020.conll-1.40 + + + Learning Context-free Languages with Nondeterministic Stack <fixed-case>RNN</fixed-case>s + BrianDuSell + DavidChiang + 507–519 + We present a differentiable stack data structure that simultaneously and tractably encodes an exponential number of stack configurations, based on Lang’s algorithm for simulating nondeterministic pushdown automata. We call the combination of this data structure with a recurrent neural network (RNN) controller a Nondeterministic Stack RNN. We compare our model against existing stack RNNs on various formal languages, demonstrating that our model converges more reliably to algorithmic behavior on deterministic tasks, and achieves lower cross-entropy on inherently nondeterministic tasks. + 2020.conll-1.41 + + + Generating Narrative Text in a Switching Dynamical System + NoahWeber + LeenaShekhar + HeeyoungKwon + NiranjanBalasubramanian + NathanaelChambers + 520–530 + Early work on narrative modeling used explicit plans and goals to generate stories, but the language generation itself was restricted and inflexible. Modern methods use language models for more robust generation, but often lack an explicit representation of the scaffolding and dynamics that guide a coherent narrative. This paper introduces a new model that integrates explicit narrative structure with neural language models, formalizing narrative modeling as a Switching Linear Dynamical System (SLDS). A SLDS is a dynamical system in which the latent dynamics of the system (i.e. how the state vector transforms over time) is controlled by top-level discrete switching variables. The switching variables represent narrative structure (e.g., sentiment or discourse states), while the latent state vector encodes information on the current state of the narrative. This probabilistic formulation allows us to control generation, and can be learned in a semi-supervised fashion using both labeled and unlabeled data. 
Additionally, we derive a Gibbs sampler for our model that can “fill in” arbitrary parts of the narrative, guided by the switching variables. Our filled-in (English language) narratives outperform several baselines on both automatic and human evaluations. + 2020.conll-1.42 + + + What Are You Trying to Do? Semantic Typing of Event Processes + MuhaoChen + HongmingZhang + HaoyuWang + DanRoth + 531–542 + This paper studies a new cognitively motivated semantic typing task, multi-axis event process typing, that, given an event process, attempts to infer free-form type labels describing (i) the type of action made by the process and (ii) the type of object the process seeks to affect. This task is inspired by computational and cognitive studies of event understanding, which suggest that understanding processes of events is often directed by recognizing the goals, plans or intentions of the protagonist(s). We develop a large dataset containing over 60k event processes, featuring ultra fine-grained typing on both the action and object type axes with very large (10^3∼10^4) label vocabularies. We then propose a hybrid learning framework, P2GT, which addresses the challenging typing problem with indirect supervision from glosses and a joint learning-to-rank framework. As our experiments indicate, P2GT supports identifying the intent of processes, as well as the fine semantic type of the affected object. It also demonstrates the capability of handling few-shot cases, and strong generalizability on out-of-domain processes. + 2020.conll-1.43 + + + A Corpus for Outbreak Detection of Diseases Prevalent in <fixed-case>L</fixed-case>atin <fixed-case>A</fixed-case>merica + AntonellaDellanzo + VivianaCotik + JoseOchoa-Luna + 543–551 + In this paper we present an annotated corpus which can be used for training and testing algorithms to automatically extract information about diseases outbreaks from news and health reports. We also propose initial approaches to extract information from it.
The corpus has been constructed with two main tasks in mind. The first one, to extract entities about outbreaks such as disease, host, location among others. The second one, to retrieve relations among entities, for instance, in such geographic location fifteen cases of a given disease were reported. Overall, our goal is to offer resources and tools to perform an automated analysis so as to support early detection of disease outbreaks and therefore diminish their spreading. + 2020.conll-1.44 + + + Are Pretrained Language Models Symbolic Reasoners over Knowledge? + NoraKassner + BennoKrojer + HinrichSchütze + 552–564 + How can pretrained language models (PLMs) learn factual knowledge from the training set? We investigate the two most important mechanisms: reasoning and memorization. Prior work has attempted to quantify the number of facts PLMs learn, but we present, using synthetic data, the first study that investigates the causal relation between facts present in training and facts learned by the PLM. For reasoning, we show that PLMs seem to learn to apply some symbolic reasoning rules correctly but struggle with others, including two-hop reasoning. Further analysis suggests that even the application of learned reasoning rules is flawed. For memorization, we identify schema conformity (facts systematically supported by other facts) and frequency as key factors for its success. + 2020.conll-1.45 + + + Understanding Linguistic Accommodation in Code-Switched Human-Machine Dialogues + TanmayParekh + EmilyAhn + YuliaTsvetkov + Alan WBlack + 565–577 + Code-switching is a ubiquitous phenomenon in multilingual communities. Natural language technologies that wish to communicate like humans must therefore adaptively incorporate code-switching techniques when they are deployed in multilingual settings. To this end, we propose a Hindi-English human-machine dialogue system that elicits code-switching conversations in a controlled setting. 
It uses different code-switching agent strategies to understand how users respond and accommodate to the agent’s language choice. Through this system, we collect and release a new dataset CommonDost, comprising of 439 human-machine multilingual conversations. We adapt pre-defined metrics to discover linguistic accommodation from users to agents. Finally, we compare these dialogues with Spanish-English dialogues collected in a similar setting, and analyze the impact of linguistic and socio-cultural factors on code-switching patterns across the two language pairs. + 2020.conll-1.46 + + + Identifying robust markers of <fixed-case>P</fixed-case>arkinson’s disease in typing behaviour using a <fixed-case>CNN</fixed-case>-<fixed-case>LSTM</fixed-case> network + NeilDhir + MathiasEdman + ÁlvaroSanchez Ferro + TomStafford + ColinBannard + 578–595 + There is urgent need for non-intrusive tests that can detect early signs of Parkinson’s disease (PD), a debilitating neurodegenerative disorder that affects motor control. Recent promising research has focused on disease markers evident in the fine-motor behaviour of typing. Most work to date has focused solely on the timing of keypresses without reference to the linguistic content. In this paper we argue that the identity of the key combinations being produced should impact how they are handled by people with PD, and provide evidence that natural language processing methods can thus be of help in identifying signs of disease. We test the performance of a bi-directional LSTM with convolutional features in distinguishing people with PD from age-matched controls typing in English and Spanish, both in clinics and online. 
+ 2020.conll-1.47 + + + An Empirical Study on Model-agnostic Debiasing Strategies for Robust Natural Language Inference + TianyuLiu + ZhengXin + XiaoanDing + BaobaoChang + ZhifangSui + 596–608 + The prior work on natural language inference (NLI) debiasing mainly targets at one or few known biases while not necessarily making the models more robust. In this paper, we focus on the model-agnostic debiasing strategies and explore how to (or is it possible to) make the NLI models robust to multiple distinct adversarial attacks while keeping or even strengthening the models’ generalization power. We firstly benchmark prevailing neural NLI models including pretrained ones on various adversarial datasets. We then try to combat distinct known biases by modifying a mixture of experts (MoE) ensemble method and show that it’s nontrivial to mitigate multiple NLI biases at the same time, and that model-level ensemble method outperforms MoE ensemble method. We also perform data augmentation including text swap, word substitution and paraphrase and prove its efficiency in combating various (though not all) adversarial attacks at the same time. Finally, we investigate several methods to merge heterogeneous training data (1.35M) and perform model ensembling, which are straightforward but effective to strengthen NLI models. + 2020.conll-1.48 + + + Cloze Distillation Improves Psychometric Predictive Power + TiwalayoEisape + NogaZaslavsky + RogerLevy + 609–619 + Contemporary autoregressive language models (LMs) trained purely on corpus data have been shown to capture numerous features of human incremental processing. However, past work has also suggested dissociations between corpus probabilities and human next-word predictions. Here we evaluate several state-of-the-art language models for their match to human next-word predictions and to reading time behavior from eye movements. 
We then propose a novel method for distilling the linguistic information implicit in human linguistic predictions into pre-trained LMs: Cloze Distillation. We apply this method to a baseline neural LM and show potential improvement in reading time prediction and generalization to held-out human cloze data. + 2020.conll-1.49 + + + Disentangling dialects: a neural approach to <fixed-case>I</fixed-case>ndo-<fixed-case>A</fixed-case>ryan historical phonology and subgrouping + ChundraCathcart + TarakaRama + 620–630 + This paper seeks to uncover patterns of sound change across Indo-Aryan languages using an LSTM encoder-decoder architecture. We augment our models with embeddings representing language ID, part of speech, and other features such as word embeddings. We find that a highly augmented model shows highest accuracy in predicting held-out forms, and investigate other properties of interest learned by our models’ representations. We outline extensions to this architecture that can better capture variation in Indo-Aryan sound change. + 2020.conll-1.50 + + + A Dataset for Linguistic Understanding, Visual Evaluation, and Recognition of Sign Languages: The K-<fixed-case>RSL</fixed-case> + AlfarabiImashev + MedetMukushev + VadimKimmelman + AnaraSandygulova + 631–640 + The paper presents the first dataset that aims to serve interdisciplinary purposes for the utility of computer vision community and sign language linguistics. To date, a majority of Sign Language Recognition (SLR) approaches focus on recognising sign language as a manual gesture recognition problem. However, signers use other articulators: facial expressions, head and body position and movement to convey linguistic information. Given the important role of non-manual markers, this paper proposes a dataset and presents a use case to stress the importance of including non-manual features to improve the recognition accuracy of signs. 
To the best of our knowledge no prior publicly available dataset exists that explicitly focuses on non-manual components responsible for the grammar of sign languages. To this end, the proposed dataset contains 28250 videos of signs of high resolution and quality, with annotation of manual and non-manual components. We conducted a series of evaluations in order to investigate whether non-manual components would improve signs’ recognition accuracy. We release the dataset to encourage SLR researchers and help advance current progress in this area toward real-time sign language interpretation. Our dataset will be made publicly available at https://krslproject.github.io/krsl-corpus + 2020.conll-1.51 + + + From Dataset Recycling to Multi-Property Extraction and Beyond + TomaszDwojak + MichałPietruszka + ŁukaszBorchmann + JakubChłędowski + FilipGraliński + 641–651 + This paper investigates various Transformer architectures on the WikiReading Information Extraction and Machine Reading Comprehension dataset. The proposed dual-source model outperforms the current state-of-the-art by a large margin. Next, we introduce WikiReading Recycled - a newly developed public dataset, and the task of multiple-property extraction. It uses the same data as WikiReading but does not inherit its predecessor’s identified disadvantages. In addition, we provide a human-annotated test set with diagnostic subsets for a detailed analysis of model performance. + 2020.conll-1.52 + + + How well does surprisal explain N400 amplitude under different experimental conditions? + JamesMichaelov + BenjaminBergen + 652–663 + We investigate the extent to which word surprisal can be used to predict a neural measure of human language processing difficulty—the N400. To do this, we use recurrent neural networks to calculate the surprisal of stimuli from previously published neurolinguistic studies of the N400. 
We find that surprisal can predict N400 amplitude in a wide range of cases, and the cases where it cannot do so provide valuable insight into the neurocognitive processes underlying the response. + 2020.conll-1.53 + +
+ + + Proceedings of the CoNLL 2020 Shared Task: Cross-Framework Meaning Representation Parsing + StephanOepen + OmriAbend + LashaAbzianidze + JohanBos + JanHajič + DanielHershcovich + BinLi + TimO'Gorman + NianwenXue + DanielZeman + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.conll-shared.0 + + + <fixed-case>MRP</fixed-case> 2020: The Second Shared Task on Cross-Framework and Cross-Lingual Meaning Representation Parsing + StephanOepen + OmriAbend + LashaAbzianidze + JohanBos + JanHajic + DanielHershcovich + BinLi + TimO’Gorman + NianwenXue + DanielZeman + 1–22 + The 2020 Shared Task at the Conference for Computational Language Learning (CoNLL) was devoted to Meaning Representation Parsing (MRP) across frameworks and languages. Extending a similar setup from the previous year, five distinct approaches to the representation of sentence meaning in the form of directed graphs were represented in the English training and evaluation data for the task, packaged in a uniform graph abstraction and serialization; for four of these representation frameworks, additional training and evaluation data was provided for one additional language per framework. The task received submissions from eight teams, of which two do not participate in the official ranking because they arrived after the closing deadline or made use of additional training data. All technical information regarding the task, including system submissions, official results, and links to supporting resources and software are available from the task web site at: http://mrp.nlpl.eu + 2020.conll-shared.1 + 2020.conll-shared.1.Attachment.pdf + + + <fixed-case>DRS</fixed-case> at <fixed-case>MRP</fixed-case> 2020: Dressing up Discourse Representation Structures as Graphs + LashaAbzianidze + JohanBos + StephanOepen + 23–32 + Discourse Representation Theory (DRT) is a formal account for representing the meaning of natural language discourse. Meaning in DRT is modeled via a Discourse Representation Structure (DRS), a meaning representation with a model-theoretic interpretation, which is usually depicted as nested boxes. In contrast, a directed labeled graph is a common data structure used to encode semantics of natural language texts. 
The paper describes the procedure of dressing up DRSs as directed labeled graphs to include DRT as a new framework in the 2020 shared task on Cross-Framework and Cross-Lingual Meaning Representation Parsing. Since one of the goals of the shared task is to encourage unified models for several semantic graph frameworks, the conversion procedure was biased towards making the DRT graph framework somewhat similar to other graph-based meaning representation frameworks. + 2020.conll-shared.2 + + + <fixed-case>FGD</fixed-case> at <fixed-case>MRP</fixed-case> 2020: <fixed-case>P</fixed-case>rague Tectogrammatical Graphs + DanielZeman + JanHajic + 33–39 + Prague Tectogrammatical Graphs (PTG) is a meaning representation framework that originates in the tectogrammatical layer of the Prague Dependency Treebank (PDT) and is theoretically founded in Functional Generative Description of language (FGD). PTG in its present form has been prepared for the CoNLL 2020 shared task on Cross-Framework Meaning Representation Parsing (MRP). It is generated automatically from the Prague treebanks and stored in the JSON-based MRP graph interchange format. The conversion is partially lossy; in this paper we describe what part of annotation was included and how it is represented in PTG. + 2020.conll-shared.3 + + + Hitachi at <fixed-case>MRP</fixed-case> 2020: Text-to-Graph-Notation Transducer + HiroakiOzaki + GakuMorio + YutaKoreeda + TerufumiMorishita + ToshinoriMiyoshi + 40–52 + This paper presents our proposed parser for the shared task on Meaning Representation Parsing (MRP 2020) at CoNLL, where participant systems were required to parse five types of graphs in different languages. We propose to unify these tasks as a text-to-graph-notation transduction in which we convert an input text into a graph notation. To this end, we designed a novel Plain Graph Notation (PGN) that handles various graphs universally. 
Then, our parser predicts a PGN-based sequence by leveraging Transformers and biaffine attentions. Notably, our parser can handle any PGN-formatted graphs with fewer framework-specific modifications. As a result, ensemble versions of the parser tied for 1st place in both cross-framework and cross-lingual tracks. + 2020.conll-shared.4 + + + <fixed-case>ÚFAL</fixed-case> at <fixed-case>MRP</fixed-case> 2020: Permutation-invariant Semantic Parsing in <fixed-case>PERIN</fixed-case> + DavidSamuel + MilanStraka + 53–64 + We present PERIN, a novel permutation-invariant approach to sentence-to-graph semantic parsing. PERIN is a versatile, cross-framework and language independent architecture for universal modeling of semantic structures. Our system participated in the CoNLL 2020 shared task, Cross-Framework Meaning Representation Parsing (MRP 2020), where it was evaluated on five different frameworks (AMR, DRG, EDS, PTG and UCCA) across four languages. PERIN was one of the winners of the shared task. The source code and pretrained models are available at http://www.github.com/ufal/perin. + 2020.conll-shared.5 + + + <fixed-case>HIT</fixed-case>-<fixed-case>SCIR</fixed-case> at <fixed-case>MRP</fixed-case> 2020: Transition-based Parser and Iterative Inference Parser + LongxuDou + YunlongFeng + YuqiuJi + WanxiangChe + TingLiu + 65–72 + This paper describes our submission system (HIT-SCIR) for the CoNLL 2020 shared task: Cross-Framework and Cross-Lingual Meaning Representation Parsing. The task includes five frameworks for graph-based meaning representations, i.e., UCCA, EDS, PTG, AMR, and DRG. Our solution consists of two sub-systems: transition-based parser for Flavor (1) frameworks (UCCA, EDS, PTG) and iterative inference parser for Flavor (2) frameworks (DRG, AMR). In the final evaluation, our system is ranked 3rd among the seven team both in Cross-Framework Track and Cross-Lingual Track, with the macro-averaged MRP F1 score of 0.81/0.69. 
+ 2020.conll-shared.6 + + + <fixed-case>HUJI</fixed-case>-<fixed-case>KU</fixed-case> at <fixed-case>MRP</fixed-case> 2020: Two Transition-based Neural Parsers + OfirArviv + RuixiangCui + DanielHershcovich + 73–82 + This paper describes the HUJI-KU system submission to the shared task on Cross-Framework Meaning Representation Parsing (MRP) at the 2020 Conference for Computational Language Learning (CoNLL), employing TUPA and the HIT-SCIR parser, which were, respectively, the baseline system and winning system in the 2019 MRP shared task. Both are transition-based parsers using BERT contextualized embeddings. We generalized TUPA to support the newly-added MRP frameworks and languages, and experimented with multitask learning with the HIT-SCIR parser. We reached 4th place in both the cross-framework and cross-lingual tracks. + 2020.conll-shared.7 + + + <fixed-case>JBNU</fixed-case> at <fixed-case>MRP</fixed-case> 2020: <fixed-case>AMR</fixed-case> Parsing Using a Joint State Model for Graph-Sequence Iterative Inference + Seung-HoonNa + JinwooMin + 83–87 + This paper describes the Jeonbuk National University (JBNU) system for the 2020 shared task on Cross-Framework Meaning Representation Parsing at the Conference on Computational Natural Language Learning. Among the five frameworks, we address only the abstract meaning representation framework and propose a joint state model for the graph-sequence iterative inference of (Cai and Lam, 2020) for a simplified graph-sequence inference. In our joint state model, we update only a single joint state vector during the graph-sequence inference process instead of keeping the dual state vectors, and all other components are exactly the same as in (Cai and Lam, 2020). + 2020.conll-shared.8 + + 
+
diff --git a/data/xml/2020.deelio.xml b/data/xml/2020.deelio.xml new file mode 100644 index 0000000000..f2ce70160a --- /dev/null +++ b/data/xml/2020.deelio.xml @@ -0,0 +1,120 @@ + + + + + Proceedings of Deep Learning Inside Out (DeeLIO): The First Workshop on Knowledge Extraction and Integration for Deep Learning Architectures + EnekoAgirre + MariannaApidianaki + IvanVulić + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.deelio-1.0 + + + Correcting the Misuse: A Method for the <fixed-case>C</fixed-case>hinese Idiom Cloze Test + XinyuWang + HongshengZhao + TanYang + HongboWang + 1–10 + The cloze test for Chinese idioms is a new challenge in machine reading comprehension: given a sentence with a blank, choosing a candidate Chinese idiom which matches the context. Chinese idiom is a type of Chinese idiomatic expression. The common misuse of Chinese idioms leads to error in corpus and causes error in the learned semantic representation of Chinese idioms. In this paper, we introduce the definition written by Chinese experts to correct the misuse. We propose a model for the Chinese idiom cloze test integrating various information effectively. We propose an attention mechanism called Attribute Attention to balance the weight of different attributes among different descriptions of the Chinese idiom. Besides the given candidates of every blank, we also try to choose the answer from all Chinese idioms that appear in the dataset as the extra loss due to the uniqueness and specificity of Chinese idioms. In experiments, our model outperforms the state-of-the-art model. + 2020.deelio-1.1 + + + Relation Extraction with Contextualized Relation Embedding + XiaoyuChen + RohanBadlani + 11–19 + This submission is a paper that proposes an architecture for the relation extraction task which integrates semantic information with knowledge base modeling in a novel manner. + 2020.deelio-1.2 + + + Generalization to Mitigate Synonym Substitution Attacks + BasemahAlshemali + JugalKalita + 20–28 + Studies have shown that deep neural networks (DNNs) are vulnerable to adversarial examples – perturbed inputs that cause DNN-based models to produce incorrect results. One robust adversarial attack in the NLP domain is the synonym substitution. In attacks of this variety, the adversary substitutes words with synonyms. 
Since synonym substitution perturbations aim to satisfy all lexical, grammatical, and semantic constraints, they are difficult to detect with automatic syntax check as well as by humans. In this paper, we propose a structure-free defensive method that is capable of improving the performance of DNN-based models with both clean and adversarial data. Our findings show that replacing the embeddings of the important words in the input samples with the average of their synonyms’ embeddings can significantly improve the generalization of DNN-based classifiers. By doing so, we reduce model sensitivity to particular words in the input samples. Our results indicate that the proposed defense is not only capable of defending against adversarial attacks, but is also capable of improving the performance of DNN-based models when tested on benign data. On average, the proposed defense improved the classification accuracy of the CNN and Bi-LSTM models by 41.30% and 55.66%, respectively, when tested under adversarial attacks. Extended investigation shows that our defensive method can improve the robustness of nonneural models, achieving an average of 17.62% and 22.93% classification accuracy increase on the SVM and XGBoost models, respectively. The proposed defensive method has also shown an average of 26.60% classification accuracy improvement when tested with the infamous BERT model. Our algorithm is generic enough to be applied in any NLP domain and to any model trained on any natural language. + 2020.deelio-1.3 + + + <fixed-case>G</fixed-case>en<fixed-case>A</fixed-case>ug: Data Augmentation for Finetuning Text Generators + Steven Y.Feng + VarunGangal + DongyeopKang + TerukoMitamura + EduardHovy + 29–42 + In this paper, we investigate data augmentation for text generation, which we call GenAug. Text generation and language modeling are important tasks within natural language processing, and are especially challenging for low-data regimes. 
We propose and evaluate various augmentation methods, including some that incorporate external knowledge, for finetuning GPT-2 on a subset of Yelp Reviews. We also examine the relationship between the amount of augmentation and the quality of the generated text. We utilize several metrics that evaluate important aspects of the generated text including its diversity and fluency. Our experiments demonstrate that insertion of character-level synthetic noise and keyword replacement with hypernyms are effective augmentation methods, and that the quality of generations improves to a peak at approximately three times the amount of original data. + 2020.deelio-1.4 + + + Common Sense or World Knowledge? Investigating Adapter-Based Knowledge Injection into Pretrained Transformers + AnneLauscher + OlgaMajewska + Leonardo F. R.Ribeiro + IrynaGurevych + NikolaiRozanov + GoranGlavaš + 43–49 + Following the major success of neural language models (LMs) such as BERT or GPT-2 on a variety of language understanding tasks, recent work focused on injecting (structured) knowledge from external resources into these models. While on the one hand, joint pre-training (i.e., training from scratch, adding objectives based on external knowledge to the primary LM objective) may be prohibitively computationally expensive, post-hoc fine-tuning on external knowledge, on the other hand, may lead to the catastrophic forgetting of distributional knowledge. In this work, we investigate models for complementing the distributional knowledge of BERT with conceptual knowledge from ConceptNet and its corresponding Open Mind Common Sense (OMCS) corpus, respectively, using adapter training. While overall results on the GLUE benchmark paint an inconclusive picture, a deeper analysis reveals that our adapter-based models substantially outperform BERT (up to 15-20 performance points) on inference tasks that require the type of conceptual knowledge explicitly present in ConceptNet and OMCS. 
We also open source all our experiments and relevant code under: https://github.com/wluper/retrograph. + 2020.deelio-1.5 + + + Entity Attribute Relation Extraction with Attribute-Aware Embeddings + DanIter + XiaoYu + FangtaoLi + 50–55 + Entity-attribute relations are a fundamental component for building large-scale knowledge bases, which are widely employed in modern search engines. However, most such knowledge bases are manually curated, covering only a small fraction of all attributes, even for common entities. To improve the precision of model-based entity-attribute extraction, we propose attribute-aware embeddings, which embeds entities and attributes in the same space by the similarity of their attributes. Our model, EANET, learns these embeddings by representing entities as a weighted sum of their attributes and concatenates these embeddings to mention level features. EANET achieves up to 91% classification accuracy, outperforming strong baselines and achieves 83% precision on manually labeled high confidence extractions, outperforming Biperpedia (Gupta et al., 2014), a previous state-of-the-art for large scale entity-attribute extraction. + 2020.deelio-1.6 + + + Enhancing Question Answering by Injecting Ontological Knowledge through Regularization + TravisGoodwin + DinaDemner-Fushman + 56–63 + Deep neural networks have demonstrated high performance on many natural language processing (NLP) tasks that can be answered directly from text, and have struggled to solve NLP tasks requiring external (e.g., world) knowledge. In this paper, we present OSCR (Ontology-based Semantic Composition Regularization), a method for injecting task-agnostic knowledge from an Ontology or knowledge graph into a neural network during pre-training. 
We evaluated the performance of BERT pre-trained on Wikipedia with and without OSCR by measuring the performance when fine-tuning on two question answering tasks involving world knowledge and causal reasoning and one requiring domain (healthcare) knowledge and obtained 33.3%, 18.6%, and 4% improved accuracy compared to pre-training BERT without OSCR. + 2020.deelio-1.7 + + + Target Concept Guided Medical Concept Normalization in Noisy User-Generated Texts + Katikapalli SubramanyamKalyan + SivanesanSangeetha + 64–73 + Medical concept normalization (MCN) i.e., mapping of colloquial medical phrases to standard concepts is an essential step in analysis of medical social media text. The main drawback in existing state-of-the-art approach (Kalyan and Sangeetha, 2020b) is learning target concept vector representations from scratch which requires more number of training instances. Our model is based on RoBERTa and target concept embeddings. In our model, we integrate a) target concept information in the form of target concept vectors generated by encoding target concept descriptions using SRoBERTa, state-of-the-art RoBERTa based sentence embedding model and b) domain lexicon knowledge by enriching target concept vectors with synonym relationship knowledge using retrofitting algorithm. It is the first attempt in MCN to exploit both target concept information as well as domain lexicon knowledge in the form of retrofitted target concept vectors. Our model outperforms all the existing models with an accuracy improvement up to 1.36% on three standard datasets. Further, our model when trained only on mapping lexicon synonyms achieves up to 4.87% improvement in accuracy. 
+ 2020.deelio-1.8 + + + Incorporating Commonsense Knowledge Graph in Pretrained Models for Social Commonsense Tasks + Ting-YunChang + YangLiu + KarthikGopalakrishnan + BehnamHedayatnia + PeiZhou + DilekHakkani-Tur + 74–79 + Pretrained language models have excelled at many NLP tasks recently; however, their social intelligence is still unsatisfactory. To enable this, machines need to have a more general understanding of our complicated world and develop the ability to perform commonsense reasoning besides fitting the specific downstream tasks. External commonsense knowledge graphs (KGs), such as ConceptNet, provide rich information about words and their relationships. Thus, towards general commonsense learning, we propose two approaches to implicitly and explicitly infuse such KGs into pretrained language models. We demonstrate our proposed methods perform well on SocialIQA, a social commonsense reasoning task, in both limited and full training data regimes. + 2020.deelio-1.9 + + + Commonsense Statements Identification and Explanation with Transformer based Encoders + SoniaCibu + AncaMarginean + 80–88 + In this work, we present our empirical attempt to identify the proper strategy of using Transformer Language Models to identify sentences consistent with commonsense. We tackle the first two tasks from the ComVE competition. The starting point for our work is the BERT assumption according to which a large number of NLP tasks can be solved with pre-trained Transformers with no substantial task-specific changes of the architecture. However, our experiments show that the encoding strategy can have a great impact on the quality of the fine-tuning. The combination between cross-encoding and multi-input models worked better than one cross-encoder and allowed us to achieve comparable results with the state-of-the-art without the use of any external data. 
+ 2020.deelio-1.10 + + + On the Complementary Nature of Knowledge Graph Embedding, Fine Grain Entity Types, and Language Modeling + RajatPatel + FrancisFerraro + 89–99 + We demonstrate the complementary natures of neural knowledge graph embedding, fine-grain entity type prediction, and neural language modeling. We show that a language model-inspired knowledge graph embedding approach yields both improved knowledge graph embeddings and fine-grain entity type representations. Our work also shows that jointly modeling both structured knowledge tuples and language improves both. + 2020.deelio-1.11 + +
+
diff --git a/data/xml/2020.emnlp.xml b/data/xml/2020.emnlp.xml new file mode 100644 index 0000000000..a150dfc979 --- /dev/null +++ b/data/xml/2020.emnlp.xml @@ -0,0 +1,7608 @@ + + + + + Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP) + TrevorCohn + YulanHe + YangLiu + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.emnlp-main.0 + + + Detecting Attackable Sentences in Arguments + YohanJo + SeojinBang + EmaadManzoor + EduardHovy + ChrisReed + 1–23 + 2020.emnlp-main.1 + + + Extracting Implicitly Asserted Propositions in Argumentation + YohanJo + JackyVisser + ChrisReed + EduardHovy + 24–38 + 2020.emnlp-main.2 + + + Quantitative Argument Summarization and beyond: Cross-Domain Key Point Analysis + RoyBar-Haim + YoavKantor + LilachEden + RoniFriedman + DanLahav + NoamSlonim + 39–49 + 2020.emnlp-main.3 + + + Unsupervised Stance Detection for Arguments from Consequences + JonathanKobbe + IoanaHulpuș + HeinerStuckenschmidt + 50–60 + 2020.emnlp-main.4 + + + <fixed-case>BLEU</fixed-case> Might Be Guilty but References Are Not Innocent + MarkusFreitag + DavidGrangier + IsaacCaswell + 61–71 + 2020.emnlp-main.5 + + + Translationese in Machine Translation Evaluation + YvetteGraham + BarryHaddow + PhilippKoehn + 72–81 + 2020.emnlp-main.6 + + + Simulated Multiple Reference Training Improves Low-Resource Machine Translation + HudaKhayrallah + BrianThompson + MattPost + PhilippKoehn + 82–89 + 2020.emnlp-main.7 + + + Automatic Machine Translation Evaluation in Many Languages via Zero-Shot Paraphrasing + BrianThompson + MattPost + 90–121 + 2020.emnlp-main.8 + + + <fixed-case>PR</fixed-case>over: Proof Generation for Interpretable Reasoning over Rules + SwarnadeepSaha + SayanGhosh + ShashankSrivastava + MohitBansal + 122–136 + 2020.emnlp-main.9 + + + Learning to Explain: Datasets and Models for Identifying Valid Reasoning Chains in Multihop Question-Answering + HarshJhamtani + PeterClark + 137–150 + 2020.emnlp-main.10 + + + Self-Supervised Knowledge Triplet Learning for Zero-shot Question Answering + PratyayBanerjee + ChittaBaral + 151–162 + 2020.emnlp-main.11 + 2020.emnlp-main.11.OptionalSupplementaryMaterial.zip + + + More Bang for Your Buck: Natural Perturbation for Robust Question Answering + DanielKhashabi + TusharKhot + AshishSabharwal + 163–170 + 
2020.emnlp-main.12 + + + A Matter of Framing: The Impact of Linguistic Formalism on Probing Results + IliaKuznetsov + IrynaGurevych + 171–182 + 2020.emnlp-main.13 + + + Information-Theoretic Probing with Minimum Description Length + ElenaVoita + IvanTitov + 183–196 + 2020.emnlp-main.14 + + + Intrinsic Probing through Dimension Selection + LucasTorroba Hennigen + AdinaWilliams + RyanCotterell + 197–216 + 2020.emnlp-main.15 + + + Learning Helpful Inductive Biases from Self-Supervised Pretraining + AlexWarstadt + YianZhang + XiaochengLi + HaokunLiu + Samuel R.Bowman + 217–235 + 2020.emnlp-main.16 + + + Repulsive Attention: Rethinking Multi-head Attention as <fixed-case>B</fixed-case>ayesian Inference + BangAn + JieLyu + ZhenyiWang + ChunyuanLi + ChangweiHu + FeiTan + RuiyiZhang + YifanHu + ChangyouChen + 236–255 + 2020.emnlp-main.17 + + + <fixed-case>KERMIT</fixed-case>: Complementing Transformer Architectures with Encoders of Explicit Syntactic Interpretations + Fabio MassimoZanzotto + AndreaSantilli + LeonardoRanaldi + DarioOnorati + PierfrancescoTommasino + FrancescaFallucchi + 256–267 + 2020.emnlp-main.18 + + + <fixed-case>ETC</fixed-case>: Encoding Long and Structured Inputs in Transformers + JoshuaAinslie + SantiagoOntanon + ChrisAlberti + VaclavCvicek + ZacharyFisher + PhilipPham + AnirudhRavula + SumitSanghai + QifanWang + LiYang + 268–284 + 2020.emnlp-main.19 + + + Pre-Training Transformers as Energy-Based Cloze Models + KevinClark + Minh-ThangLuong + QuocLe + Christopher D.Manning + 285–294 + 2020.emnlp-main.20 + + + Calibration of Pre-trained Transformers + ShreyDesai + GregDurrett + 295–302 + 2020.emnlp-main.21 + + + Near-imperceptible Neural Linguistic Steganography via Self-Adjusting Arithmetic Coding + JiamingShen + HengJi + JiaweiHan + 303–313 + 2020.emnlp-main.22 + + + Multi-Dimensional Gender Bias Classification + EmilyDinan + AngelaFan + LedellWu + JasonWeston + DouweKiela + AdinaWilliams + 314–331 + 2020.emnlp-main.23 + + + Human-in-the-loop 
Debugging Deep Text Classifiers + PiyawatLertvittayakumjorn + LuciaSpecia + FrancescaToni + 332–348 + 2020.emnlp-main.24 + + + Conversational Document Prediction to Assist Customer Care Agents + JatinGanhotra + HaggaiRoitman + DoronCohen + NathanielMills + ChulakaGunasekara + YosiMass + SachindraJoshi + LuisLastras + DavidKonopnicki + 349–356 + 2020.emnlp-main.25 + + + Incremental Processing in the Age of Non-Incremental Encoders: An Empirical Assessment of Bidirectional Models for Incremental <fixed-case>NLU</fixed-case> + BrielenMadureira + DavidSchlangen + 357–374 + 2020.emnlp-main.26 + + + Augmented Natural Language for Generative Sequence Labeling + BenAthiwaratkun + CiceroNogueira dos Santos + JasonKrone + BingXiang + 375–385 + 2020.emnlp-main.27 + + + Dialogue Response Ranking Training with Large-Scale Human Feedback Data + XiangGao + YizheZhang + MichelGalley + ChrisBrockett + BillDolan + 386–395 + 2020.emnlp-main.28 + + + Semantic Evaluation for Text-to-<fixed-case>SQL</fixed-case> with Distilled Test Suite + RuiqiZhong + TaoYu + DanKlein + 396–411 + 2020.emnlp-main.29 + + + Cross-Thought for Sentence Encoder Pre-training + ShuohangWang + YuweiFang + SiqiSun + ZheGan + YuCheng + JingjingLiu + JingJiang + 412–421 + 2020.emnlp-main.30 + + + <fixed-case>A</fixed-case>uto<fixed-case>QA</fixed-case>: From Databases to <fixed-case>Q</fixed-case>&<fixed-case>A</fixed-case> Semantic Parsers with Only Synthetic Training Data + SileiXu + SinaSemnani + GiovanniCampagna + MonicaLam + 422–434 + 2020.emnlp-main.31 + + + A Spectral Method for Unsupervised Multi-Document Summarization + KexiangWang + BaobaoChang + ZhifangSui + 435–445 + 2020.emnlp-main.32 + + + What Have We Achieved on Text Summarization? 
+ DandanHuang + LeyangCui + SenYang + GuangshengBao + KunWang + JunXie + YueZhang + 446–469 + 2020.emnlp-main.33 + + + <fixed-case>Q</fixed-case>-learning with Language Model for Edit-based Unsupervised Summarization + RyosukeKohita + AkifumiWachi + YangZhao + RyukiTachibana + 470–484 + 2020.emnlp-main.34 + + + Friendly Topic Assistant for Transformer Based Abstractive Summarization + ZhengjueWang + ZhibinDuan + HaoZhang + ChaojieWang + LongTian + BoChen + MingyuanZhou + 485–497 + 2020.emnlp-main.35 + + + Contrastive Distillation on Intermediate Representations for Language Model Compression + SiqiSun + ZheGan + YuweiFang + YuCheng + ShuohangWang + JingjingLiu + 498–508 + 2020.emnlp-main.36 + + + <fixed-case>T</fixed-case>ernary<fixed-case>BERT</fixed-case>: Distillation-aware Ultra-low Bit <fixed-case>BERT</fixed-case> + WeiZhang + LuHou + YichunYin + LifengShang + XiaoChen + XinJiang + QunLiu + 509–521 + 2020.emnlp-main.37 + + + Self-Supervised Meta-Learning for Few-Shot Natural Language Classification Tasks + TrapitBansal + RishikeshJha + TsendsurenMunkhdalai + AndrewMcCallum + 522–534 + 2020.emnlp-main.38 + + + Efficient Meta Lifelong-Learning with Limited Memory + ZiruiWang + Sanket VaibhavMehta + BarnabasPoczos + JaimeCarbonell + 535–548 + 2020.emnlp-main.39 + + + On the Evaluation of Contextual Embeddings for Zero-Shot Cross-Lingual Transfer Learning + PhillipKeung + YichaoLu + JulianSalazar + VikasBhardwaj + 549–554 + 2020.emnlp-main.40 + + + A Supervised Word Alignment Method Based on Cross-Language Span Prediction Using Multilingual <fixed-case>BERT</fixed-case> + MasaakiNagata + KatsukiChousa + MasaakiNishino + 555–565 + 2020.emnlp-main.41 + + + Accurate Word Alignment Induction from Neural Machine Translation + YunChen + YangLiu + GuanhuaChen + XinJiang + QunLiu + 566–576 + 2020.emnlp-main.42 + + + <fixed-case>C</fixed-case>hr<fixed-case>E</fixed-case>n: <fixed-case>C</fixed-case>herokee-<fixed-case>E</fixed-case>nglish Machine Translation for 
Endangered Language Revitalization + ShiyueZhang + BenjaminFrey + MohitBansal + 577–595 + 2020.emnlp-main.43 + + + Unsupervised Discovery of Implicit Gender Bias + AnjalieField + YuliaTsvetkov + 596–608 + 2020.emnlp-main.44 + + + Condolences and Empathy in Online Communities + NaitianZhou + DavidJurgens + 609–626 + 2020.emnlp-main.45 + + + An Embedding Model for Estimating Legislative Preferences from the Frequency and Sentiment of Tweets + GregorySpell + BrianGuay + SunshineHillygus + LawrenceCarin + 627–641 + 2020.emnlp-main.46 + + + Measuring Information Propagation in Literary Social Networks + MatthewSims + DavidBamman + 642–652 + 2020.emnlp-main.47 + + + Social Chemistry 101: Learning to Reason about Social and Moral Norms + MaxwellForbes + Jena D.Hwang + VeredShwartz + MaartenSap + YejinChoi + 653–670 + 2020.emnlp-main.48 + + + Event Extraction by Answering (Almost) Natural Questions + XinyaDu + ClaireCardie + 671–683 + 2020.emnlp-main.49 + + + Connecting the Dots: Event Graph Schema Induction with Path Language Modeling + ManlingLi + QiZeng + YingLin + KyunghyunCho + HengJi + JonathanMay + NathanaelChambers + ClareVoss + 684–695 + 2020.emnlp-main.50 + + + Joint Constrained Learning for Event-Event Relation Extraction + HaoyuWang + MuhaoChen + HongmingZhang + DanRoth + 696–706 + 2020.emnlp-main.51 + + + Incremental Event Detection via Knowledge Consolidation Networks + PengfeiCao + YuboChen + JunZhao + TaifengWang + 707–717 + 2020.emnlp-main.52 + + + Semi-supervised New Event Type Induction and Event Detection + LifuHuang + HengJi + 718–724 + 2020.emnlp-main.53 + + + Language Generation with Multi-hop Reasoning on Commonsense Knowledge Graph + HaozheJi + PeiKe + ShaohanHuang + FuruWei + XiaoyanZhu + MinlieHuang + 725–736 + 2020.emnlp-main.54 + + + Reformulating Unsupervised Style Transfer as Paraphrase Generation + KalpeshKrishna + JohnWieting + MohitIyyer + 737–762 + 2020.emnlp-main.55 + 2020.emnlp-main.55.OptionalSupplementaryMaterial.zip + + + De-biased 
Court’s View Generation with Causality + YiquanWu + KunKuang + YatingZhang + XiaozhongLiu + ChanglongSun + JunXiao + YuetingZhuang + LuoSi + FeiWu + 763–780 + 2020.emnlp-main.56 + 2020.emnlp-main.56.OptionalSupplementaryMaterial.zip + + + <fixed-case>PAIR</fixed-case>: Planning and Iterative Refinement in Pre-trained Transformers for Long Text Generation + XinyuHua + LuWang + 781–793 + 2020.emnlp-main.57 + 2020.emnlp-main.57.OptionalSupplementaryMaterial.pdf + + + Backpropagation-based Decoding for Unsupervised Counterfactual and Abductive Reasoning + LianhuiQin + VeredShwartz + PeterWest + ChandraBhagavatula + Jena D.Hwang + RonanLe Bras + AntoineBosselut + YejinChoi + 794–805 + 2020.emnlp-main.58 + 2020.emnlp-main.58.OptionalSupplementaryMaterial.zip + + + Where Are You? Localization from Embodied Dialog + MeeraHahn + JacobKrantz + DhruvBatra + DeviParikh + JamesRehg + StefanLee + PeterAnderson + 806–822 + 2020.emnlp-main.59 + + + Learning to Represent Image and Text with Denotation Graphs + BowenZhang + HexiangHu + VihanJain + EugeneIe + FeiSha + 823–839 + 2020.emnlp-main.60 + + + <fixed-case>V</fixed-case>ideo2<fixed-case>C</fixed-case>ommonsense: Generating Commonsense Descriptions to Enrich Video Captioning + ZhiyuanFang + TejasGokhale + PratyayBanerjee + ChittaBaral + YezhouYang + 840–860 + 2020.emnlp-main.61 + + + Does My Multimodal Model Learn Cross-modal Interactions? It’s Harder to Tell than You Might Think! 
+ JackHessel + LillianLee + 861–877 + 2020.emnlp-main.62 + 2020.emnlp-main.62.OptionalSupplementaryMaterial.zip + + + <fixed-case>MUTANT</fixed-case>: A Training Paradigm for Out-of-Distribution Generalization in Visual Question Answering + TejasGokhale + PratyayBanerjee + ChittaBaral + YezhouYang + 878–892 + 2020.emnlp-main.63 + + + Mitigating Gender Bias for Neural Dialogue Generation with Adversarial Learning + HaochenLiu + WentaoWang + YiqiWang + HuiLiu + ZitaoLiu + JiliangTang + 893–903 + 2020.emnlp-main.64 + + + Will <fixed-case>I</fixed-case> Sound like Me? Improving Persona Consistency in Dialogues through Pragmatic Self-Consciousness + HyunwooKim + ByeongchangKim + GunheeKim + 904–916 + 2020.emnlp-main.65 + + + <fixed-case>TOD</fixed-case>-<fixed-case>BERT</fixed-case>: Pre-trained Natural Language Understanding for Task-Oriented Dialogue + Chien-ShengWu + Steven C.H.Hoi + RichardSocher + CaimingXiong + 917–929 + 2020.emnlp-main.66 + + + <fixed-case>R</fixed-case>i<fixed-case>SAWOZ</fixed-case>: A Large-Scale Multi-Domain <fixed-case>W</fixed-case>izard-of-<fixed-case>O</fixed-case>z Dataset with Rich Semantic Annotations for Task-Oriented Dialogue Modeling + JunQuan + ShianZhang + QianCao + ZizhongLi + DeyiXiong + 930–940 + 2020.emnlp-main.67 + + + Filtering Noisy Dialogue Corpora by Connectivity and Content Relatedness + ReinaAkama + ShoYokoi + JunSuzuki + KentaroInui + 941–958 + 2020.emnlp-main.68 + + + Latent Geographical Factors for Analyzing the Evolution of Dialects in Contact + YugoMurawaki + 959–976 + 2020.emnlp-main.69 + + + Predicting Reference: What Do Language Models Learn about Discourse Models? 
+ ShivaUpadhye + LeonBergen + AndrewKehler + 977–982 + 2020.emnlp-main.70 + + + Word Class Flexibility: A Deep Contextualized Approach + BaiLi + GuillaumeThomas + YangXu + FrankRudzicz + 983–994 + 2020.emnlp-main.71 + + + Shallow-to-Deep Training for Neural Machine Translation + BeiLi + ZiyangWang + HuiLiu + YufanJiang + QuanDu + TongXiao + HuizhenWang + JingboZhu + 995–1005 + 2020.emnlp-main.72 + + + Iterative Refinement in the Continuous Space for Non-Autoregressive Neural Machine Translation + JasonLee + RaphaelShu + KyunghyunCho + 1006–1015 + 2020.emnlp-main.73 + 2020.emnlp-main.73.OptionalSupplementaryMaterial.zip + + + Why Skip If You Can Combine: A Simple Knowledge Distillation Technique for Intermediate Layers + YimengWu + PeymanPassban + MehdiRezagholizadeh + QunLiu + 1016–1021 + 2020.emnlp-main.74 + + + Multi-task Learning for Multilingual Neural Machine Translation + YirenWang + ChengXiangZhai + HanyHassan + 1022–1034 + 2020.emnlp-main.75 + + + Token-level Adaptive Training for Neural Machine Translation + ShuhaoGu + JinchaoZhang + FandongMeng + YangFeng + WanyingXie + JieZhou + DongYu + 1035–1046 + 2020.emnlp-main.76 + + + Multi-Unit Transformers for Neural Machine Translation + JianhaoYan + FandongMeng + JieZhou + 1047–1059 + 2020.emnlp-main.77 + + + On the Sparsity of Neural Machine Translation Models + YongWang + LongyueWang + VictorLi + ZhaopengTu + 1060–1066 + 2020.emnlp-main.78 + + + Incorporating a Local Translation Mechanism into Non-autoregressive Translation + XiangKong + ZhisongZhang + EduardHovy + 1067–1073 + 2020.emnlp-main.79 + + + Self-Paced Learning for Neural Machine Translation + YuWan + BaosongYang + Derek F.Wong + YikaiZhou + Lidia S.Chao + HaiboZhang + BoxingChen + 1074–1080 + 2020.emnlp-main.80 + + + Long-Short Term Masking Transformer: A Simple but Effective Baseline for Document-level Neural Machine Translation + PeiZhang + BoxingChen + NiyuGe + KaiFan + 1081–1087 + 2020.emnlp-main.81 + 
2020.emnlp-main.81.OptionalSupplementaryMaterial.pdf + + + Generating Diverse Translation from Model Distribution with Dropout + XuanfuWu + YangFeng + ChenzeShao + 1088–1097 + 2020.emnlp-main.82 + + + Non-Autoregressive Machine Translation with Latent Alignments + ChitwanSaharia + WilliamChan + SaurabhSaxena + MohammadNorouzi + 1098–1108 + 2020.emnlp-main.83 + + + Look at the First Sentence: Position Bias in Question Answering + MiyoungKo + JinhyukLee + HyunjaeKim + GangwooKim + JaewooKang + 1109–1121 + 2020.emnlp-main.84 + + + <fixed-case>P</fixed-case>roto<fixed-case>QA</fixed-case>: A Question Answering Dataset for Prototypical Common-Sense Reasoning + MichaelBoratko + XiangLi + TimO’Gorman + RajarshiDas + DanLe + AndrewMcCallum + 1122–1136 + 2020.emnlp-main.85 + 2020.emnlp-main.85.OptionalSupplementaryMaterial.zip + + + <fixed-case>IIRC</fixed-case>: A Dataset of Incomplete Information Reading Comprehension Questions + JamesFerguson + MattGardner + HannanehHajishirzi + TusharKhot + PradeepDasigi + 1137–1147 + 2020.emnlp-main.86 + + + Unsupervised Adaptation of Question Answering Systems via Generative Self-training + StevenRennie + EtienneMarcheret + NeilMallinar + DavidNahamoo + VaibhavaGoel + 1148–1157 + 2020.emnlp-main.87 + + + <fixed-case>TORQUE</fixed-case>: A Reading Comprehension Dataset of Temporal Ordering Questions + QiangNing + HaoWu + RujunHan + NanyunPeng + MattGardner + DanRoth + 1158–1172 + 2020.emnlp-main.88 + + + <fixed-case>T</fixed-case>o<fixed-case>TT</fixed-case>o: A Controlled Table-To-Text Generation Dataset + AnkurParikh + XuezhiWang + SebastianGehrmann + ManaalFaruqui + BhuwanDhingra + DiyiYang + DipanjanDas + 1173–1186 + 2020.emnlp-main.89 + 2020.emnlp-main.89.OptionalSupplementaryMaterial.zip + + + Knowledge Graph Empowered Entity Description Generation + LiyingCheng + DekunWu + LidongBing + YanZhang + ZhanmingJie + WeiLu + LuoSi + 1187–1197 + 2020.emnlp-main.90 + + + Small but Mighty: New Benchmarks for Split and Rephrase + LiZhang + 
HuaiyuZhu + SiddharthaBrahma + YunyaoLi + 1198–1205 + 2020.emnlp-main.91 + + + Online Back-Parsing for <fixed-case>AMR</fixed-case>-to-Text Generation + XuefengBai + LinfengSong + YueZhang + 1206–1219 + 2020.emnlp-main.92 + + + Reading between the Lines: Exploring Infilling in Visual Narratives + Khyathi RaghaviChandu + Ruo-PingDong + Alan WBlack + 1220–1229 + 2020.emnlp-main.93 + + + Acrostic Poem Generation + RajatAgarwal + KatharinaKann + 1230–1240 + 2020.emnlp-main.94 + + + Local Additivity Based Data Augmentation for Semi-supervised <fixed-case>NER</fixed-case> + JiaaoChen + ZhenghuiWang + RanTian + ZichaoYang + DiyiYang + 1241–1251 + 2020.emnlp-main.95 + + + Grounded Compositional Outputs for Adaptive Language Modeling + NikolaosPappas + PhoebeMulcaire + Noah A.Smith + 1252–1267 + 2020.emnlp-main.96 + + + <fixed-case>SSMBA</fixed-case>: Self-Supervised Manifold Based Data Augmentation for Improving Out-of-Domain Robustness + NathanNg + KyunghyunCho + MarzyehGhassemi + 1268–1283 + 2020.emnlp-main.97 + + + <fixed-case>S</fixed-case>et<fixed-case>C</fixed-case>onv: A New Approach for Learning from Imbalanced Data + YangGao + Yi-FanLi + YuLin + CharuAggarwal + LatifurKhan + 1284–1294 + 2020.emnlp-main.98 + + + Scalable Multi-Hop Relational Reasoning for Knowledge-Aware Question Answering + YanlinFeng + XinyueChen + Bill YuchenLin + PeifengWang + JunYan + XiangRen + 1295–1309 + 2020.emnlp-main.99 + 2020.emnlp-main.99.OptionalSupplementaryMaterial.zip + + + Improving Bilingual Lexicon Induction for Low Frequency Words + JiajiHuang + XingyuCai + KennethChurch + 1310–1314 + 2020.emnlp-main.100 + + + Learning <fixed-case>VAE</fixed-case>-<fixed-case>LDA</fixed-case> Models with Rounded Reparameterization Trick + RunzhiTian + YongyiMao + RichongZhang + 1315–1325 + 2020.emnlp-main.101 + + + Calibrated Fine-Tuning for Pre-trained Language Models via Manifold Smoothing + LingkaiKong + HaomingJiang + YuchenZhuang + JieLyu + TuoZhao + ChaoZhang + 1326–1340 + 
2020.emnlp-main.102 + + + Scaling Hidden <fixed-case>M</fixed-case>arkov Language Models + JustinChiu + AlexanderRush + 1341–1349 + 2020.emnlp-main.103 + + + Coding Textual Inputs Boosts the Accuracy of Neural Networks + Abdul RafaeKhan + JiaXu + WeiweiSun + 1350–1360 + 2020.emnlp-main.104 + + + Learning from Task Descriptions + OrionWeller + NicholasLourie + MattGardner + MatthewPeters + 1361–1375 + 2020.emnlp-main.105 + + + Hashtags, Emotions, and Comments: A Large-Scale Dataset to Understand Fine-Grained Social Emotions to Online Topics + KeyangDing + JingLi + YujiZhang + 1376–1382 + 2020.emnlp-main.106 + + + Named Entity Recognition for Social Media Texts with Semantic Augmentation + YuyangNie + YuanheTian + XiangWan + YanSong + BoDai + 1383–1391 + 2020.emnlp-main.107 + + + Predicting Stance and Rumor Veracity via Dual Hierarchical Transformer with Pretrained Encoders + JianfeiYu + JingJiang + Ling Min SerenaKhoo + Hai LeongChieu + RuiXia + 1392–1401 + 2020.emnlp-main.108 + + + Social Media Attributions in the Context of Water Crisis + RupakSarkar + SayantanMahinder + HirakSarkar + AshiqurKhudaBukhsh + 1402–1412 + 2020.emnlp-main.109 + + + On the Reliability and Validity of Detecting Approval of Political Actors in Tweets + IndiraSen + FabianFlöck + ClaudiaWagner + 1413–1426 + 2020.emnlp-main.110 + + + Towards Medical Machine Reading Comprehension with Structural Knowledge and Plain Text + DongfangLi + BaotianHu + QingcaiChen + WeihuaPeng + AnqiWang + 1427–1438 + 2020.emnlp-main.111 + 2020.emnlp-main.111.OptionalSupplementaryMaterial.zip + + + Generating Radiology Reports via Memory-driven Transformer + ZhihongChen + YanSong + Tsung-HuiChang + XiangWan + 1439–1449 + 2020.emnlp-main.112 + + + Planning and Generating Natural and Diverse Disfluent Texts as Augmentation for Disfluency Detection + JingfengYang + DiyiYang + ZhaoranMa + 1450–1460 + 2020.emnlp-main.113 + + + Predicting Clinical Trial Results by Implicit Evidence Integration + QiaoJin + ChuanqiTan + 
MoshaChen + XiaozhongLiu + SongfangHuang + 1461–1477 + 2020.emnlp-main.114 + + + Explainable Clinical Decision Support from Text + JinyueFeng + ChantalShaib + FrankRudzicz + 1478–1489 + 2020.emnlp-main.115 + 2020.emnlp-main.115.OptionalSupplementaryMaterial.zip + + + A Knowledge-driven Generative Model for Multi-implication <fixed-case>C</fixed-case>hinese Medical Procedure Entity Normalization + JinghuiYan + YiningWang + LuXiang + YuZhou + ChengqingZong + 1490–1499 + 2020.emnlp-main.116 + + + <fixed-case>C</fixed-case>he<fixed-case>X</fixed-case>bert: Combining Automatic Labelers and Expert Annotations for Accurate Radiology Report Labeling Using <fixed-case>BERT</fixed-case> + AkshaySmit + SaahilJain + PranavRajpurkar + AnujPareek + AndrewNg + MatthewLungren + 1500–1519 + 2020.emnlp-main.117 + + + Benchmarking Meaning Representations in Neural Semantic Parsing + JiaqiGuo + QianLiu + Jian-GuangLou + ZhenwenLi + XueqingLiu + TaoXie + TingLiu + 1520–1540 + 2020.emnlp-main.118 + + + Analogous Process Structure Induction for Sub-event Sequence Prediction + HongmingZhang + MuhaoChen + HaoyuWang + YangqiuSong + DanRoth + 1541–1550 + 2020.emnlp-main.119 + + + <fixed-case>SLM</fixed-case>: Learning a Discourse Language Representation with Sentence Unshuffling + HaejunLee + Drew A.Hudson + KangwookLee + Christopher D.Manning + 1551–1562 + 2020.emnlp-main.120 + + + Detecting Fine-Grained Cross-Lingual Semantic Divergences without Supervision by Learning to Rank + EleftheriaBriakou + MarineCarpuat + 1563–1580 + 2020.emnlp-main.121 + + + A Bilingual Generative Transformer for Semantic Sentence Embedding + JohnWieting + GrahamNeubig + TaylorBerg-Kirkpatrick + 1581–1594 + 2020.emnlp-main.122 + + + Semantically Inspired <fixed-case>AMR</fixed-case> Alignment for the <fixed-case>P</fixed-case>ortuguese Language + RafaelAnchiêta + ThiagoPardo + 1595–1600 + 2020.emnlp-main.123 + + + An Unsupervised Sentence Embedding Method by Mutual Information Maximization + YanZhang + RuidanHe + 
ZuozhuLiu + Kwan HuiLim + LidongBing + 1601–1610 + 2020.emnlp-main.124 + + + Compositional Phrase Alignment and beyond + YukiArase + Jun’ichiTsujii + 1611–1623 + 2020.emnlp-main.125 + + + Table Fact Verification with Structure-Aware Transformer + HongzhiZhang + YingyaoWang + SiruiWang + XuezhiCao + FuzhengZhang + ZhongyuanWang + 1624–1629 + 2020.emnlp-main.126 + + + Double Graph Based Reasoning for Document-level Relation Extraction + ShuangZeng + RunxinXu + BaobaoChang + LeiLi + 1630–1640 + 2020.emnlp-main.127 + + + Event Extraction as Machine Reading Comprehension + JianLiu + YuboChen + KangLiu + WeiBi + XiaojiangLiu + 1641–1651 + 2020.emnlp-main.128 + + + <fixed-case>MAVEN</fixed-case>: A Massive General Domain Event Detection Dataset + XiaozhiWang + ZiqiWang + XuHan + WangyiJiang + RongHan + ZhiyuanLiu + JuanziLi + PengLi + YankaiLin + JieZhou + 1652–1671 + 2020.emnlp-main.129 + 2020.emnlp-main.129.OptionalSupplementaryMaterial.zip + + + Knowledge Graph Alignment with Entity-Pair Embedding + ZhichunWang + JinjianYang + XiaojuYe + 1672–1680 + 2020.emnlp-main.130 + + + Adaptive Attentional Network for Few-Shot Knowledge Graph Completion + JiaweiSheng + ShuGuo + ZhenyuChen + JuweiYue + LihongWang + TingwenLiu + HongboXu + 1681–1691 + 2020.emnlp-main.131 + + + Pre-training Entity Relation Encoder with Intra-span and Inter-span Information + YijunWang + ChangzhiSun + YuanbinWu + JunchiYan + PengGao + GuotongXie + 1692–1705 + 2020.emnlp-main.132 + + + Two Are Better than One: Joint Entity and Relation Extraction with Table-Sequence Encoders + JueWang + WeiLu + 1706–1721 + 2020.emnlp-main.133 + + + Beyond [<fixed-case>CLS</fixed-case>] through Ranking by Generation + CiceroNogueira dos Santos + XiaofeiMa + RameshNallapati + ZhihengHuang + BingXiang + 1722–1727 + 2020.emnlp-main.134 + + + Tired of Topic Models? Clusters of Pretrained Word Embeddings Make for Fast and Good Topics Too! 
+ SuzannaSia + AyushDalmia + Sabrina J.Mielke + 1728–1736 + 2020.emnlp-main.135 + 2020.emnlp-main.135.OptionalSupplementaryMaterial.zip + + + Multi-document Summarization with Maximal Marginal Relevance-guided Reinforcement Learning + YuningMao + YanruQu + YiqingXie + XiangRen + JiaweiHan + 1737–1751 + 2020.emnlp-main.136 + + + Improving Neural Topic Models Using Knowledge Distillation + Alexander MiserlisHoyle + PranavGoel + PhilipResnik + 1752–1771 + 2020.emnlp-main.137 + + + Short Text Topic Modeling with Topic Distribution Quantization and Negative Sampling Decoder + XiaobaoWu + ChunpingLi + YanZhu + YishuMiao + 1772–1782 + 2020.emnlp-main.138 + + + Querying across Genres to Retrieve Research That Supports Medical Claims Made in News + ChaoyuanZuo + NarayanAcharya + RitwikBanerjee + 1783–1789 + 2020.emnlp-main.139 + + + Incorporating Multimodal Information in Open-Domain Web Keyphrase Extraction + YansenWang + ZhenFan + CarolynRose + 1790–1800 + 2020.emnlp-main.140 + 2020.emnlp-main.140.OptionalSupplementaryMaterial.zip + + + <fixed-case>MOSEAS</fixed-case>: A Multimodal Language Dataset for <fixed-case>S</fixed-case>panish, <fixed-case>P</fixed-case>ortuguese, <fixed-case>G</fixed-case>erman and <fixed-case>F</fixed-case>rench + AmirAliBagher Zadeh + YanshengCao + SimonHessner + Paul PuLiang + SoujanyaPoria + Louis-PhilippeMorency + 1801–1812 + 2020.emnlp-main.141 + + + Combining Self-Training and Self-Supervised Learning for Unsupervised Disfluency Detection + ShaoleiWang + ZhongyuanWang + WanxiangChe + TingLiu + 1813–1822 + 2020.emnlp-main.142 + + + Multimodal Routing: Improving Local and Global Interpretability of Multimodal Language Analysis + Yao-Hung HubertTsai + MartinMa + MuqiaoYang + RuslanSalakhutdinov + Louis-PhilippeMorency + 1823–1833 + 2020.emnlp-main.143 + 2020.emnlp-main.143.OptionalSupplementaryMaterial.zip + + + Multistage Fusion with Forget Gate for Multimodal Summarization in Open-Domain Videos + NayuLiu + XianSun + HongfengYu + WenkaiZhang 
+ GuangluanXu + 1834–1845 + 2020.emnlp-main.144 + + + <fixed-case>B</fixed-case>i<fixed-case>ST</fixed-case>: Bi-directional Spatio-Temporal Reasoning for Video-Grounded Dialogues + HungLe + DoyenSahoo + NancyChen + Steven C.H.Hoi + 1846–1859 + 2020.emnlp-main.145 + + + <fixed-case>U</fixed-case>ni<fixed-case>C</fixed-case>onv: A Unified Conversational Neural Architecture for Multi-domain Task-oriented Dialogues + HungLe + DoyenSahoo + ChenghaoLiu + NancyChen + Steven C.H.Hoi + 1860–1877 + 2020.emnlp-main.146 + + + <fixed-case>G</fixed-case>raph<fixed-case>D</fixed-case>ialog: Integrating Graph Knowledge into End-to-End Task-Oriented Dialogue Systems + ShiquanYang + RuiZhang + SarahErfani + 1878–1888 + 2020.emnlp-main.147 + + + Structured Attention for Unsupervised Dialogue Structure Induction + LiangQiu + YizhouZhao + WeiyanShi + YuanLiang + FengShi + TaoYuan + ZhouYu + Song-ChunZhu + 1889–1899 + 2020.emnlp-main.148 + 2020.emnlp-main.148.OptionalSupplementaryMaterial.zip + + + Cross Copy Network for Dialogue Generation + ChangzhenJi + XinZhou + YatingZhang + XiaozhongLiu + ChanglongSun + ConghuiZhu + TiejunZhao + 1900–1910 + 2020.emnlp-main.149 + 2020.emnlp-main.149.OptionalSupplementaryMaterial.rar + + + Multi-turn Response Selection Using Dialogue Dependency Relations + QiJia + YizhuLiu + SiyuRen + KennyZhu + HaifengTang + 1911–1920 + 2020.emnlp-main.150 + + + Parallel Interactive Networks for Multi-Domain Dialogue State Generation + JunfanChen + RichongZhang + YongyiMao + JieXu + 1921–1931 + 2020.emnlp-main.151 + + + <fixed-case>S</fixed-case>lot<fixed-case>R</fixed-case>efine: A Fast Non-Autoregressive Model for Joint Intent Detection and Slot Filling + DiWu + LiangDing + FanLu + JianXie + 1932–1937 + 2020.emnlp-main.152 + + + An Information Bottleneck Approach for Controlling Conciseness in Rationale Extraction + BhargaviParanjape + MandarJoshi + JohnThickstun + HannanehHajishirzi + LukeZettlemoyer + 1938–1952 + 2020.emnlp-main.153 + + + 
<fixed-case>C</fixed-case>row<fixed-case>S</fixed-case>-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models + NikitaNangia + ClaraVania + RasikaBhalerao + Samuel R.Bowman + 1953–1967 + 2020.emnlp-main.154 + 2020.emnlp-main.154.OptionalSupplementaryMaterial.zip + + + <fixed-case>LOGAN</fixed-case>: Local Group Bias Detection by Clustering + JieyuZhao + Kai-WeiChang + 1968–1977 + 2020.emnlp-main.155 + + + <fixed-case>RNN</fixed-case>s Can Generate Bounded Hierarchical Languages with Optimal Memory + JohnHewitt + MichaelHahn + SuryaGanguli + PercyLiang + Christopher D.Manning + 1978–2010 + 2020.emnlp-main.156 + 2020.emnlp-main.156.OptionalSupplementaryMaterial.tgz + + + Detecting Independent Pronoun Bias with Partially-Synthetic Data Generation + RobertMunro + Alex (Carmen)Morrison + 2011–2017 + 2020.emnlp-main.157 + + + Visually Grounded Continual Learning of Compositional Phrases + XisenJin + JunyiDu + ArkaSadhu + RamNevatia + XiangRen + 2018–2029 + 2020.emnlp-main.158 + + + An Effective Framework for Weakly-Supervised Phrase Grounding + QinxinWang + HaoTan + ShengShen + MichaelMahoney + ZheweiYao + 2030–2038 + 2020.emnlp-main.159 + 2020.emnlp-main.159.OptionalSupplementaryMaterial.zip + + + Finding Domain-Specific Grounding in Noisy Visual-Textual Documents + GregoryYauney + JackHessel + DavidMimno + 2039–2045 + 2020.emnlp-main.160 + + + Hero: Hierarchical Encoder for <fixed-case>V</fixed-case>ideo+<fixed-case>L</fixed-case>anguage Omni-representation Pre-training + LinjieLi + Yen-ChunChen + YuCheng + ZheGan + LichengYu + JingjingLiu + 2046–2065 + 2020.emnlp-main.161 + + + Vokenization: Improving Language Understanding via Contextualized, Visually-Grounded Supervision + HaoTan + MohitBansal + 2066–2080 + 2020.emnlp-main.162 + + + Detecting Cross-Modal Inconsistency to Defend against Neural Fake News + ReubenTan + BryanPlummer + KateSaenko + 2081–2106 + 2020.emnlp-main.163 + + + Enhancing Aspect Term Extraction with Soft Prototypes + 
ZhuangChen + TieyunQian + 2107–2117 + 2020.emnlp-main.164 + + + <fixed-case>F</fixed-case>ed<fixed-case>ED</fixed-case>: Federated Learning via Ensemble Distillation for Medical Relation Extraction + DianboSui + YuboChen + JunZhao + YantaoJia + YuantaoXie + WeijianSun + 2118–2128 + 2020.emnlp-main.165 + + + Multimodal Joint Attribute Prediction and Value Extraction for <fixed-case>E</fixed-case>-commerce Product + TiangangZhu + YueWang + HaoranLi + YouzhengWu + XiaodongHe + BowenZhou + 2129–2139 + 2020.emnlp-main.166 + + + A Predicate-Function-Argument Annotation of Natural Language for Open-Domain Information Expression + MingmingSun + WenyueHua + ZoeyLiu + XinWang + KangjieZheng + PingLi + 2140–2150 + 2020.emnlp-main.167 + + + Retrofitting Structure-aware Transformer Language Model for End Tasks + HaoFei + YafengRen + DonghongJi + 2151–2161 + 2020.emnlp-main.168 + + + Lightweight, Dynamic Graph Convolutional Networks for <fixed-case>AMR</fixed-case>-to-Text Generation + YanZhang + ZhijiangGuo + ZhiyangTeng + WeiLu + Shay B.Cohen + ZuozhuLiu + LidongBing + 2162–2172 + 2020.emnlp-main.169 + + + If Beam Search Is the Answer, What Was the Question? + ClaraMeister + RyanCotterell + TimVieira + 2173–2185 + 2020.emnlp-main.170 + + + Understanding the Mechanics of <fixed-case>SPIGOT</fixed-case>: Surrogate Gradients for Latent Structure Learning + TsvetomilaMihaylova + VladNiculae + André F. T.Martins + 2186–2202 + 2020.emnlp-main.171 + + + Is the Best Better? 
<fixed-case>B</fixed-case>ayesian Statistical Model Comparison for Natural Language Processing + PiotrSzymański + KyleGorman + 2203–2212 + 2020.emnlp-main.172 + 2020.emnlp-main.172.OptionalSupplementaryMaterial.zip + + + Multi-Task Learning for Logically Dependent Tasks from the Perspective of Causal Inference + WenqingChen + JidongTian + LiqiangXiao + HaoHe + YaohuiJin + 2213–2225 + 2020.emnlp-main.173 + + + Masking as an Efficient Alternative to Finetuning for Pretrained Language Models + MengjieZhao + TaoLin + FeiMi + MartinJaggi + HinrichSchütze + 2226–2241 + 2020.emnlp-main.174 + + + Dynamic Context Selection for Document-level Neural Machine Translation via Reinforcement Learning + XiaomianKang + YangZhao + JiajunZhang + ChengqingZong + 2242–2254 + 2020.emnlp-main.175 + + + Data Rejuvenation: Exploiting Inactive Training Examples for Neural Machine Translation + WenxiangJiao + XingWang + ShilinHe + IrwinKing + MichaelLyu + ZhaopengTu + 2255–2266 + 2020.emnlp-main.176 + + + Targeted Finetuning for <fixed-case>NMT</fixed-case> with Conditional Generative-Discriminative Loss + PrathyushaJwalapuram + ShafiqJoty + YoulinShen + 2267–2279 + 2020.emnlp-main.177 + + + Learning Adaptive Segmentation Policy for Simultaneous Translation + RuiqingZhang + ChuanqiangZhang + ZhongjunHe + HuaWu + HaifengWang + 2280–2289 + 2020.emnlp-main.178 + + + Learn to Cross-lingual Transfer with Meta Graph Learning across Heterogeneous Languages + ZhengLi + MukulKumar + WilliamHeadden + BingYin + YingWei + YuZhang + QiangYang + 2290–2301 + 2020.emnlp-main.179 + 2020.emnlp-main.179.OptionalSupplementaryMaterial.zip + + + <fixed-case>UD</fixed-case>apter: Language Adaptation for Truly <fixed-case>U</fixed-case>niversal <fixed-case>D</fixed-case>ependency Parsing + AhmetÜstün + AriannaBisazza + GosseBouma + Gertjanvan Noord + 2302–2315 + 2020.emnlp-main.180 + + + Uncertainty-Aware Label Refinement for Sequence Labeling + TaoGui + JiachengYe + QiZhang + ZhengyanLi + ZichuFei + YeyunGong + 
XuanjingHuang + 2316–2326 + 2020.emnlp-main.181 + + + Adversarial Attack and Defense of Structured Prediction Models + WenjuanHan + LiwenZhang + YongJiang + KeweiTu + 2327–2338 + 2020.emnlp-main.182 + + + Position-Aware Tagging for Aspect Sentiment Triplet Extraction + LuXu + HaoLi + WeiLu + LidongBing + 2339–2349 + 2020.emnlp-main.183 + 2020.emnlp-main.183.OptionalSupplementaryMaterial.zip + + + Simultaneous Machine Translation with Visual Context + OzanCaglayan + JuliaIve + VenetaHaralampieva + PranavaMadhyastha + LoïcBarrault + LuciaSpecia + 2350–2361 + 2020.emnlp-main.184 + + + <fixed-case>XCOPA</fixed-case>: A Multilingual Dataset for Causal Commonsense Reasoning + Edoardo MariaPonti + GoranGlavaš + OlgaMajewska + QianchuLiu + IvanVulić + AnnaKorhonen + 2362–2376 + 2020.emnlp-main.185 + + + The Secret Is in the Spectra: Predicting Cross-lingual Task Performance with Spectral Similarity Measures + HaimDubossarsky + IvanVulić + RoiReichart + AnnaKorhonen + 2377–2390 + 2020.emnlp-main.186 + 2020.emnlp-main.186.OptionalSupplementaryMaterial.zip + + + Bridging Linguistic Typology and Multilingual Machine Translation with Multi-view Language Representations + ArturoOncevay + BarryHaddow + AlexandraBirch + 2391–2406 + 2020.emnlp-main.187 + + + <fixed-case>A</fixed-case>nswer<fixed-case>F</fixed-case>act: Fact Checking in Product Question Answering + WenxuanZhang + YangDeng + JingMa + WaiLam + 2407–2417 + 2020.emnlp-main.188 + + + Context-Aware Answer Extraction in Question Answering + YeonSeonwoo + Ji-HoonKim + Jung-WooHa + AliceOh + 2418–2428 + 2020.emnlp-main.189 + + + What Do Models Learn from Question Answering Datasets? 
+ PriyankaSen + AmirSaffari + 2429–2438 + 2020.emnlp-main.190 + + + Discern: Discourse-Aware Entailment Reasoning Network for Conversational Machine Reading + YifanGao + Chien-ShengWu + JingjingLi + ShafiqJoty + Steven C.H.Hoi + CaimingXiong + IrwinKing + MichaelLyu + 2439–2449 + 2020.emnlp-main.191 + + + A Method for Building a Commonsense Inference Dataset Based on Basic Events + KazumasaOmura + DaisukeKawahara + SadaoKurohashi + 2450–2460 + 2020.emnlp-main.192 + + + Neural Deepfake Detection with Factual Structure of Text + WanjunZhong + DuyuTang + ZenanXu + RuizeWang + NanDuan + MingZhou + JiahaiWang + JianYin + 2461–2470 + 2020.emnlp-main.193 + + + <fixed-case>M</fixed-case>ulti<fixed-case>CQA</fixed-case>: Zero-Shot Transfer of Self-Supervised Text Matching Models on a Massive Scale + AndreasRücklé + JonasPfeiffer + IrynaGurevych + 2471–2486 + 2020.emnlp-main.194 + + + Enabling Cross-Lingual <fixed-case>AMR</fixed-case> Parsing with Transfer Learning Techniques + RexhinaBlloshmi + RoccoTripodi + RobertoNavigli + 2487–2500 + 2020.emnlp-main.195 + + + Improving <fixed-case>AMR</fixed-case> Parsing with Sequence-to-Sequence Pre-training + DongqinXu + JunhuiLi + MuhuaZhu + MinZhang + GuodongZhou + 2501–2511 + 2020.emnlp-main.196 + + + Hate-Speech and Offensive Language Detection in <fixed-case>R</fixed-case>oman <fixed-case>U</fixed-case>rdu + HammadRizwan + Muhammad HaroonShakeel + AsimKarim + 2512–2522 + 2020.emnlp-main.197 + + + Suicidal Risk Detection for Military Personnel + SungjoonPark + KiwoongPark + JaimeenAhn + AliceOh + 2523–2531 + 2020.emnlp-main.198 + + + Comparative Evaluation of Label Agnostic Selection Bias in Multilingual Hate Speech Datasets + NedjmaOusidhoum + YangqiuSong + Dit-YanYeung + 2532–2542 + 2020.emnlp-main.199 + + + <fixed-case>HENIN</fixed-case>: Learning Heterogeneous Neural Interaction Networks for Explainable Cyberbullying Detection on Social Media + Hsin-YuChen + Cheng-TeLi + 2543–2552 + 2020.emnlp-main.200 + + + 
<fixed-case>I</fixed-case> Was Just Being Sarcastic! Reactive Supervision: A New Method for Collecting Sarcasm Data + BoazShmueli + Lun-WeiKu + SoumyaRay + 2553–2559 + 2020.emnlp-main.201 + + + Self-Induced Curriculum Learning in Self-Supervised Neural Machine Translation + DanaRuiter + Josefvan Genabith + CristinaEspaña-Bonet + 2560–2571 + 2020.emnlp-main.202 + + + Towards Reasonably-Sized Character-Level Transformer <fixed-case>NMT</fixed-case> by Finetuning Subword Systems + JindřichLibovický + AlexanderFraser + 2572–2579 + 2020.emnlp-main.203 + 2020.emnlp-main.203.OptionalSupplementaryMaterial.tgz + + + Transfer Learning and Distant Supervision for Multilingual Transformer Models: A Study on <fixed-case>A</fixed-case>frican Languages + Michael A.Hedderich + DavidAdelani + DaweiZhu + JesujobaAlabi + UdiaMarkus + DietrichKlakow + 2580–2591 + 2020.emnlp-main.204 + 2020.emnlp-main.204.OptionalSupplementaryMaterial.pdf + + + Translation Quality Estimation by Jointly Learning to Score and Rank + JingyiZhang + Josefvan Genabith + 2592–2598 + 2020.emnlp-main.205 + + + Direct Segmentation Models for Streaming Speech Translation + JavierIranzo-Sánchez + AdriàGiménez Pastor + Joan AlbertSilvestre-Cerdà + PauBaquero-Arnal + JorgeCivera Saiz + AlfonsJuan + 2599–2611 + 2020.emnlp-main.206 + 2020.emnlp-main.206.OptionalSupplementaryMaterial.zip + + + Not Low-Resource Anymore: Aligner Ensembling, Batch Filtering, and New Datasets for <fixed-case>B</fixed-case>engali-<fixed-case>E</fixed-case>nglish Machine Translation + TahmidHasan + AbhikBhattacharjee + KaziSamin + MasumHasan + MadhusudanBasak + M. 
SohelRahman + RifatShahriyar + 2612–2623 + 2020.emnlp-main.207 + + + <fixed-case>CSP</fixed-case>: Code-Switching Pre-training for Neural Machine Translation + ZhenYang + BojieHu + AmbyeraHan + ShenHuang + QiJu + 2624–2636 + 2020.emnlp-main.208 + + + Type <fixed-case>B</fixed-case> Reflexivization as an Unambiguous Testbed for Multilingual Multi-Task Gender Bias + Ana ValeriaGonzález + MariaBarrett + RasmusHvingelby + KellieWebster + AndersSøgaard + 2637–2648 + 2020.emnlp-main.209 + + + Pre-training Multilingual Neural Machine Translation by Leveraging Alignment Information + ZehuiLin + XiaoPan + MingxuanWang + XipengQiu + JiangtaoFeng + HaoZhou + LeiLi + 2649–2663 + 2020.emnlp-main.210 + + + Losing Heads in the Lottery: Pruning Transformer Attention in Neural Machine Translation + MaximilianaBehnke + KennethHeafield + 2664–2674 + 2020.emnlp-main.211 + + + Towards Enhancing Faithfulness for Neural Machine Translation + RongxiangWeng + HengYu + XiangpengWei + WeihuaLuo + 2675–2684 + 2020.emnlp-main.212 + + + <fixed-case>COMET</fixed-case>: A Neural Framework for <fixed-case>MT</fixed-case> Evaluation + RicardoRei + CraigStewart + Ana CFarinha + AlonLavie + 2685–2702 + 2020.emnlp-main.213 + + + Reusing a Pretrained Language Model on Languages with Limited Corpora for Unsupervised <fixed-case>NMT</fixed-case> + AlexandraChronopoulou + DarioStojanovski + AlexanderFraser + 2703–2711 + 2020.emnlp-main.214 + 2020.emnlp-main.214.OptionalSupplementaryMaterial.zip + + + <fixed-case>LNM</fixed-case>ap: Departures from Isomorphic Assumption in Bilingual Lexicon Induction through Non-Linear Mapping in Latent Space + TasnimMohiuddin + M SaifulBari + ShafiqJoty + 2712–2723 + 2020.emnlp-main.215 + + + Uncertainty-Aware Semantic Augmentation for Neural Machine Translation + XiangpengWei + HengYu + YueHu + RongxiangWeng + LuxiXing + WeihuaLuo + 2724–2735 + 2020.emnlp-main.216 + 2020.emnlp-main.216.OptionalSupplementaryMaterial.zip + + + Can Automatic Post-Editing Improve 
<fixed-case>NMT</fixed-case>? + ShamilChollampatt + Raymond HendySusanto + LilingTan + EwaSzymanska + 2736–2746 + 2020.emnlp-main.217 + + + Parsing Gapping Constructions Based on Grammatical and Semantic Roles + YoshihideKato + ShigekiMatsubara + 2747–2752 + 2020.emnlp-main.218 + + + Span-based Discontinuous Constituency Parsing: A Family of Exact Chart-based Algorithms with Time Complexities from <fixed-case>O</fixed-case>(nˆ6) Down to <fixed-case>O</fixed-case>(nˆ3) + CaioCorro + 2753–2764 + 2020.emnlp-main.219 + 2020.emnlp-main.219.OptionalSupplementaryMaterial.pdf + + + Some Languages Seem Easier to Parse Because Their Treebanks Leak + AndersSøgaard + 2765–2770 + 2020.emnlp-main.220 + 2020.emnlp-main.220.OptionalSupplementaryMaterial.zip + + + Discontinuous Constituent Parsing as Sequence Labeling + DavidVilares + CarlosGómez-Rodríguez + 2771–2785 + 2020.emnlp-main.221 + + + Modularized Syntactic Neural Networks for Sentence Classification + HaiyanWu + YingLiu + ShaoyunShi + 2786–2792 + 2020.emnlp-main.222 + + + <fixed-case>TED</fixed-case>-<fixed-case>CDB</fixed-case>: A Large-Scale <fixed-case>C</fixed-case>hinese Discourse Relation Dataset on <fixed-case>TED</fixed-case> Talks + WanqiuLong + BonnieWebber + DeyiXiong + 2793–2803 + 2020.emnlp-main.223 + + + <fixed-case>QAD</fixed-case>iscourse - Discourse Relations as <fixed-case>QA</fixed-case> Pairs: Representation, Crowdsourcing and Baselines + ValentinaPyatkin + AyalKlein + ReutTsarfaty + IdoDagan + 2804–2819 + 2020.emnlp-main.224 + + + Discourse Self-Attention for Discourse Element Identification in Argumentative Student Essays + WeiSong + ZiyaoSong + RuijiFu + LizhenLiu + MiaomiaoCheng + TingLiu + 2820–2830 + 2020.emnlp-main.225 + 2020.emnlp-main.225.OptionalSupplementaryMaterial.zip + + + Controllable Story Generation with External Knowledge Using Large-Scale Language Models + PengXu + MostofaPatwary + MohammadShoeybi + RaulPuri + PascaleFung + AnimaAnandkumar + BryanCatanzaro + 2831–2845 + 
2020.emnlp-main.226 + + + Incomplete Utterance Rewriting as Semantic Segmentation + QianLiu + BeiChen + Jian-GuangLou + BinZhou + DongmeiZhang + 2846–2857 + 2020.emnlp-main.227 + + + Improving Grammatical Error Correction Models with Purpose-Built Adversarial Examples + LihaoWang + XiaoqingZheng + 2858–2869 + 2020.emnlp-main.228 + + + Homophonic Pun Generation with Lexically Constrained Rewriting + ZhiweiYu + HongyuZang + XiaojunWan + 2870–2876 + 2020.emnlp-main.229 + + + How to Make Neural Natural Language Generation as Reliable as Templates in Task-Oriented Dialogue + HenryElder + AlexanderO’Connor + JenniferFoster + 2877–2888 + 2020.emnlp-main.230 + + + Multilingual <fixed-case>AMR</fixed-case>-to-Text Generation + AngelaFan + ClaireGardent + 2889–2901 + 2020.emnlp-main.231 + + + Exploring the Linear Subspace Hypothesis in Gender Bias Mitigation + FranciscoVargas + RyanCotterell + 2902–2913 + 2020.emnlp-main.232 + 2020.emnlp-main.232.OptionalSupplementaryMaterial.zip + + + Lifelong Language Knowledge Distillation + Yung-SungChuang + Shang-YuSu + Yun-NungChen + 2914–2924 + 2020.emnlp-main.233 + + + Sparse Parallel Training for Hierarchical <fixed-case>D</fixed-case>irichlet Process Topic Models + AlexanderTerenin + MånsMagnusson + LeifJonsson + 2925–2934 + 2020.emnlp-main.234 + 2020.emnlp-main.234.OptionalSupplementaryMaterial.zip + + + Multi-label Few/Zero-shot Learning with Knowledge Aggregated from Multiple Label Graphs + JueqingLu + LanDu + MingLiu + JoannaDipnall + 2935–2943 + 2020.emnlp-main.235 + + + Word Rotator’s Distance + ShoYokoi + RyoTakahashi + ReinaAkama + JunSuzuki + KentaroInui + 2944–2960 + 2020.emnlp-main.236 + + + Disentangle-based Continual Graph Representation Learning + XiaoyuKou + YankaiLin + ShaoboLiu + PengLi + JieZhou + YanZhang + 2961–2972 + 2020.emnlp-main.237 + + + Semi-Supervised Bilingual Lexicon Induction with Two-way Interaction + XuZhao + ZihaoWang + HaoWu + YongZhang + 2973–2984 + 2020.emnlp-main.238 + 
2020.emnlp-main.238.OptionalSupplementaryMaterial.zip + + + <fixed-case>W</fixed-case>asserstein Distance Regularized Sequence Representation for Text Matching in Asymmetrical Domains + WeijieYu + ChenXu + JunXu + LiangPang + XiaopengGao + XiaozhaoWang + Ji-RongWen + 2985–2994 + 2020.emnlp-main.239 + + + A Simple Approach to Learning Unsupervised Multilingual Embeddings + PratikJawanpuria + MayankMeghwanshi + BamdevMishra + 2995–3001 + 2020.emnlp-main.240 + + + Bootstrapped <fixed-case>Q</fixed-case>-learning with Context Relevant Observation Pruning to Generalize in Text-based Games + SubhajitChaudhury + DaikiKimura + KartikTalamadupula + MichiakiTatsubori + AsimMunawar + RyukiTachibana + 3002–3008 + 2020.emnlp-main.241 + + + <fixed-case>BERT</fixed-case>-<fixed-case>EMD</fixed-case>: Many-to-Many Layer Mapping for <fixed-case>BERT</fixed-case> Compression with Earth Mover’s Distance + JianquanLi + XiaokangLiu + HonghongZhao + RuifengXu + MinYang + YaohongJin + 3009–3018 + 2020.emnlp-main.242 + + + Slot Attention with Value Normalization for Multi-domain Dialogue State Tracking + YexiangWang + YiGuo + SiqiZhu + 3019–3028 + 2020.emnlp-main.243 + + + Don’t Read Too Much into It: Adaptive Computation for Open-Domain Question Answering + YuxiangWu + SebastianRiedel + PasqualeMinervini + PontusStenetorp + 3029–3039 + 2020.emnlp-main.244 + + + Multi-Step Inference for Reasoning over Paragraphs + JiangmingLiu + MattGardner + Shay B.Cohen + MirellaLapata + 3040–3050 + 2020.emnlp-main.245 + + + Learning a Cost-Effective Annotation Policy for Question Answering + BernhardKratzwald + StefanFeuerriegel + HuanSun + 3051–3062 + 2020.emnlp-main.246 + + + Scene Restoring for Narrative Machine Reading Comprehension + ZhixingTian + YuanzheZhang + KangLiu + JunZhao + YantaoJia + ZhichengSheng + 3063–3073 + 2020.emnlp-main.247 + + + A Simple and Effective Model for Answering Multi-span Questions + EladSegal + AviaEfrat + MorShoham + AmirGloberson + JonathanBerant + 3074–3080 + 
2020.emnlp-main.248 + + + Top-Rank-Focused Adaptive Vote Collection for the Evaluation of Domain-Specific Semantic Models + PierangeloLombardo + AlessioBoiardi + LucaColombo + AngeloSchiavone + NicolòTamagnone + 3081–3093 + 2020.emnlp-main.249 + + + Meta Fine-Tuning Neural Language Models for Multi-Domain Text Mining + ChengyuWang + MinghuiQiu + JunHuang + XiaofengHe + 3094–3104 + 2020.emnlp-main.250 + + + Incorporating Context Structures for Query Generation + Ruey-ChengChen + Chia-JungLee + 3105–3110 + 2020.emnlp-main.251 + + + Conditional Causal Relationships between Emotions and Causes in Texts + XinhongChen + QingLi + JianpingWang + 3111–3121 + 2020.emnlp-main.252 + 2020.emnlp-main.252.OptionalSupplementaryMaterial.zip + + + <fixed-case>COMETA</fixed-case>: A Corpus for Medical Entity Linking in the Social Media + MarcoBasaldella + FangyuLiu + EhsanShareghi + NigelCollier + 3122–3137 + 2020.emnlp-main.253 + + + <fixed-case>P</fixed-case>areto Probing: Trading-Off Accuracy and Complexity + TiagoPimentel + NaomiSaphra + AdinaWilliams + RyanCotterell + 3138–3153 + 2020.emnlp-main.254 + + + Interpretation of <fixed-case>NLP</fixed-case> Models through Input Marginalization + SiwonKim + JihunYi + EunjiKim + SungrohYoon + 3154–3167 + 2020.emnlp-main.255 + + + Generating Label Cohesive and Well-Formed Adversarial Claims + PepaAtanasova + DustinWright + IsabelleAugenstein + 3168–3177 + 2020.emnlp-main.256 + + + Are All Good Word Vector Spaces Isomorphic? 
+ IvanVulić + SebastianRuder + AndersSøgaard + 3178–3192 + 2020.emnlp-main.257 + 2020.emnlp-main.257.OptionalSupplementaryMaterial.zip + + + Cold-start and Interpretability: Turning Regular Expressions into Trainable Recurrent Neural Networks + ChengyueJiang + YinggongZhao + ShanboChu + LibinShen + KeweiTu + 3193–3207 + 2020.emnlp-main.258 + + + When <fixed-case>BERT</fixed-case> Plays the Lottery, All Tickets Are Winning + SaiPrasanna + AnnaRogers + AnnaRumshisky + 3208–3229 + 2020.emnlp-main.259 + + + On the Weak Link between Importance and Prunability of Attention Heads + AakritiBudhraja + MadhuraPande + PrekshaNema + PratyushKumar + Mitesh M.Khapra + 3230–3235 + 2020.emnlp-main.260 + + + Towards Interpreting <fixed-case>BERT</fixed-case> for Reading Comprehension Based <fixed-case>QA</fixed-case> + SahanaRamnath + PrekshaNema + DeepSahni + Mitesh M.Khapra + 3236–3242 + 2020.emnlp-main.261 + + + How Do Decisions Emerge across Layers in Neural Models? Interpretation with Differentiable Masking + NicolaDe Cao + Michael SejrSchlichtkrull + WilkerAziz + IvanTitov + 3243–3255 + 2020.emnlp-main.262 + 2020.emnlp-main.262.OptionalSupplementaryMaterial.zip + + + A Diagnostic Study of Explainability Techniques for Text Classification + PepaAtanasova + Jakob GrueSimonsen + ChristinaLioma + IsabelleAugenstein + 3256–3274 + 2020.emnlp-main.263 + + + <fixed-case>STL</fixed-case>-<fixed-case>CQA</fixed-case>: Structure-based Transformers with Localization and Encoding for Chart Question Answering + HriturajSingh + SumitShekhar + 3275–3284 + 2020.emnlp-main.264 + 2020.emnlp-main.264.OptionalSupplementaryMaterial.zip + + + Learning to Contrast the Counterfactual Samples for Robust Visual Question Answering + ZujieLiang + WeitaoJiang + HaifengHu + JiayingZhu + 3285–3292 + 2020.emnlp-main.265 + + + Learning Physical Common Sense as Knowledge Graph Completion via <fixed-case>BERT</fixed-case> Data Augmentation and Constrained Tucker Factorization + ZhenjieZhao + 
EvangelosPapalexakis + XiaojuanMa + 3293–3298 + 2020.emnlp-main.266 + + + A Visually-grounded First-person Dialogue Dataset with Verbal and Non-verbal Responses + HisashiKamezawa + NorikiNishida + NobuyukiShimizu + TakashiMiyazaki + HidekiNakayama + 3299–3310 + 2020.emnlp-main.267 + + + Cross-Media Keyphrase Prediction: A Unified Framework with Multi-Modality Multi-Head Attention and Image Wordings + YueWang + JingLi + MichaelLyu + IrwinKing + 3311–3324 + 2020.emnlp-main.268 + + + <fixed-case>VD</fixed-case>-<fixed-case>BERT</fixed-case>: A Unified Vision and Dialog Transformer with <fixed-case>BERT</fixed-case> + YueWang + ShafiqJoty + MichaelLyu + IrwinKing + CaimingXiong + Steven C.H.Hoi + 3325–3338 + 2020.emnlp-main.269 + + + The Grammar of Emergent Languages + Oskarvan der Wal + Silvande Boer + EliaBruni + DieuwkeHupkes + 3339–3359 + 2020.emnlp-main.270 + + + Sub-Instruction Aware Vision-and-Language Navigation + YicongHong + CristianRodriguez + QiWu + StephenGould + 3360–3376 + 2020.emnlp-main.271 + + + Knowledge-Grounded Dialogue Generation with Pre-trained Language Models + XueliangZhao + WeiWu + CanXu + ChongyangTao + DongyanZhao + RuiYan + 3377–3390 + 2020.emnlp-main.272 + + + <fixed-case>M</fixed-case>in<fixed-case>TL</fixed-case>: Minimalist Transfer Learning for Task-Oriented Dialogue Systems + ZhaojiangLin + AndreaMadotto + Genta IndraWinata + PascaleFung + 3391–3405 + 2020.emnlp-main.273 + + + Variational Hierarchical Dialog Autoencoder for Dialog State Tracking Data Augmentation + Kang MinYoo + HanbitLee + FranckDernoncourt + TrungBui + WalterChang + Sang-gooLee + 3406–3425 + 2020.emnlp-main.274 + + + Bridging the Gap between Prior and Posterior Knowledge Selection for Knowledge-Grounded Dialogue Generation + XiuyiChen + FandongMeng + PengLi + FeilongChen + ShuangXu + BoXu + JieZhou + 3426–3437 + 2020.emnlp-main.275 + + + Counterfactual Off-Policy Training for Neural Dialogue Generation + QingfuZhu + Wei-NanZhang + TingLiu + William YangWang + 
3438–3448 + 2020.emnlp-main.276 + + + Dialogue Distillation: Open-domain Dialogue Augmentation Using Unpaired Data + RongshengZhang + YinheZheng + JianzhiShao + XiaoxiMao + YadongXi + MinlieHuang + 3449–3460 + 2020.emnlp-main.277 + + + Task-Completion Dialogue Policy Learning via <fixed-case>M</fixed-case>onte <fixed-case>C</fixed-case>arlo Tree Search with Dueling Network + SihanWang + KaijieZhou + KunfengLai + JianpingShen + 3461–3471 + 2020.emnlp-main.278 + + + Learning a Simple and Effective Model for Multi-turn Response Generation with Auxiliary Tasks + YufanZhao + CanXu + WeiWu + 3472–3483 + 2020.emnlp-main.279 + + + <fixed-case>A</fixed-case>ttn<fixed-case>IO</fixed-case>: Knowledge Graph Exploration with In-and-Out Attention Flow for Knowledge-Grounded Dialogue + JaehunJung + BokyungSon + SungwonLyu + 3484–3497 + 2020.emnlp-main.280 + 2020.emnlp-main.280.OptionalSupplementaryMaterial.zip + + + Amalgamating Knowledge from Two Teachers for Task-oriented Dialogue System with Adversarial Training + WanweiHe + MinYang + RuiYan + ChengmingLi + YingShen + RuifengXu + 3498–3507 + 2020.emnlp-main.281 + + + Task-oriented Domain-specific Meta-Embedding for Text Classification + XinWu + YiCai + YangKai + TaoWang + QingLi + 3508–3513 + 2020.emnlp-main.282 + + + Don’t Neglect the Obvious: On the Role of Unambiguous Words in Word Sense Disambiguation + DanielLoureiro + JoseCamacho-Collados + 3514–3520 + 2020.emnlp-main.283 + + + Within-Between Lexical Relation Classification Using Path-based and Distributional Data + OrenBarkan + AviCaciularu + IdoDagan + 3521–3527 + 2020.emnlp-main.284 + + + With More Contexts Comes Better Performance: Contextualized Sense Embeddings for All-Round Word Sense Disambiguation + BiancaScarlini + TommasoPasini + RobertoNavigli + 3528–3539 + 2020.emnlp-main.285 + + + Convolution over Hierarchical Syntactic and Lexical Graphs for Aspect Level Sentiment Analysis + MiZhang + TieyunQian + 3540–3549 + 2020.emnlp-main.286 + + + Multi-Instance 
Multi-Label Learning Networks for Aspect-Category Sentiment Analysis + YuncongLi + CunxiangYin + Sheng-huaZhong + XuPan + 3550–3560 + 2020.emnlp-main.287 + + + Aspect Based Sentiment Analysis with Aspect-Specific Opinion Spans + LuXu + LidongBing + WeiLu + FeiHuang + 3561–3567 + 2020.emnlp-main.288 + + + Emotion-Cause Pair Extraction as Sequence Labeling Based on a Novel Tagging Scheme + ChaofaYuan + ChuangFan + JianzhuBao + RuifengXu + 3568–3573 + 2020.emnlp-main.289 + + + End-to-End Emotion-Cause Pair Extraction Based on Sliding Window Multi-Label Learning + ZixiangDing + RuiXia + JianfeiYu + 3574–3583 + 2020.emnlp-main.290 + + + Multi-modal Multi-label Emotion Detection with Modality and Label Dependence + DongZhang + XinchengJu + JunhuiLi + ShoushanLi + QiaomingZhu + GuodongZhou + 3584–3593 + 2020.emnlp-main.291 + + + Tasty Burgers, Soggy Fries: Probing Aspect Robustness in Aspect-Based Sentiment Analysis + XiaoyuXing + ZhijingJin + DiJin + BingningWang + QiZhang + XuanjingHuang + 3594–3605 + 2020.emnlp-main.292 + + + Modeling Content Importance for Summarization with Pre-trained Language Models + LiqiangXiao + LuWang + HaoHe + YaohuiJin + 3606–3611 + 2020.emnlp-main.293 + + + Unsupervised Reference-Free Summary Quality Evaluation via Contrastive Learning + HanluWu + TengfeiMa + LingfeiWu + TariroManyumwa + ShoulingJi + 3612–3621 + 2020.emnlp-main.294 + + + Neural Extractive Summarization with Hierarchical Attentive Heterogeneous Graph Network + RuipengJia + YananCao + HengzhuTang + FangFang + CongCao + ShiWang + 3622–3631 + 2020.emnlp-main.295 + + + Coarse-to-Fine Query Focused Multi-Document Summarization + YumoXu + MirellaLapata + 3632–3645 + 2020.emnlp-main.296 + + + Pre-training for Abstractive Document Summarization by Reinstating Source Text + YanyanZou + XingxingZhang + WeiLu + FuruWei + MingZhou + 3646–3660 + 2020.emnlp-main.297 + + + Learning from Context or Names? 
An Empirical Study on Neural Relation Extraction + HaoPeng + TianyuGao + XuHan + YankaiLin + PengLi + ZhiyuanLiu + MaosongSun + JieZhou + 3661–3672 + 2020.emnlp-main.298 + 2020.emnlp-main.298.OptionalSupplementaryMaterial.zip + + + <fixed-case>S</fixed-case>elf<fixed-case>ORE</fixed-case>: Self-supervised Relational Feature Learning for Open Relation Extraction + XumingHu + LijieWen + YusongXu + ChenweiZhang + PhilipYu + 3673–3682 + 2020.emnlp-main.299 + + + Denoising Relation Extraction from Document-level Distant Supervision + ChaojunXiao + YuanYao + RuobingXie + XuHan + ZhiyuanLiu + MaosongSun + FenLin + LeyuLin + 3683–3688 + 2020.emnlp-main.300 + + + Let’s Stop Error Propagation in the End-to-End Relation Extraction Literature! + BrunoTaillé + VincentGuigue + GeoffreyScoutheeten + PatrickGallinari + 3689–3701 + 2020.emnlp-main.301 + 2020.emnlp-main.301.OptionalSupplementaryMaterial.zip + + + Exposing Shallow Heuristics of Relation Extraction Models with Challenge Data + ShacharRosenman + AlonJacovi + YoavGoldberg + 3702–3710 + 2020.emnlp-main.302 + + + Global-to-Local Neural Networks for Document-Level Relation Extraction + DifengWang + WeiHu + ErmeiCao + WeijianSun + 3711–3721 + 2020.emnlp-main.303 + 2020.emnlp-main.303.OptionalSupplementaryMaterial.zip + + + Recurrent Interaction Network for Jointly Extracting Entities and Classifying Relations + KaiSun + RichongZhang + SamuelMensah + YongyiMao + XudongLiu + 3722–3732 + 2020.emnlp-main.304 + + + Temporal Knowledge Base Completion: New Algorithms and Evaluation Protocols + PrachiJain + SushantRathi + Mausam + SoumenChakrabarti + 3733–3747 + 2020.emnlp-main.305 + + + Constrained Iterative Labeling for Open Information Extraction + KeshavKolluru + VaibhavAdlakha + SamarthAggarwal + Mausam + SoumenChakrabarti + 3748–3761 + 2020.emnlp-main.306 + + + Public Sentiment Drift Analysis Based on Hierarchical Variational Auto-encoder + WenyueZhang + XiaoliLi + YangLi + SugeWang + DeyuLi + JianLiao + JianxingZheng + 
3762–3767 + 2020.emnlp-main.307 + + + Point to the Expression: Solving Algebraic Word Problems Using the Expression-Pointer Transformer Model + BugeunKim + Kyung SeoKi + DonggeonLee + GahgeneGweon + 3768–3779 + 2020.emnlp-main.308 + 2020.emnlp-main.308.OptionalSupplementaryMaterial.zip + + + Semantically-Aligned Universal Tree-Structured Solver for Math Word Problems + JinghuiQin + LihuiLin + XiaodanLiang + RuminZhang + LiangLin + 3780–3789 + 2020.emnlp-main.309 + + + Neural Topic Modeling by Incorporating Document Relationship Graph + DeyuZhou + XuemengHu + RuiWang + 3790–3796 + 2020.emnlp-main.310 + + + Routing Enforced Generative Model for Recipe Generation + ZhiweiYu + HongyuZang + XiaojunWan + 3797–3806 + 2020.emnlp-main.311 + + + Assessing the Helpfulness of Learning Materials with Inference-Based Learner-Like Agent + Yun-HsuanJen + Chieh-YangHuang + MeiHuaChen + Ting-HaoHuang + Lun-WeiKu + 3807–3817 + 2020.emnlp-main.312 + 2020.emnlp-main.312.OptionalSupplementaryMaterial.zip + + + Selection and Generation: Learning towards Multi-Product Advertisement Post Generation + ZhangmingChan + YuchiZhang + XiuyingChen + ShenGao + ZhiqiangZhang + DongyanZhao + RuiYan + 3818–3829 + 2020.emnlp-main.313 + + + <fixed-case>F</fixed-case>orm2<fixed-case>S</fixed-case>eq : A Framework for Higher-Order Form Structure Extraction + MilanAggarwal + HireshGupta + MausoomSarkar + BalajiKrishnamurthy + 3830–3840 + 2020.emnlp-main.314 + 2020.emnlp-main.314.OptionalSupplementaryMaterial.zip + + + Domain Adaptation of <fixed-case>T</fixed-case>hai Word Segmentation Models Using Stacked Ensemble + PeeratLimkonchotiwat + WannaphongPhatthiyaphaibun + RaheemSarwar + EkapolChuangsuwanich + SaranaNutanong + 3841–3847 + 2020.emnlp-main.315 + + + <fixed-case>D</fixed-case>ago<fixed-case>BERT</fixed-case>: Generating Derivational Morphology with a Pretrained Language Model + ValentinHofmann + JanetPierrehumbert + HinrichSchütze + 3848–3861 + 2020.emnlp-main.316 + + + Attention Is All You Need 
for <fixed-case>C</fixed-case>hinese Word Segmentation + SufengDuan + HaiZhao + 3862–3872 + 2020.emnlp-main.317 + + + A Joint Multiple Criteria Model in Transfer Learning for Cross-domain <fixed-case>C</fixed-case>hinese Word Segmentation + KaiyuHuang + DegenHuang + ZhuangLiu + FengranMo + 3873–3882 + 2020.emnlp-main.318 + + + Alignment-free Cross-lingual Semantic Role Labeling + RuiCai + MirellaLapata + 3883–3894 + 2020.emnlp-main.319 + + + Leveraging Declarative Knowledge in Text and First-Order Logic for Fine-Grained Propaganda Detection + RuizeWang + DuyuTang + NanDuan + WanjunZhong + ZhongyuWei + XuanjingHuang + DaxinJiang + MingZhou + 3895–3903 + 2020.emnlp-main.320 + + + <fixed-case>X</fixed-case>-<fixed-case>SRL</fixed-case>: A Parallel Cross-Lingual Semantic Role Labeling Dataset + AngelDaza + AnetteFrank + 3904–3914 + 2020.emnlp-main.321 + 2020.emnlp-main.321.OptionalSupplementaryMaterial.pdf + + + Graph Convolutions over Constituent Trees for Syntax-Aware Semantic Role Labeling + DiegoMarcheggiani + IvanTitov + 3915–3928 + 2020.emnlp-main.322 + + + Fast Semantic Parsing with Well-typedness Guarantees + MatthiasLindemann + JonasGroschwitz + AlexanderKoller + 3929–3951 + 2020.emnlp-main.323 + + + Improving Out-of-Scope Detection in Intent Classification by Using Embeddings of the Word Graph Space of the Classes + PauloCavalin + Victor HenriqueAlves Ribeiro + AnaAppel + ClaudioPinhanez + 3952–3961 + 2020.emnlp-main.324 + + + Supervised Seeded Iterated Learning for Interactive Language Learning + YuchenLu + SoumyeSinghal + FlorianStrub + OlivierPietquin + AaronCourville + 3962–3970 + 2020.emnlp-main.325 + + + Spot the Bot: A Robust and Efficient Framework for the Evaluation of Conversational Dialogue Systems + JanDeriu + DonTuggener + Piusvon Däniken + Jon AnderCampos + AlvaroRodrigo + ThiziriBelkacem + AitorSoroa + EnekoAgirre + MarkCieliebak + 3971–3984 + 2020.emnlp-main.326 + 2020.emnlp-main.326.OptionalSupplementaryMaterial.zip + + + Human-centric Dialog 
Training via Offline Reinforcement Learning + NatashaJaques + Judy HanwenShen + AsmaGhandeharioun + CraigFerguson + AgataLapedriza + NoahJones + ShixiangGu + RosalindPicard + 3985–4003 + 2020.emnlp-main.327 + + + Speakers Fill Semantic Gaps with Context + TiagoPimentel + RowanHall Maudslay + DamianBlasi + RyanCotterell + 4004–4015 + 2020.emnlp-main.328 + + + Investigating Cross-Linguistic Adjective Ordering Tendencies with a Latent-Variable Model + Jun YenLeung + GuyEmerson + RyanCotterell + 4016–4028 + 2020.emnlp-main.329 + + + Surprisal Predicts Code-Switching in <fixed-case>C</fixed-case>hinese-<fixed-case>E</fixed-case>nglish Bilingual Text + JesúsCalvillo + LeFang + JeremyCole + DavidReitter + 4029–4039 + 2020.emnlp-main.330 + + + Investigating Lexical Variability in Language Models + CharlesYu + RyanSie + NicolasTedeschi + LeonBergen + 4040–4054 + 2020.emnlp-main.331 + + + Improving Word Sense Disambiguation with Translations + YixingLuan + BradleyHauer + LiliMou + GrzegorzKondrak + 4055–4065 + 2020.emnlp-main.332 + + + Towards Better Context-aware Lexical Semantics: Adjusting Contextualized Representations through Static Anchors + QianchuLiu + DianaMcCarthy + AnnaKorhonen + 4066–4075 + 2020.emnlp-main.333 + + + Compositional Demographic Word Embeddings + CharlesWelch + Jonathan K.Kummerfeld + VerónicaPérez-Rosas + RadaMihalcea + 4076–4089 + 2020.emnlp-main.334 + + + Do “Undocumented Immigrants” == “Illegal Aliens”? 
Differentiating Denotation and Connotation in Vector Space + AlbertWebson + ZhizhongChen + CarstenEickhoff + ElliePavlick + 4090–4105 + 2020.emnlp-main.335 + + + Multi-View Sequence-to-Sequence Models with Conversational Structure for Abstractive Dialogue Summarization + JiaaoChen + DiyiYang + 4106–4118 + 2020.emnlp-main.336 + + + Few-Shot Learning for Opinion Summarization + ArthurBražinskas + MirellaLapata + IvanTitov + 4119–4135 + 2020.emnlp-main.337 + + + Learning to Fuse Sentences with Transformers for Summarization + LoganLebanoff + FranckDernoncourt + Doo SoonKim + LidanWang + WalterChang + FeiLiu + 4136–4142 + 2020.emnlp-main.338 + + + Stepwise Extractive Summarization and Planning with Structured Transformers + ShashiNarayan + JoshuaMaynez + JakubAdamek + DanielePighin + BlazBratanic + RyanMcDonald + 4143–4159 + 2020.emnlp-main.339 + + + <fixed-case>CLIRM</fixed-case>atrix: A Massively Large Collection of Bilingual and Multilingual Datasets for Cross-Lingual Information Retrieval + ShuoSun + KevinDuh + 4160–4170 + 2020.emnlp-main.340 + + + <fixed-case>SLEDGE</fixed-case>: A Simple Yet Effective Zero-Shot Baseline for Coronavirus Scientific Knowledge Search + SeanMacAvaney + ArmanCohan + NazliGoharian + 4171–4179 + 2020.emnlp-main.341 + + + Modularized Transfomer-based Ranking Framework + LuyuGao + ZhuyunDai + JamieCallan + 4180–4190 + 2020.emnlp-main.342 + + + Ad-hoc Document Retrieval Using Weak-Supervision with <fixed-case>BERT</fixed-case> and <fixed-case>GPT</fixed-case>2 + YosiMass + HaggaiRoitman + 4191–4197 + 2020.emnlp-main.343 + + + Adversarial Semantic Collisions + CongzhengSong + AlexanderRush + VitalyShmatikov + 4198–4210 + 2020.emnlp-main.344 + + + Learning Explainable Linguistic Expressions with Neural Inductive Logic Programming for Sentence Classification + PrithvirajSen + MarinaDanilevsky + YunyaoLi + SiddharthaBrahma + MatthiasBoehm + LauraChiticariu + RajasekarKrishnamurthy + 4211–4221 + 2020.emnlp-main.345 + + + Eliciting Knowledge from 
Language Models Using Automatically Generated Prompts + TaylorShin + YasamanRazeghi + Robert L.Logan IV + EricWallace + SameerSingh + 4222–4235 + 2020.emnlp-main.346 + + + Learning Variational Word Masks to Improve the Interpretability of Neural Text Classifiers + HanjieChen + YangfengJi + 4236–4251 + 2020.emnlp-main.347 + + + Sparse Text Generation + Pedro HenriqueMartins + ZitaMarinho + André F. T.Martins + 4252–4273 + 2020.emnlp-main.348 + + + <fixed-case>P</fixed-case>lot<fixed-case>M</fixed-case>achines: Outline-Conditioned Generation with Dynamic Plot State Tracking + HannahRashkin + AsliCelikyilmaz + YejinChoi + JianfengGao + 4274–4295 + 2020.emnlp-main.349 + + + Do Sequence-to-sequence <fixed-case>VAE</fixed-case>s Learn Global Features of Sentences? + TomBosc + PascalVincent + 4296–4318 + 2020.emnlp-main.350 + + + Content Planning for Neural Story Generation with Aristotelian Rescoring + SeraphinaGoldfarb-Tarrant + TuhinChakrabarty + RalphWeischedel + NanyunPeng + 4319–4338 + 2020.emnlp-main.351 + + + Generating Dialogue Responses from a Semantic Latent Space + Wei-JenKo + AvikRay + YilinShen + HongxiaJin + 4339–4349 + 2020.emnlp-main.352 + + + Refer, Reuse, Reduce: Grounding Subsequent References in Visual and Conversational Contexts + EceTakmaz + MarioGiulianelli + SandroPezzelle + ArabellaSinclair + RaquelFernández + 4350–4368 + 2020.emnlp-main.353 + + + Visually Grounded Compound <fixed-case>PCFG</fixed-case>s + YanpengZhao + IvanTitov + 4369–4379 + 2020.emnlp-main.354 + + + <fixed-case>ALICE</fixed-case>: Active Learning with Contrastive Natural Language Explanations + WeixinLiang + JamesZou + ZhouYu + 4380–4391 + 2020.emnlp-main.355 + + + Room-Across-Room: Multilingual Vision-and-Language Navigation with Dense Spatiotemporal Grounding + AlexanderKu + PeterAnderson + RomaPatel + EugeneIe + JasonBaldridge + 4392–4412 + 2020.emnlp-main.356 + + + Iterative Language-Based Image Editing via Self-Supervised Counterfactual Reasoning + Tsu-JuiFu + XinWang + 
ScottGrafton + MiguelEckstein + William YangWang + 4413–4422 + 2020.emnlp-main.357 + + + Identifying Elements Essential for <fixed-case>BERT</fixed-case>’s Multilinguality + PhilippDufter + HinrichSchütze + 4423–4437 + 2020.emnlp-main.358 + + + On Negative Interference in Multilingual Language Models + ZiruiWang + Zachary C.Lipton + YuliaTsvetkov + 4438–4450 + 2020.emnlp-main.359 + + + Pre-tokenization of Multi-word Expressions in Cross-lingual Word Embeddings + NaokiOtani + SatoruOzaki + XingyuanZhao + YucenLi + MicaelahSt Johns + LoriLevin + 4451–4464 + 2020.emnlp-main.360 + + + Language Adapters for Zero Shot Neural Machine Translation + JerinPhilip + AlexandreBerard + MatthiasGallé + LaurentBesacier + 4465–4470 + 2020.emnlp-main.361 + + + Do Explicit Alignments Robustly Improve Massively Multilingual Encoders? + ShijieWu + MarkDredze + 4471–4482 + 2020.emnlp-main.362 + + + From Zero to Hero: On the Limitations of Zero-Shot Language Transfer with Multilingual Transformers + AnneLauscher + VinitRavishankar + IvanVulić + GoranGlavaš + 4483–4499 + 2020.emnlp-main.363 + 2020.emnlp-main.363.OptionalSupplementaryMaterial.zip + + + Distilling Multiple Domains for Neural Machine Translation + AnnaCurrey + PrashantMathur + GeorgianaDinu + 4500–4511 + 2020.emnlp-main.364 + + + Making Monolingual Sentence Embeddings Multilingual Using Knowledge Distillation + NilsReimers + IrynaGurevych + 4512–4525 + 2020.emnlp-main.365 + + + A Streaming Approach for Efficient Batched Beam Search + KevinYang + VioletYao + JohnDeNero + DanKlein + 4526–4535 + 2020.emnlp-main.366 + + + Improving Multilingual Models with Language-Clustered Vocabularies + Hyung WonChung + DanGarrette + Kiat ChuanTan + JasonRiesa + 4536–4546 + 2020.emnlp-main.367 + + + Zero-Shot Cross-Lingual Transfer with Meta Learning + FarhadNooralahzadeh + GiannisBekoulis + JohannesBjerva + IsabelleAugenstein + 4547–4562 + 2020.emnlp-main.368 + + + The Multilingual <fixed-case>A</fixed-case>mazon Reviews Corpus + 
PhillipKeung + YichaoLu + GyörgySzarvas + Noah A.Smith + 4563–4568 + 2020.emnlp-main.369 + + + <fixed-case>GLUCOSE</fixed-case>: <fixed-case>G</fixed-case>enera<fixed-case>L</fixed-case>ized and <fixed-case>CO</fixed-case>ntextualized Story Explanations + NasrinMostafazadeh + AdityaKalyanpur + LoriMoon + DavidBuchanan + LaurenBerkowitz + OrBiran + JenniferChu-Carroll + 4569–4586 + 2020.emnlp-main.370 + + + Character-level Representations Still Improve Semantic Parsing in the Age of <fixed-case>BERT</fixed-case> + Rikvan Noord + AntonioToral + JohanBos + 4587–4603 + 2020.emnlp-main.371 + + + Infusing Disease Knowledge into <fixed-case>BERT</fixed-case> for Health Question Answering, Medical Inference and Disease Name Recognition + YunHe + ZiweiZhu + YinZhang + QinChen + JamesCaverlee + 4604–4614 + 2020.emnlp-main.372 + + + Unsupervised Commonsense Question Answering with Self-Talk + VeredShwartz + PeterWest + RonanLe Bras + ChandraBhagavatula + YejinChoi + 4615–4629 + 2020.emnlp-main.373 + + + Reasoning about Goals, Steps, and Temporal Ordering with <fixed-case>W</fixed-case>iki<fixed-case>H</fixed-case>ow + LiZhang + QingLyu + ChrisCallison-Burch + 4630–4639 + 2020.emnlp-main.374 + 2020.emnlp-main.374.OptionalSupplementaryMaterial.zip + + + Structural Supervision Improves Few-Shot Learning and Syntactic Generalization in Neural Language Models + EthanWilcox + PengQian + RichardFutrell + RyosukeKohita + RogerLevy + MiguelBallesteros + 4640–4652 + 2020.emnlp-main.375 + + + Investigating Representations of Verb Bias in Neural Language Models + RobertHawkins + TakateruYamakoshi + ThomasGriffiths + AdeleGoldberg + 4653–4663 + 2020.emnlp-main.376 + + + Generating Image Descriptions via Sequential Cross-Modal Alignment Guided by Human Gaze + EceTakmaz + SandroPezzelle + LisaBeinborn + RaquelFernández + 4664–4677 + 2020.emnlp-main.377 + + + Optimus: Organizing Sentences via Pre-trained Modeling of a Latent Space + ChunyuanLi + XiangGao + YuanLi + BaolinPeng + XiujunLi + 
YizheZhang + JianfengGao + 4678–4699 + 2020.emnlp-main.378 + + + Bio-Megatron: Larger Biomedical Domain Language Model + Hoo-ChangShin + YangZhang + EvelinaBakhturina + RaulPuri + MostofaPatwary + MohammadShoeybi + RaghavMani + 4700–4706 + 2020.emnlp-main.379 + + + Text Segmentation by Cross Segment Attention + MichalLukasik + BorisDadachev + KishorePapineni + GonçaloSimões + 4707–4716 + 2020.emnlp-main.380 + + + <fixed-case>R</fixed-case>ussian<fixed-case>S</fixed-case>uper<fixed-case>GLUE</fixed-case>: A <fixed-case>R</fixed-case>ussian Language Understanding Evaluation Benchmark + TatianaShavrina + AlenaFenogenova + EmelyanovAnton + DenisShevelev + EkaterinaArtemova + ValentinMalykh + VladislavMikhailov + MariaTikhonova + AndreyChertok + AndreyEvlampiev + 4717–4726 + 2020.emnlp-main.381 + + + An Empirical Study of Pre-trained Transformers for <fixed-case>A</fixed-case>rabic Information Extraction + WuweiLan + YangChen + WeiXu + AlanRitter + 4727–4734 + 2020.emnlp-main.382 + + + <fixed-case>TNT</fixed-case>: Text Normalization Based Pre-training of Transformers for Content Moderation + FeiTan + YifanHu + ChangweiHu + KeqianLi + KevinYen + 4735–4741 + 2020.emnlp-main.383 + + + Methods for Numeracy-Preserving Word Embeddings + DhanasekarSundararaman + ShijingSi + VivekSubramanian + GuoyinWang + DevamanyuHazarika + LawrenceCarin + 4742–4753 + 2020.emnlp-main.384 + + + An Empirical Investigation of Contextualized Number Prediction + TaylorBerg-Kirkpatrick + DanielSpokoyny + 4754–4764 + 2020.emnlp-main.385 + + + Modeling the Music Genre Perception across Language-Bound Cultures + Elena V.Epure + GuillaumeSalha + ManuelMoussallam + RomainHennequin + 4765–4779 + 2020.emnlp-main.386 + + + Joint Estimation and Analysis of Risk Behavior Ratings in Movie Scripts + VictorMartinez + KrishnaSomandepalli + YaldaTehranian-Uhls + ShrikanthNarayanan + 4780–4790 + 2020.emnlp-main.387 + + + Keep It Surprisingly Simple: A Simple First Order Graph Based Parsing Model for Joint 
Morphosyntactic Parsing in <fixed-case>S</fixed-case>anskrit + AmrithKrishna + AshimGupta + DeepakGarasangi + PavankumarSatuluri + PawanGoyal + 4791–4797 + 2020.emnlp-main.388 + 2020.emnlp-main.388.OptionalSupplementaryMaterial.pdf + + + Unsupervised Parsing via Constituency Tests + StevenCao + NikitaKitaev + DanKlein + 4798–4808 + 2020.emnlp-main.389 + + + Please Mind the Root: Decoding Arborescences for Dependency Parsing + RanZmigrod + TimVieira + RyanCotterell + 4809–4819 + 2020.emnlp-main.390 + + + Unsupervised Cross-Lingual Part-of-Speech Tagging for Truly Low-Resource Scenarios + RamyEskander + SmarandaMuresan + MichaelCollins + 4820–4831 + 2020.emnlp-main.391 + + + Unsupervised Parsing with <fixed-case>S</fixed-case>-<fixed-case>DIORA</fixed-case>: Single Tree Encoding for Deep Inside-Outside Recursive Autoencoders + AndrewDrozdov + SubendhuRongali + Yi-PeiChen + TimO’Gorman + MohitIyyer + AndrewMcCallum + 4832–4845 + 2020.emnlp-main.392 + + + Utility Is in the Eye of the User: A Critique of <fixed-case>NLP</fixed-case> Leaderboard Design + KawinEthayarajh + DanJurafsky + 4846–4853 + 2020.emnlp-main.393 + + + An Empirical Investigation towards Efficient Multi-Domain Language Model Pre-training + KristjanArumae + QingSun + ParminderBhatia + 4854–4864 + 2020.emnlp-main.394 + + + Analyzing Individual Neurons in Pre-trained Language Models + NadirDurrani + HassanSajjad + FahimDalvi + YonatanBelinkov + 4865–4880 + 2020.emnlp-main.395 + + + Dissecting Span Identification Tasks with Performance Prediction + SeanPapay + RomanKlinger + SebastianPadó + 4881–4895 + 2020.emnlp-main.396 + + + Assessing Phrasal Representation and Composition in Transformers + LangYu + AllysonEttinger + 4896–4907 + 2020.emnlp-main.397 + + + Analyzing Redundancy in Pretrained Transformer Models + FahimDalvi + HassanSajjad + NadirDurrani + YonatanBelinkov + 4908–4926 + 2020.emnlp-main.398 + + + Be More with Less: Hypergraph Attention Networks for Inductive Text Classification + KaizeDing + 
JianlingWang + JundongLi + DingchengLi + HuanLiu + 4927–4936 + 2020.emnlp-main.399 + + + Entities as Experts: Sparse Memory Access with Entity Supervision + ThibaultFévry + LivioBaldini Soares + NicholasFitzGerald + EunsolChoi + TomKwiatkowski + 4937–4951 + 2020.emnlp-main.400 + + + <fixed-case>H</fixed-case>2<fixed-case>KGAT</fixed-case>: Hierarchical Hyperbolic Knowledge Graph Attention Network + ShenWang + XiaokaiWei + CiceroNogueira dos Santos + ZhiguoWang + RameshNallapati + AndrewArnold + BingXiang + Philip S.Yu + 4952–4962 + 2020.emnlp-main.401 + + + Does the Objective Matter? Comparing Training Objectives for Pronoun Resolution + YordanYordanov + Oana-MariaCamburu + VidKocijan + ThomasLukasiewicz + 4963–4969 + 2020.emnlp-main.402 + + + On Losses for Modern Language Models + StéphaneAroca-Ouellette + FrankRudzicz + 4970–4981 + 2020.emnlp-main.403 + + + We Can Detect Your Bias: Predicting the Political Ideology of News Articles + RamyBaly + GiovanniDa San Martino + JamesGlass + PreslavNakov + 4982–4991 + 2020.emnlp-main.404 + + + Semantic Label Smoothing for Sequence to Sequence Problems + MichalLukasik + HimanshuJain + AdityaMenon + SeungyeonKim + SrinadhBhojanapalli + FelixYu + SanjivKumar + 4992–4998 + 2020.emnlp-main.405 + + + Training for <fixed-case>G</fixed-case>ibbs Sampling on Conditional Random Fields with Neural Scoring Factors + SidaGao + Matthew R.Gormley + 4999–5011 + 2020.emnlp-main.406 + + + Multilevel Text Alignment with Cross-Document Attention + XuhuiZhou + NikolaosPappas + Noah A.Smith + 5012–5025 + 2020.emnlp-main.407 + + + Conversational Semantic Parsing + ArmenAghajanyan + JeanMaillard + AkshatShrivastava + KeithDiedrick + MichaelHaeger + HaoranLi + YasharMehdad + VeselinStoyanov + AnujKumar + MikeLewis + SonalGupta + 5026–5035 + 2020.emnlp-main.408 + + + Probing Task-Oriented Dialogue Representation from Language Models + Chien-ShengWu + CaimingXiong + 5036–5051 + 2020.emnlp-main.409 + + + End-to-End Slot Alignment and Recognition for 
Cross-Lingual <fixed-case>NLU</fixed-case> + WeijiaXu + BatoolHaider + SaabMansour + 5052–5063 + 2020.emnlp-main.410 + 2020.emnlp-main.410.OptionalSupplementaryMaterial.zip + + + Discriminative Nearest Neighbor Few-Shot Intent Detection by Transferring Natural Language Inference + JianguoZhang + KazumaHashimoto + WenhaoLiu + Chien-ShengWu + YaoWan + PhilipYu + RichardSocher + CaimingXiong + 5064–5082 + 2020.emnlp-main.411 + + + Simple Data Augmentation with the Mask Token Improves Domain Adaptation for Dialog Act Tagging + SemihYavuz + KazumaHashimoto + WenhaoLiu + Nitish ShirishKeskar + RichardSocher + CaimingXiong + 5083–5089 + 2020.emnlp-main.412 + + + Low-Resource Domain Adaptation for Compositional Task-Oriented Semantic Parsing + XilunChen + AsishGhoshal + YasharMehdad + LukeZettlemoyer + SonalGupta + 5090–5100 + 2020.emnlp-main.413 + + + Sound Natural: Content Rephrasing in Dialog Systems + ArashEinolghozati + AnchitGupta + KeithDiedrick + SonalGupta + 5101–5108 + 2020.emnlp-main.414 + + + Zero-Shot Crosslingual Sentence Simplification + JonathanMallinson + RicoSennrich + MirellaLapata + 5109–5126 + 2020.emnlp-main.415 + + + Facilitating the Communication of Politeness through Fine-Grained Paraphrasing + LiyeFu + SusanFussell + CristianDanescu-Niculescu-Mizil + 5127–5140 + 2020.emnlp-main.416 + + + <fixed-case>CAT</fixed-case>-Gen: Improving Robustness in <fixed-case>NLP</fixed-case> Models via Controlled Adversarial Text Generation + TianluWang + XuezhiWang + YaoQin + BenPacker + KangLi + JilinChen + AlexBeutel + EdChi + 5141–5146 + 2020.emnlp-main.417 + + + <fixed-case>S</fixed-case>eq2<fixed-case>E</fixed-case>dits: Sequence Transduction Using Span-level Edit Operations + FelixStahlberg + ShankarKumar + 5147–5159 + 2020.emnlp-main.418 + 2020.emnlp-main.418.OptionalSupplementaryMaterial.zip + + + Controllable Meaning Representation to Text Generation: Linearization and Data Augmentation Strategies + ChrisKedzie + KathleenMcKeown + 5160–5185 + 
2020.emnlp-main.419 + + + Blank Language Models + TianxiaoShen + VictorQuach + ReginaBarzilay + TommiJaakkola + 5186–5198 + 2020.emnlp-main.420 + + + <fixed-case>COD</fixed-case>3<fixed-case>S</fixed-case>: Diverse Generation with Discrete Semantic Signatures + NathanielWeir + JoãoSedoc + BenjaminVan Durme + 5199–5211 + 2020.emnlp-main.421 + + + Automatic Extraction of Rules Governing Morphological Agreement + AditiChaudhary + AntoniosAnastasopoulos + AdithyaPratapa + David R.Mortensen + ZaidSheikh + YuliaTsvetkov + GrahamNeubig + 5212–5236 + 2020.emnlp-main.422 + + + Tackling the Low-resource Challenge for Canonical Segmentation + ManuelMager + ÖzlemÇetinoğlu + KatharinaKann + 5237–5250 + 2020.emnlp-main.423 + + + <fixed-case>IGT</fixed-case>2<fixed-case>P</fixed-case>: From Interlinear Glossed Texts to Paradigms + SarahMoeller + LingLiu + ChangbingYang + KatharinaKann + MansHulden + 5251–5262 + 2020.emnlp-main.424 + + + A Computational Approach to Understanding Empathy Expressed in Text-Based Mental Health Support + AshishSharma + AdamMiner + DavidAtkins + TimAlthoff + 5263–5276 + 2020.emnlp-main.425 + + + Modeling Protagonist Emotions for Emotion-Aware Storytelling + FaezeBrahman + SnigdhaChaturvedi + 5277–5294 + 2020.emnlp-main.426 + + + Help! 
Need Advice on Identifying Advice + Venkata SubrahmanyanGovindarajan + BenjaminChen + RebeccaWarholic + KatrinErk + Junyi JessyLi + 5295–5306 + 2020.emnlp-main.427 + + + Quantifying Intimacy in Language + JiaxinPei + DavidJurgens + 5307–5326 + 2020.emnlp-main.428 + + + Writing Strategies for Science Communication: Data and Computational Analysis + TalAugust + LaurenKim + KatharinaReinecke + Noah A.Smith + 5327–5344 + 2020.emnlp-main.429 + 2020.emnlp-main.429.OptionalSupplementaryMaterial.zip + + + Weakly Supervised Subevent Knowledge Acquisition + WenlinYao + ZeyuDai + MaitreyiRamaswamy + BonanMin + RuihongHuang + 5345–5356 + 2020.emnlp-main.430 + + + Biomedical Event Extraction as Sequence Labeling + AlanRamponi + Robvan der Goot + RosarioLombardo + BarbaraPlank + 5357–5367 + 2020.emnlp-main.431 + + + Annotating Temporal Dependency Graphs via Crowdsourcing + JiaruiYao + HaolingQiu + BonanMin + NianwenXue + 5368–5380 + 2020.emnlp-main.432 + + + Introducing a New Dataset for Event Detection in Cybersecurity Texts + HieuMan Duc Trong + DucTrong Le + AmirPouran Ben Veyseh + ThuatNguyen + Thien HuuNguyen + 5381–5390 + 2020.emnlp-main.433 + + + <fixed-case>CHARM</fixed-case>: Inferring Personal Attributes from Conversations + AnnaTigunova + AndrewYates + ParamitaMirza + GerhardWeikum + 5391–5404 + 2020.emnlp-main.434 + 2020.emnlp-main.434.OptionalSupplementaryMaterial.zip + + + Event Detection: Gate Diversity and Syntactic Importance Scores for Graph Convolution Neural Networks + Viet DacLai + Tuan NgoNguyen + Thien HuuNguyen + 5405–5411 + 2020.emnlp-main.435 + + + Severing the Edge between before and after: Neural Architectures for Temporal Ordering of Events + MiguelBallesteros + RishitaAnubhai + ShuaiWang + NimaPourdamghani + YogarshiVyas + JieMa + ParminderBhatia + KathleenMcKeown + YaserAl-Onaizan + 5412–5417 + 2020.emnlp-main.436 + + + How Much Knowledge Can You Pack into the Parameters of a Language Model? 
+ AdamRoberts + ColinRaffel + NoamShazeer + 5418–5426 + 2020.emnlp-main.437 + + + <fixed-case>EXAMS</fixed-case>: A Multi-subject High School Examinations Dataset for Cross-lingual and Multilingual Question Answering + MomchilHardalov + TodorMihaylov + DimitrinaZlatkova + YoanDinkov + IvanKoychev + PreslavNakov + 5427–5444 + 2020.emnlp-main.438 + + + End-to-End Synthetic Data Generation for Domain Adaptation of Question Answering Systems + SiamakShakeri + CiceroNogueira dos Santos + HenghuiZhu + PatrickNg + FengNan + ZhiguoWang + RameshNallapati + BingXiang + 5445–5460 + 2020.emnlp-main.439 + + + Multi-Stage Pretraining for Low-Resource Domain Adaptation + RongZhang + RevanthGangi Reddy + Md ArafatSultan + VittorioCastelli + AnthonyFerritto + RaduFlorian + EfsunSarioglu Kayi + SalimRoukos + AviSil + ToddWard + 5461–5468 + 2020.emnlp-main.440 + + + <fixed-case>ISAAQ</fixed-case> - Mastering Textbook Questions with Pre-trained Transformers and Bottom-Up and Top-Down Attention + Jose ManuelGomez-Perez + RaúlOrtega + 5469–5479 + 2020.emnlp-main.441 + + + <fixed-case>S</fixed-case>ubj<fixed-case>QA</fixed-case>: A Dataset for Subjectivity and Review Comprehension + JohannesBjerva + NikitaBhutani + BehzadGolshan + Wang-ChiewTan + IsabelleAugenstein + 5480–5494 + 2020.emnlp-main.442 + + + Widget Captioning: Generating Natural Language Description for Mobile User Interface Elements + YangLi + GangLi + LuhengHe + JingjieZheng + HongLi + ZhiweiGuan + 5495–5510 + 2020.emnlp-main.443 + + + Unsupervised Natural Language Inference via Decoupled Multimodal Contrastive Learning + WanyunCui + GuangyuZheng + WeiWang + 5511–5520 + 2020.emnlp-main.444 + + + Digital Voicing of Silent Speech + DavidGaddy + DanKlein + 5521–5530 + 2020.emnlp-main.445 + 2020.emnlp-main.445.OptionalSupplementaryMaterial.zip + + + Imitation Attacks and Defenses for Black-box Machine Translation Systems + EricWallace + MitchellStern + DawnSong + 5531–5546 + 2020.emnlp-main.446 + + + Sequence-level Mixed 
Sample Data Augmentation + DemiGuo + YoonKim + AlexanderRush + 5547–5552 + 2020.emnlp-main.447 + 2020.emnlp-main.447.OptionalSupplementaryMaterial.zip + + + Consistency of a Recurrent Language Model with Respect to Incomplete Decoding + SeanWelleck + IliaKulikov + JaedeokKim + Richard YuanzhePang + KyunghyunCho + 5553–5568 + 2020.emnlp-main.448 + + + An Exploration of Arbitrary-Order Sequence Labeling via Energy-Based Inference Networks + LifuTu + TianyuLiu + KevinGimpel + 5569–5582 + 2020.emnlp-main.449 + + + Ensemble Distillation for Structured Prediction: Calibrated, Accurate, <fixed-case>F</fixed-case>ast—<fixed-case>C</fixed-case>hoose Three + StevenReich + DavidMueller + NicholasAndrews + 5583–5595 + 2020.emnlp-main.450 + + + Inducing Target-specific Latent Structures for Aspect Sentiment Classification + ChenhuaChen + ZhiyangTeng + YueZhang + 5596–5607 + 2020.emnlp-main.451 + + + Affective Event Classification with Discourse-enhanced Self-training + YuanZhuang + TianyuJiang + EllenRiloff + 5608–5617 + 2020.emnlp-main.452 + + + Deep Weighted <fixed-case>M</fixed-case>ax<fixed-case>SAT</fixed-case> for Aspect-based Opinion Extraction + MeixiWu + WenyaWang + Sinno JialinPan + 5618–5628 + 2020.emnlp-main.453 + + + Multi-view Story Characterization from Movie Plot Synopses and Reviews + SudiptaKar + GustavoAguilar + MirellaLapata + ThamarSolorio + 5629–5646 + 2020.emnlp-main.454 + + + Mind Your Inflections! Improving <fixed-case>NLP</fixed-case> for Non-Standard Englishes with Base-Inflection Encoding + SamsonTan + ShafiqJoty + LavVarshney + Min-YenKan + 5647–5663 + 2020.emnlp-main.455 + + + Measuring the Similarity of Grammatical Gender Systems by Comparing Partitions + Arya D.McCarthy + AdinaWilliams + ShijiaLiu + DavidYarowsky + RyanCotterell + 5664–5675 + 2020.emnlp-main.456 + + + Is <fixed-case>C</fixed-case>hinese Word Segmentation a Solved Task? 
Rethinking Neural <fixed-case>C</fixed-case>hinese Word Segmentation + JinlanFu + PengfeiLiu + QiZhang + XuanjingHuang + 5676–5686 + 2020.emnlp-main.457 + + + Learning to Pronounce <fixed-case>C</fixed-case>hinese without a Pronunciation Dictionary + ChristopherChu + ScotFang + KevinKnight + 5687–5693 + 2020.emnlp-main.458 + + + Dynamic Anticipation and Completion for Multi-Hop Reasoning over Sparse Knowledge Graph + XinLv + XuHan + LeiHou + JuanziLi + ZhiyuanLiu + WeiZhang + YichiZhang + HaoKong + SuhuiWu + 5694–5703 + 2020.emnlp-main.459 + + + Knowledge Association with Hyperbolic Knowledge Graph Embeddings + ZequnSun + MuhaoChen + WeiHu + ChengmingWang + JianDai + WeiZhang + 5704–5716 + 2020.emnlp-main.460 + + + Domain Knowledge Empowered Structured Neural Net for End-to-End Event Temporal Relation Extraction + RujunHan + YichaoZhou + NanyunPeng + 5717–5729 + 2020.emnlp-main.461 + + + <fixed-case>T</fixed-case>e<fixed-case>MP</fixed-case>: Temporal Message Passing for Temporal Knowledge Graph Completion + JiapengWu + MengCao + Jackie Chi KitCheung + William L.Hamilton + 5730–5746 + 2020.emnlp-main.462 + + + Understanding the Difficulty of Training Transformers + LiyuanLiu + XiaodongLiu + JianfengGao + WeizhuChen + JiaweiHan + 5747–5763 + 2020.emnlp-main.463 + 2020.emnlp-main.463.OptionalSupplementaryMaterial.zip + + + An Empirical Study of Generation Order for Machine Translation + WilliamChan + MitchellStern + JamieKiros + JakobUszkoreit + 5764–5773 + 2020.emnlp-main.464 + 2020.emnlp-main.464.OptionalSupplementaryMaterial.pdf + + + Inference Strategies for Sequence Generation with Conditional Masking + JuliaKreutzer + GeorgeFoster + ColinCherry + 5774–5782 + 2020.emnlp-main.465 + + + <fixed-case>A</fixed-case>mbig<fixed-case>QA</fixed-case>: Answering Ambiguous Open-domain Questions + SewonMin + JulianMichael + HannanehHajishirzi + LukeZettlemoyer + 5783–5797 + 2020.emnlp-main.466 + + + Tell Me How to Ask Again: Question Data Augmentation with Controllable 
Rewriting in Continuous Space + DayihengLiu + YeyunGong + JieFu + YuYan + JiushengChen + JianchengLv + NanDuan + MingZhou + 5798–5810 + 2020.emnlp-main.467 + + + Training Question Answering Models from Synthetic Data + RaulPuri + RyanSpring + MohammadShoeybi + MostofaPatwary + BryanCatanzaro + 5811–5826 + 2020.emnlp-main.468 + + + Few-shot Complex Knowledge Base Question Answering via Meta Reinforcement Learning + YunchengHua + Yuan-FangLi + GholamrezaHaffari + GuilinQi + TongtongWu + 5827–5837 + 2020.emnlp-main.469 + + + Multilingual Offensive Language Identification with Cross-lingual Embeddings + TharinduRanasinghe + MarcosZampieri + 5838–5844 + 2020.emnlp-main.470 + + + Solving Historical Dictionary Codes with a Neural Language Model + ChristopherChu + RaphaelValenti + KevinKnight + 5845–5854 + 2020.emnlp-main.471 + + + Beyond Geolocation: Micro-Dialect Identification in Diaglossic and Code-Switched Environments + MuhammadAbdul-Mageed + ChiyuZhang + AbdelRahimElmadany + LyleUngar + 5855–5876 + 2020.emnlp-main.472 + + + Dats Wassup!!: Investigating <fixed-case>A</fixed-case>frican-<fixed-case>A</fixed-case>merican <fixed-case>V</fixed-case>ernacular <fixed-case>E</fixed-case>nglish in Transformer-Based Text Generation + SophieGroenwold + LilyOu + AeshaParekh + SamhitaHonnavalli + SharonLevy + DibaMirza + William YangWang + 5877–5883 + 2020.emnlp-main.473 + 2020.emnlp-main.473.OptionalSupplementaryMaterial.zip + + + Iterative Domain-Repaired Back-Translation + Hao-RanWei + ZhiruiZhang + BoxingChen + WeihuaLuo + 5884–5893 + 2020.emnlp-main.474 + + + Dynamic Data Selection and Weighting for Iterative Back-Translation + Zi-YiDou + AntoniosAnastasopoulos + GrahamNeubig + 5894–5904 + 2020.emnlp-main.475 + + + Revisiting Modularized Multilingual <fixed-case>NMT</fixed-case> to Meet Industrial Demands + SungwonLyu + BokyungSon + KichangYang + JaekyoungBae + 5905–5918 + 2020.emnlp-main.476 + + + <fixed-case>LAR</fixed-case>e<fixed-case>QA</fixed-case>: Language-agnostic 
Answer Retrieval from a Multilingual Pool + UmaRoy + NoahConstant + RamiAl-Rfou + AdityaBarua + AaronPhillips + YinfeiYang + 5919–5930 + 2020.emnlp-main.477 + + + <fixed-case>OCR</fixed-case> Post-Correction for Endangered Language Texts + ShrutiRijhwani + AntoniosAnastasopoulos + GrahamNeubig + 5931–5942 + 2020.emnlp-main.478 + + + <fixed-case>X</fixed-case>-<fixed-case>FACTR</fixed-case>: Multilingual Factual Knowledge Retrieval from Pretrained Language Models + ZhengbaoJiang + AntoniosAnastasopoulos + JunAraki + HaiboDing + GrahamNeubig + 5943–5959 + 2020.emnlp-main.479 + + + A Massive Collection of Cross-Lingual Web-Document Pairs + AhmedEl-Kishky + VishravChaudhary + FranciscoGuzmán + PhilippKoehn + 5960–5969 + 2020.emnlp-main.480 + + + Localizing <fixed-case>Q</fixed-case>&<fixed-case>A</fixed-case> Semantic Parsers for Any Language in a Day + MehradMoradshahi + GiovanniCampagna + SinaSemnani + SileiXu + MonicaLam + 5970–5983 + 2020.emnlp-main.481 + + + Interactive Refinement of Cross-Lingual Word Embeddings + MichelleYuan + MozhiZhang + BenjaminVan Durme + LeahFindlater + JordanBoyd-Graber + 5984–5996 + 2020.emnlp-main.482 + + + Exploiting Sentence Order in Document Alignment + BrianThompson + PhilippKoehn + 5997–6007 + 2020.emnlp-main.483 + + + <fixed-case>XGLUE</fixed-case>: A New Benchmark Datasetfor Cross-lingual Pre-training, Understanding and Generation + YaoboLiang + NanDuan + YeyunGong + NingWu + FenfeiGuo + WeizhenQi + MingGong + LinjunShou + DaxinJiang + GuihongCao + XiaodongFan + RuofeiZhang + RahulAgrawal + EdwardCui + SiningWei + TaroonBharti + YingQiao + Jiun-HungChen + WinnieWu + ShuguangLiu + FanYang + DanielCampos + RanganMajumder + MingZhou + 6008–6018 + 2020.emnlp-main.484 + + + <fixed-case>AIN</fixed-case>: Fast and Accurate Sequence Labeling with Approximate Inference Network + XinyuWang + YongJiang + NguyenBach + TaoWang + ZhongqiangHuang + FeiHuang + KeweiTu + 6019–6026 + 2020.emnlp-main.485 + + + <fixed-case>HIT</fixed-case>: Nested 
Named Entity Recognition via Head-Tail Pair and Token Interaction + YuWang + YunLi + HanghangTong + ZiyeZhu + 6027–6036 + 2020.emnlp-main.486 + + + Supertagging <fixed-case>C</fixed-case>ombinatory <fixed-case>C</fixed-case>ategorial <fixed-case>G</fixed-case>rammar with Attentive Graph Convolutional Networks + YuanheTian + YanSong + FeiXia + 6037–6044 + 2020.emnlp-main.487 + 2020.emnlp-main.487.OptionalSupplementaryMaterial.zip + + + An Effective Data Augmentation Method for Low-resource Tagging Tasks + BoshengDing + LinlinLiu + LidongBing + CanasaiKruengkrai + Thien HaiNguyen + ShafiqJoty + LuoSi + ChunyanMiao + 6045–6057 + 2020.emnlp-main.488 + + + Interpretable Multi-dataset Evaluation for Named Entity Recognition + JinlanFu + PengfeiLiu + GrahamNeubig + 6058–6069 + 2020.emnlp-main.489 + 2020.emnlp-main.489.OptionalSupplementaryMaterial.zip + + + Adversarial Semantic Decoupling for Recognizing Open-Vocabulary Slots + YuanmengYan + KeqingHe + HongXu + SihongLiu + FanyuMeng + MinHu + WeiranXu + 6070–6075 + 2020.emnlp-main.490 + + + Plug and Play Autoencoders for Conditional Text Generation + FlorianMai + NikolaosPappas + IvanMontero + Noah A.Smith + JamesHenderson + 6076–6092 + 2020.emnlp-main.491 + 2020.emnlp-main.491.OptionalSupplementaryMaterial.zip + + + Structure Aware Negative Sampling in Knowledge Graphs + KianAhrabian + AarashFeizi + YasminSalehi + William L.Hamilton + Avishek JoeyBose + 6093–6101 + 2020.emnlp-main.492 + 2020.emnlp-main.492.OptionalSupplementaryMaterial.zip + + + Neural Mask Generator: Learning to Generate Adaptive Word Maskings for Language Model Adaptation + MinkiKang + MoonsuHan + Sung JuHwang + 6102–6120 + 2020.emnlp-main.493 + + + Autoregressive Knowledge Distillation through Imitation Learning + AlexanderLin + JeremyWohlwend + HowardChen + TaoLei + 6121–6133 + 2020.emnlp-main.494 + + + T3: Tree-Autoencoder Regularized Adversarial Text Generation for Targeted Attack + BoxinWang + HengzhiPei + BoyuanPan + QianChen + ShuohangWang + 
BoLi + 6134–6150 + 2020.emnlp-main.495 + 2020.emnlp-main.495.OptionalSupplementaryMaterial.zip + + + Structured Pruning of Large Language Models + ZihengWang + JeremyWohlwend + TaoLei + 6151–6162 + 2020.emnlp-main.496 + + + Effective Unsupervised Domain Adaptation with Adversarially Trained Language Models + Thuy-TrangVu + DinhPhung + GholamrezaHaffari + 6163–6173 + 2020.emnlp-main.497 + + + <fixed-case>BAE</fixed-case>: <fixed-case>BERT</fixed-case>-based Adversarial Examples for Text Classification + SiddhantGarg + GouthamRamakrishnan + 6174–6181 + 2020.emnlp-main.498 + + + Adversarial Self-Supervised Data Free Distillation for Text Classification + XinyinMa + YongliangShen + GongfanFang + ChenChen + ChenghaoJia + WeimingLu + 6182–6192 + 2020.emnlp-main.499 + 2020.emnlp-main.499.OptionalSupplementaryMaterial.zip + + + <fixed-case>BERT</fixed-case>-<fixed-case>ATTACK</fixed-case>: Adversarial Attack against <fixed-case>BERT</fixed-case> Using <fixed-case>BERT</fixed-case> + LinyangLi + RuotianMa + QipengGuo + XiangyangXue + XipengQiu + 6193–6202 + 2020.emnlp-main.500 + + + The Thieves on Sesame Street Are Polyglots — Extracting Multilingual Models from Monolingual <fixed-case>API</fixed-case>s + Nitish ShirishKeskar + BryanMcCann + CaimingXiong + RichardSocher + 6203–6207 + 2020.emnlp-main.501 + + + When Hearst Is Not Enough: Improving Hypernymy Detection from Corpus with Distributional Models + ChanglongYu + JialongHan + PeifengWang + YangqiuSong + HongmingZhang + WilfredNg + ShumingShi + 6208–6217 + 2020.emnlp-main.502 + + + Interpreting Open-Domain Modifiers: Decomposition of <fixed-case>W</fixed-case>ikipedia Categories into Disambiguated Property-Value Pairs + MariusPasca + 6218–6228 + 2020.emnlp-main.503 + + + A Synset Relation-enhanced Framework with a Try-again Mechanism for Word Sense Disambiguation + MingWang + YinglinWang + 6229–6240 + 2020.emnlp-main.504 + + + Diverse, Controllable, and Keyphrase-Aware: A Corpus and Method for News Multi-Headline 
Generation + DayihengLiu + YeyunGong + YuYan + JieFu + BoShao + DaxinJiang + JianchengLv + NanDuan + 6241–6250 + 2020.emnlp-main.505 + + + Factual Error Correction for Abstractive Summarization Models + MengCao + YueDong + JiapengWu + Jackie Chi KitCheung + 6251–6258 + 2020.emnlp-main.506 + + + Compressive Summarization with Plausibility and Salience Modeling + ShreyDesai + JiachengXu + GregDurrett + 6259–6274 + 2020.emnlp-main.507 + + + Understanding Neural Abstractive Summarization Models via Uncertainty + JiachengXu + ShreyDesai + GregDurrett + 6275–6281 + 2020.emnlp-main.508 + + + Better Highlighting: Creating Sub-Sentence Summary Highlights + SangwooCho + KaiqiangSong + ChenLi + DongYu + HassanForoosh + FeiLiu + 6282–6300 + 2020.emnlp-main.509 + + + Summarizing Text on Any Aspects: A Knowledge-Informed Weakly-Supervised Approach + BowenTan + LianhuiQin + EricXing + ZhitingHu + 6301–6309 + 2020.emnlp-main.510 + + + <fixed-case>BERT</fixed-case>-enhanced Relational Sentence Ordering Network + BaiyunCui + YingmingLi + ZhongfeiZhang + 6310–6320 + 2020.emnlp-main.511 + + + Online Conversation Disentanglement with Pointer Networks + TaoYu + ShafiqJoty + 6321–6330 + 2020.emnlp-main.512 + + + <fixed-case>VCDM</fixed-case>: Leveraging Variational Bi-encoding and Deep Contextualized Word Representations for Improved Definition Modeling + MachelReid + EdisonMarrese-Taylor + YutakaMatsuo + 6331–6344 + 2020.emnlp-main.513 + + + Coarse-to-Fine Pre-training for Named Entity Recognition + XueMengge + BowenYu + ZhenyuZhang + TingwenLiu + YueZhang + BinWang + 6345–6354 + 2020.emnlp-main.514 + + + Exploring and Evaluating Attributes, Values, and Structure for Entity Alignment + ZhiyuanLiu + YixinCao + LiangmingPan + JuanziLi + ZhiyuanLiu + Tat-SengChua + 6355–6364 + 2020.emnlp-main.515 + + + Frustratingly Simple Few-Shot Named Entity Recognition with Structured Nearest Neighbor Learning + YiYang + ArzooKatiyar + 6365–6375 + 2020.emnlp-main.516 + + + Learning Structured 
Representations of Entity Names Using Active Learning and Weak Supervision + KunQian + PoornimaChozhiyath Raman + YunyaoLi + LucianPopa + 6376–6383 + 2020.emnlp-main.517 + + + Entity Enhanced <fixed-case>BERT</fixed-case> Pre-training for <fixed-case>C</fixed-case>hinese <fixed-case>NER</fixed-case> + ChenJia + YuefengShi + QinrongYang + YueZhang + 6384–6396 + 2020.emnlp-main.518 + + + Scalable Zero-shot Entity Linking with Dense Entity Retrieval + LedellWu + FabioPetroni + MartinJosifoski + SebastianRiedel + LukeZettlemoyer + 6397–6407 + 2020.emnlp-main.519 + + + A Dataset for Tracking Entities in Open Domain Procedural Text + NiketTandon + KeisukeSakaguchi + BhavanaDalvi + DheerajRajagopal + PeterClark + MichalGuerquin + KyleRichardson + EduardHovy + 6408–6417 + 2020.emnlp-main.520 + + + Design Challenges in Low-resource Cross-lingual Entity Linking + XingyuFu + WeijiaShi + XiaodongYu + ZianZhao + DanRoth + 6418–6432 + 2020.emnlp-main.521 + + + Efficient One-Pass End-to-End Entity Linking for Questions + Belinda Z.Li + SewonMin + SrinivasanIyer + YasharMehdad + Wen-tauYih + 6433–6441 + 2020.emnlp-main.522 + + + <fixed-case>LUKE</fixed-case>: Deep Contextualized Entity Representations with Entity-aware Self-attention + IkuyaYamada + AkariAsai + HiroyukiShindo + HideakiTakeda + YujiMatsumoto + 6442–6454 + 2020.emnlp-main.523 + + + Generating Similes <fixed-case>E</fixed-case>̶<fixed-case>F</fixed-case>̶<fixed-case>F</fixed-case>̶<fixed-case>O</fixed-case>̶<fixed-case>R</fixed-case>̶<fixed-case>T</fixed-case>̶<fixed-case>L</fixed-case>̶<fixed-case>E</fixed-case>̶<fixed-case>S</fixed-case>̶<fixed-case>S</fixed-case>̶<fixed-case>L</fixed-case>̶<fixed-case>Y</fixed-case>̶ 𝘭𝘪𝘬𝘦 𝘢 <fixed-case>𝘗</fixed-case>𝘳𝘰: A Style Transfer Approach for Simile Generation + TuhinChakrabarty + SmarandaMuresan + NanyunPeng + 6455–6469 + 2020.emnlp-main.524 + + + <fixed-case>HUSH</fixed-case>: A Dataset and Platform for Human-in-the-Loop Story Generation + NaderAkoury + ShufanWang + 
JoshWhiting + StephenHood + NanyunPeng + MohitIyyer + 6470–6484 + 2020.emnlp-main.525 + + + Substance over Style: Document-Level Targeted Content Transfer + AllisonHegel + SudhaRao + AsliCelikyilmaz + BillDolan + 6485–6504 + 2020.emnlp-main.526 + + + Template Guided Text Generation for Task Oriented Dialogue + MihirKale + AbhinavRastogi + 6505–6520 + 2020.emnlp-main.527 + + + <fixed-case>MOCHA</fixed-case>: A Dataset for Training and Evaluating Generative Reading Comprehension Metrics + AnthonyChen + GabrielStanovsky + SameerSingh + MattGardner + 6521–6532 + 2020.emnlp-main.528 + + + Self-Supervised Text Planning for Paragraph Completion Task + DongyeopKang + EduardHovy + 6533–6543 + 2020.emnlp-main.529 + 2020.emnlp-main.529.OptionalSupplementaryMaterial.pdf + + + Inquisitive Question Generation for High Level Text Comprehension + Wei-JenKo + Te-yuanChen + YiyanHuang + GregDurrett + Junyi JessyLi + 6544–6555 + 2020.emnlp-main.530 + + + Towards Persona-Based Empathetic Conversational Models + PeixiangZhong + ChenZhang + HaoWang + YongLiu + ChunyanMiao + 6556–6566 + 2020.emnlp-main.531 + + + Personal Information Leakage Detection in Conversations + QiongkaiXu + LizhenQu + ZeyuGao + GholamrezaHaffari + 6567–6580 + 2020.emnlp-main.532 + + + Response Selection for Multi-Party Conversations with Dynamic Topic Tracking + WeishiWang + Steven C.H.Hoi + ShafiqJoty + 6581–6591 + 2020.emnlp-main.533 + + + Regularizing Dialogue Generation by Imitating Implicit Scenarios + ShaoxiongFeng + XuanchengRen + HongshenChen + BinSun + KanLi + XuSun + 6592–6604 + 2020.emnlp-main.534 + + + <fixed-case>M</fixed-case>ovie<fixed-case>C</fixed-case>hats: Chat like Humans in a Closed Domain + HuiSu + XiaoyuShen + ZhouXiao + ZhengZhang + ErnieChang + ChengZhang + ChengNiu + JieZhou + 6605–6619 + 2020.emnlp-main.535 + 2020.emnlp-main.535.OptionalSupplementaryMaterial.pdf + + + Conundrums in Entity Reference Resolution + JingLu + VincentNg + 6620–6631 + 2020.emnlp-main.536 + + + Semantic Role 
Labeling Guided Multi-turn Dialogue <fixed-case>R</fixed-case>e<fixed-case>W</fixed-case>riter + KunXu + HaochenTan + LinfengSong + HanWu + HaisongZhang + LinqiSong + DongYu + 6632–6639 + 2020.emnlp-main.537 + + + Continuity of Topic, Interaction, and Query: Learning to Quote in Online Conversations + LingzhiWang + JingLi + XingshanZeng + HaisongZhang + Kam-FaiWong + 6640–6650 + 2020.emnlp-main.538 + + + Profile Consistency Identification for Open-domain Dialogue Agents + HaoyuSong + YanWang + Wei-NanZhang + ZhengyuZhao + TingLiu + XiaojiangLiu + 6651–6662 + 2020.emnlp-main.539 + + + An Element-aware Multi-representation Model for Law Article Prediction + HuilinZhong + JunshengZhou + WeiguangQu + YunfeiLong + YanhuiGu + 6663–6668 + 2020.emnlp-main.540 + + + Recurrent Event Network: Autoregressive Structure Inference over Temporal Knowledge Graphs + WoojeongJin + MengQu + XisenJin + XiangRen + 6669–6683 + 2020.emnlp-main.541 + + + Multi-resolution Annotations for Emoji Prediction + WeichengMa + RuiboLiu + LiliWang + SoroushVosoughi + 6684–6694 + 2020.emnlp-main.542 + + + Less Is More: Attention Supervision with Counterfactuals for Text Classification + SeungtaekChoi + HaejuPark + JinyoungYeo + Seung-wonHwang + 6695–6704 + 2020.emnlp-main.543 + + + <fixed-case>MODE</fixed-case>-<fixed-case>LSTM</fixed-case>: A Parameter-efficient Recurrent Network with Multi-Scale for Sentence Classification + QianliMa + ZhenxiLin + JiangyueYan + ZipengChen + LiuhongYu + 6705–6715 + 2020.emnlp-main.544 + + + <fixed-case>MSCNN</fixed-case>: A Monomeric-<fixed-case>S</fixed-case>iamese Convolutional Neural Network for Extremely Imbalanced Multi-label Text Classification + WenshuoYang + JiyiLi + FumiyoFukumoto + YanmingYe + 6716–6722 + 2020.emnlp-main.545 + + + Multi-Stage Pre-training for Automated <fixed-case>C</fixed-case>hinese Essay Scoring + WeiSong + KaiZhang + RuijiFu + LizhenLiu + TingLiu + MiaomiaoCheng + 6723–6733 + 2020.emnlp-main.546 + + + Multi-hop Inference for 
Question-driven Summarization + YangDeng + WenxuanZhang + WaiLam + 6734–6744 + 2020.emnlp-main.547 + + + Towards Interpretable Reasoning over Paragraph Effects in Situation + MuchengRen + XiuboGeng + TaoQin + HeyanHuang + DaxinJiang + 6745–6758 + 2020.emnlp-main.548 + + + Question Directed Graph Attention Network for Numerical Reasoning over Text + KunlongChen + WeidiXu + XingyiCheng + ZouXiaochuan + YuyuZhang + LeSong + TaifengWang + YuanQi + WeiChu + 6759–6768 + 2020.emnlp-main.549 + + + Dense Passage Retrieval for Open-Domain Question Answering + VladimirKarpukhin + BarlasOguz + SewonMin + PatrickLewis + LedellWu + SergeyEdunov + DanqiChen + Wen-tauYih + 6769–6781 + 2020.emnlp-main.550 + + + Distilling Structured Knowledge for Text-Based Relational Reasoning + JinDong + Marc-AntoineRondeau + William L.Hamilton + 6782–6791 + 2020.emnlp-main.551 + + + Asking without Telling: Exploring Latent Ontologies in Contextual Representations + JulianMichael + Jan A.Botha + IanTenney + 6792–6812 + 2020.emnlp-main.552 + + + Pretrained Language Model Embryology: The Birth of <fixed-case>ALBERT</fixed-case> + Cheng-HanChiang + Sung-FengHuang + Hung-yiLee + 6813–6828 + 2020.emnlp-main.553 + 2020.emnlp-main.553.OptionalSupplementaryMaterial.zip + + + Learning Music Helps You Read: Using Transfer to Study Linguistic Structure in Language Models + IsabelPapadimitriou + DanJurafsky + 6829–6839 + 2020.emnlp-main.554 + + + What Do Position Embeddings Learn? An Empirical Study of Pre-Trained Language Model Positional Encoding + Yu-AnWang + Yun-NungChen + 6840–6849 + 2020.emnlp-main.555 + + + “You Are Grounded!”: Latent Name Artifacts in Pre-trained Language Models + VeredShwartz + RachelRudinger + OyvindTafjord + 6850–6861 + 2020.emnlp-main.556 + + + Birds Have Four Legs?! 
<fixed-case>N</fixed-case>umer<fixed-case>S</fixed-case>ense: Probing Numerical Commonsense Knowledge of Pre-trained Language Models + Bill YuchenLin + SeyeonLee + RahulKhanna + XiangRen + 6862–6868 + 2020.emnlp-main.557 + + + Grounded Adaptation for Zero-shot Executable Semantic Parsing + VictorZhong + MikeLewis + Sida I.Wang + LukeZettlemoyer + 6869–6882 + 2020.emnlp-main.558 + 2020.emnlp-main.558.OptionalSupplementaryMaterial.zip + + + An Imitation Game for Learning Semantic Parsers from User Interaction + ZiyuYao + YiqiTang + Wen-tauYih + HuanSun + YuSu + 6883–6902 + 2020.emnlp-main.559 + + + <fixed-case>IGSQL</fixed-case>: Database Schema Interaction Graph Based Neural Model for Context-Dependent Text-to-<fixed-case>SQL</fixed-case> Generation + YitaoCai + XiaojunWan + 6903–6912 + 2020.emnlp-main.560 + + + “What Do You Mean by That?” - a Parser-Independent Interactive Approach for Enhancing Text-to-<fixed-case>SQL</fixed-case> + YuntaoLi + BeiChen + QianLiu + YanGao + Jian-GuangLou + YanZhang + DongmeiZhang + 6913–6922 + 2020.emnlp-main.561 + + + <fixed-case>C</fixed-case>hi<fixed-case>T</fixed-case>e<fixed-case>SQL</fixed-case>: A Large-Scale and Pragmatic <fixed-case>C</fixed-case>hinese Text-to-<fixed-case>SQL</fixed-case> Dataset + LijieWang + AoZhang + KunWu + KeSun + ZhenghuaLi + HuaWu + MinZhang + HaifengWang + 6923–6935 + 2020.emnlp-main.562 + + + Mention Extraction and Linking for <fixed-case>SQL</fixed-case> Query Generation + JianqiangMa + ZeyuYan + ShuaiPang + YangZhang + JianpingShen + 6936–6942 + 2020.emnlp-main.563 + + + Re-examining the Role of Schema Linking in Text-to-<fixed-case>SQL</fixed-case> + WenqiangLei + WeixinWang + ZhixinMa + TianGan + WeiLu + Min-YenKan + Tat-SengChua + 6943–6954 + 2020.emnlp-main.564 + + + A Multi-Task Incremental Learning Framework with Category Name Embedding for Aspect-Category Sentiment Analysis + ZehuiDai + ChengPeng + HuajieChen + YadongDing + 6955–6965 + 2020.emnlp-main.565 + 
2020.emnlp-main.565.OptionalSupplementaryMaterial.zip + + + Train No Evil: Selective Masking for Task-guided Pre-training + YuxianGu + ZhengyanZhang + XiaozhiWang + ZhiyuanLiu + MaosongSun + 6966–6974 + 2020.emnlp-main.566 + + + <fixed-case>S</fixed-case>enti<fixed-case>LARE</fixed-case>: Linguistic Knowledge Enhanced Language Representation for Sentiment Analysis + PeiKe + HaozheJi + SiyangLiu + XiaoyanZhu + MinlieHuang + 6975–6988 + 2020.emnlp-main.567 + + + Aspect-Based Sentiment Analysis by Aspect-Sentiment Joint Embedding + JiaxinHuang + YuMeng + FangGuo + HengJi + JiaweiHan + 6989–6999 + 2020.emnlp-main.568 + + + Argument Pair Extraction from Peer Review and Rebuttal via Multi-task Learning + LiyingCheng + LidongBing + QianYu + WeiLu + LuoSi + 7000–7011 + 2020.emnlp-main.569 + + + <fixed-case>D</fixed-case>iversifi<fixed-case>E</fixed-case>d Multiple Instance Learning for Document-Level Multi-Aspect Sentiment <fixed-case>C</fixed-case>lassifi<fixed-case>C</fixed-case>ation + YunjieJi + HaoLiu + BoleiHe + XinyanXiao + HuaWu + YanhuaYu + 7012–7023 + 2020.emnlp-main.570 + + + An Empirical Study of Hyperbole + LiKong + ChuanyiLi + JidongGe + BinLuo + VincentNg + 7024–7034 + 2020.emnlp-main.571 + + + Unified Feature and Instance Based Domain Adaptation for End-to-End Aspect-based Sentiment Analysis + ChenggongGong + JianfeiYu + RuiXia + 7035–7045 + 2020.emnlp-main.572 + + + Compositional and Lexical Semantics in <fixed-case>R</fixed-case>o<fixed-case>BERT</fixed-case>a, <fixed-case>BERT</fixed-case> and <fixed-case>D</fixed-case>istil<fixed-case>BERT</fixed-case>: A Case Study on <fixed-case>C</fixed-case>o<fixed-case>QA</fixed-case> + IevaStaliūnaitė + IgnacioIacobacci + 7046–7056 + 2020.emnlp-main.573 + + + Attention Is Not Only a Weight: Analyzing Transformers with Vector Norms + GoroKobayashi + TatsukiKuribayashi + ShoYokoi + KentaroInui + 7057–7075 + 2020.emnlp-main.574 + + + F1 Is Not Enough! 
Models and Evaluation towards User-Centered Explainable Question Answering + HendrikSchuff + HeikeAdel + Ngoc ThangVu + 7076–7095 + 2020.emnlp-main.575 + + + On the Ability of Self-Attention Networks to Recognize Counter Languages + SatwikBhattamishra + KabirAhuja + NavinGoyal + 7096–7116 + 2020.emnlp-main.576 + 2020.emnlp-main.576.OptionalSupplementaryMaterial.zip + + + An Unsupervised Joint System for Text Generation from Knowledge Graphs and Semantic Parsing + MartinSchmitt + SahandSharifzadeh + VolkerTresp + HinrichSchütze + 7117–7130 + 2020.emnlp-main.577 + + + A Dual-generator Network for Text Style Transfer Applications + XiaoLi + GuanyiChen + ChenghuaLin + RuizheLi + 7131–7136 + 2020.emnlp-main.578 + + + A Knowledge-Aware Sequence-to-Tree Network for Math Word Problem Solving + QinzhuoWu + QiZhang + JinlanFu + XuanjingHuang + 7137–7146 + 2020.emnlp-main.579 + + + Generating Fact Checking Briefs + AngelaFan + AleksandraPiktus + FabioPetroni + GuillaumeWenzek + MarziehSaeidi + AndreasVlachos + AntoineBordes + SebastianRiedel + 7147–7161 + 2020.emnlp-main.580 + + + Improving the Efficiency of Grammatical Error Correction with Erroneous Span Detection and Correction + MengyunChen + TaoGe + XingxingZhang + FuruWei + MingZhou + 7162–7169 + 2020.emnlp-main.581 + + + Coreferential Reasoning Learning for Language Representation + DemingYe + YankaiLin + JiajuDu + ZhenghaoLiu + PengLi + MaosongSun + ZhiyuanLiu + 7170–7186 + 2020.emnlp-main.582 + + + Is Graph Structure Necessary for Multi-hop Question Answering? 
+ NanShao + YimingCui + TingLiu + ShijinWang + GuopingHu + 7187–7192 + 2020.emnlp-main.583 + + + <fixed-case>XL</fixed-case>-<fixed-case>W</fixed-case>i<fixed-case>C</fixed-case>: A Multilingual Benchmark for Evaluating Semantic Contextualization + AlessandroRaganato + TommasoPasini + JoseCamacho-Collados + Mohammad TaherPilehvar + 7193–7206 + 2020.emnlp-main.584 + + + Generationary or: “How We Went beyond Word Sense Inventories and Learned to Gloss” + MicheleBevilacqua + MarcoMaru + RobertoNavigli + 7207–7221 + 2020.emnlp-main.585 + + + Probing Pretrained Language Models for Lexical Semantics + IvanVulić + Edoardo MariaPonti + RobertLitschko + GoranGlavaš + AnnaKorhonen + 7222–7240 + 2020.emnlp-main.586 + 2020.emnlp-main.586.OptionalSupplementaryMaterial.zip + + + Cross-lingual Spoken Language Understanding with Regularized Representation Alignment + ZihanLiu + Genta IndraWinata + PengXu + ZhaojiangLin + PascaleFung + 7241–7251 + 2020.emnlp-main.587 + + + <fixed-case>SLURP</fixed-case>: A Spoken Language Understanding Resource Package + EmanueleBastianelli + AndreaVanzo + PawelSwietojanski + VerenaRieser + 7252–7262 + 2020.emnlp-main.588 + + + Neural Conversational <fixed-case>QA</fixed-case>: Learning to Reason vs Exploiting Patterns + NikhilVerma + AbhishekSharma + DhirajMadan + DanishContractor + HarshitKumar + SachindraJoshi + 7263–7269 + 2020.emnlp-main.589 + + + Counterfactual Generator: A Weakly-Supervised Method for Named Entity Recognition + XiangjiZeng + YunliangLi + YuchenZhai + YinZhang + 7270–7280 + 2020.emnlp-main.590 + + + Understanding Procedural Text Using Interactive Entity Networks + JizhiTang + YansongFeng + DongyanZhao + 7281–7290 + 2020.emnlp-main.591 + + + A Rigorous Study on Named Entity Recognition: Can Fine-tuning Pretrained Model Lead to the Promised Land? 
+ HongyuLin + YaojieLu + JialongTang + XianpeiHan + LeSun + ZhichengWei + Nicholas JingYuan + 7291–7300 + 2020.emnlp-main.592 + + + <fixed-case>D</fixed-case>y<fixed-case>ERNIE</fixed-case>: Dynamic Evolution of <fixed-case>R</fixed-case>iemannian Manifold Embeddings for Temporal Knowledge Graph Completion + ZhenHan + PengChen + YunpuMa + VolkerTresp + 7301–7316 + 2020.emnlp-main.593 + + + Embedding Words in Non-Vector Space with Unsupervised Graph Learning + MaxRyabinin + SergeiPopov + LiudmilaProkhorenkova + ElenaVoita + 7317–7331 + 2020.emnlp-main.594 + + + Debiasing Knowledge Graph Embeddings + JosephFisher + ArpitMittal + DavePalfrey + ChristosChristodoulopoulos + 7332–7345 + 2020.emnlp-main.595 + + + Message Passing for Hyper-Relational Knowledge Graphs + MikhailGalkin + PriyanshTrivedi + GauravMaheshwari + RicardoUsbeck + JensLehmann + 7346–7359 + 2020.emnlp-main.596 + 2020.emnlp-main.596.OptionalSupplementaryMaterial.zip + + + Relation-aware Graph Attention Networks with Relational Position Encodings for Emotion Recognition in Conversations + TaichiIshiwatari + YukiYasuda + TaroMiyazaki + JunGoto + 7360–7370 + 2020.emnlp-main.597 + + + <fixed-case>BERT</fixed-case> Knows Punta Cana Is Not Just Beautiful, It’s Gorgeous: Ranking Scalar Adjectives with Contextualised Representations + AinaGarí Soler + MariannaApidianaki + 7371–7385 + 2020.emnlp-main.598 + + + Feature Adaptation of Pre-Trained Language Models across Languages and Domains with Robust Self-Training + HaiYe + QingyuTan + RuidanHe + JuntaoLi + Hwee TouNg + LidongBing + 7386–7399 + 2020.emnlp-main.599 + + + Textual Data Augmentation for Efficient Active Learning on Tiny Datasets + HusamQuteineh + SpyridonSamothrakis + RichardSutcliffe + 7400–7410 + 2020.emnlp-main.600 + + + “<fixed-case>I</fixed-case>’<fixed-case>D</fixed-case> Rather Just Go to Bed”: Understanding Indirect Answers + AnnieLouis + DanRoth + FilipRadlinski + 7411–7425 + 2020.emnlp-main.601 + + + 
<fixed-case>P</fixed-case>ower<fixed-case>T</fixed-case>ransformer: Unsupervised Controllable Revision for Biased Language Correction + XinyaoMa + MaartenSap + HannahRashkin + YejinChoi + 7426–7441 + 2020.emnlp-main.602 + + + <fixed-case>MEGA</fixed-case> <fixed-case>RST</fixed-case> Discourse Treebanks with Structure and Nuclearity from Scalable Distant Sentiment Supervision + PatrickHuber + GiuseppeCarenini + 7442–7457 + 2020.emnlp-main.603 + + + A Centering Approach for Discourse Structure-aware Coherence Modeling + SunghoJeon + MichaelStrube + 7458–7472 + 2020.emnlp-main.604 + + + Keeping up Appearances: Computational Modeling of Face Acts in Persuasion Oriented Discussions + RitamDutt + RishabhJoshi + CarolynRose + 7473–7485 + 2020.emnlp-main.605 + + + <fixed-case>HABERTOR</fixed-case>: An Efficient and Effective Deep Hatespeech Detector + ThanhTran + YifanHu + ChangweiHu + KevinYen + FeiTan + KyuminLee + Se RimPark + 7486–7502 + 2020.emnlp-main.606 + + + An Empirical Study on Large-Scale Multi-Label Text Classification Including Few and Zero-Shot Labels + IliasChalkidis + ManosFergadiotis + SotirisKotitsas + ProdromosMalakasiotis + NikolaosAletras + IonAndroutsopoulos + 7503–7515 + 2020.emnlp-main.607 + + + Which *<fixed-case>BERT</fixed-case>? 
A Survey Organizing Contextualized Encoders + PatrickXia + ShijieWu + BenjaminVan Durme + 7516–7533 + 2020.emnlp-main.608 + + + Fact or Fiction: Verifying Scientific Claims + DavidWadden + ShanchuanLin + KyleLo + Lucy LuWang + Madeleinevan Zuylen + ArmanCohan + HannanehHajishirzi + 7534–7550 + 2020.emnlp-main.609 + + + Semantic Role Labeling as Syntactic Dependency Parsing + TianzeShi + IgorMalioutov + OzanIrsoy + 7551–7571 + 2020.emnlp-main.610 + + + <fixed-case>PARADE</fixed-case>: A New Dataset for Paraphrase Identification Requiring Computer Science Domain Knowledge + YunHe + ZhuoerWang + YinZhang + RuihongHuang + JamesCaverlee + 7572–7582 + 2020.emnlp-main.611 + + + Causal Inference of Script Knowledge + NoahWeber + RachelRudinger + BenjaminVan Durme + 7583–7596 + 2020.emnlp-main.612 + + + Towards Debiasing <fixed-case>NLU</fixed-case> Models from Unknown Biases + Prasetya AjieUtama + Nafise SadatMoosavi + IrynaGurevych + 7597–7610 + 2020.emnlp-main.613 + + + On the Role of Supervision in Unsupervised Constituency Parsing + HaoyueShi + KarenLivescu + KevinGimpel + 7611–7621 + 2020.emnlp-main.614 + + + Language Model Prior for Low-Resource Neural Machine Translation + ChristosBaziotis + BarryHaddow + AlexandraBirch + 7622–7634 + 2020.emnlp-main.615 + + + Towards Detecting and Exploiting Disambiguation Biases in Neural Machine Translation + DenisEmelin + IvanTitov + RicoSennrich + 7635–7653 + 2020.emnlp-main.616 + + + <fixed-case>MAD</fixed-case>-<fixed-case>X</fixed-case>: An Adapter-based Framework for Multi-task Cross-lingual Transfer + JonasPfeiffer + IvanVulić + IrynaGurevych + SebastianRuder + 7654–7673 + 2020.emnlp-main.617 + + + Translation Artifacts in Cross-lingual Transfer Learning + MikelArtetxe + GorkaLabaka + EnekoAgirre + 7674–7684 + 2020.emnlp-main.618 + + + A Time-Aware Transformer Based Model for Suicide Ideation Detection on Social Media + RamitSawhney + HarshitJoshi + SaumyaGandhi + Rajiv RatnShah + 7685–7697 + 2020.emnlp-main.619 + + + 
Weakly Supervised Learning of Nuanced Frames for Analyzing Polarization in News Media + ShamikRoy + DanGoldwasser + 7698–7716 + 2020.emnlp-main.620 + 2020.emnlp-main.620.OptionalSupplementaryMaterial.zip + + + Where Are the Facts? Searching for Fact-checked Information to Alleviate the Spread of Fake News + NguyenVo + KyuminLee + 7717–7731 + 2020.emnlp-main.621 + + + Fortifying Toxic Speech Detectors against Disguised Toxicity + XiaochuangHan + YuliaTsvetkov + 7732–7739 + 2020.emnlp-main.622 + + + Explainable Automated Fact-Checking for Public Health Claims + NeemaKotonya + FrancescaToni + 7740–7754 + 2020.emnlp-main.623 + + + Interactive Fiction Game Playing as Multi-Paragraph Reading Comprehension with Reinforcement Learning + XiaoxiaoGuo + MoYu + YupengGao + ChuangGan + MurrayCampbell + ShiyuChang + 7755–7765 + 2020.emnlp-main.624 + 2020.emnlp-main.624.OptionalSupplementaryMaterial.zip + + + <fixed-case>DORB</fixed-case>: Dynamically Optimizing Multiple Rewards with Bandits + RamakanthPasunuru + HanGuo + MohitBansal + 7766–7780 + 2020.emnlp-main.625 + + + Improving Detection and Categorization of Task-relevant Utterances through Integration of Discourse Structure and Ontological Knowledge + SopanKhosla + ShikharVashishth + Jill FainLehman + CarolynRose + 7781–7797 + 2020.emnlp-main.626 + 2020.emnlp-main.626.OptionalSupplementaryMaterial.zip + + + Hierarchical Evidence Set Modeling for Automated Fact Extraction and Verification + ShyamSubramanian + KyuminLee + 7798–7809 + 2020.emnlp-main.627 + + + Program Enhanced Fact Verification with Verbalization and Graph Attention Network + XiaoyuYang + FengNie + YufeiFeng + QuanLiu + ZhigangChen + XiaodanZhu + 7810–7825 + 2020.emnlp-main.628 + 2020.emnlp-main.628.OptionalSupplementaryMaterial.zip + + + Constrained Fact Verification for <fixed-case>FEVER</fixed-case> + AdithyaPratapa + Sai MuralidharJayanthi + KavyaNerella + 7826–7832 + 2020.emnlp-main.629 + + + Entity Linking in 100 Languages + Jan A.Botha + ZifeiShan + 
DanielGillick + 7833–7845 + 2020.emnlp-main.630 + + + <fixed-case>P</fixed-case>atch<fixed-case>BERT</fixed-case>: Just-in-Time, Out-of-Vocabulary Patching + SangwhanMoon + NaoakiOkazaki + 7846–7852 + 2020.emnlp-main.631 + + + On the Importance of Pre-training Data Volume for Compact Language Models + VincentMicheli + Martind’Hoffschmidt + FrançoisFleuret + 7853–7858 + 2020.emnlp-main.632 + + + <fixed-case>BERT</fixed-case>-of-Theseus: Compressing <fixed-case>BERT</fixed-case> by Progressive Module Replacing + CanwenXu + WangchunshuZhou + TaoGe + FuruWei + MingZhou + 7859–7869 + 2020.emnlp-main.633 + + + Recall and Learn: Fine-tuning Deep Pretrained Language Models with Less Forgetting + SanyuanChen + YutaiHou + YimingCui + WanxiangChe + TingLiu + XiangzhanYu + 7870–7881 + 2020.emnlp-main.634 + 2020.emnlp-main.634.OptionalSupplementaryMaterial.zip + + + Exploring and Predicting Transferability across <fixed-case>NLP</fixed-case> Tasks + TuVu + TongWang + TsendsurenMunkhdalai + AlessandroSordoni + AdamTrischler + AndrewMattarella-Micke + SubhransuMaji + MohitIyyer + 7882–7926 + 2020.emnlp-main.635 + + + To <fixed-case>BERT</fixed-case> or Not to <fixed-case>BERT</fixed-case>: Comparing Task-specific and Task-agnostic Semi-Supervised Approaches for Sequence Tagging + KasturiBhattacharjee + MiguelBallesteros + RishitaAnubhai + SmarandaMuresan + JieMa + FaisalLadhak + YaserAl-Onaizan + 7927–7934 + 2020.emnlp-main.636 + + + Cold-start Active Learning through Self-Supervised Language Modeling + MichelleYuan + Hsuan-TienLin + JordanBoyd-Graber + 7935–7948 + 2020.emnlp-main.637 + + + Active Learning for <fixed-case>BERT</fixed-case>: An Empirical Study + LiatEin-Dor + AlonHalfon + ArielGera + EyalShnarch + LenaDankin + LeshemChoshen + MarinaDanilevsky + RanitAharonov + YoavKatz + NoamSlonim + 7949–7962 + 2020.emnlp-main.638 + + + Transformer Based Multi-Source Domain Adaptation + DustinWright + IsabelleAugenstein + 7963–7974 + 2020.emnlp-main.639 + + + Vector-Vector-Matrix 
Architecture: A Novel Hardware-Aware Framework for Low-Latency Inference in <fixed-case>NLP</fixed-case> Applications + MatthewKhoury + RumenDangovski + LongwuOu + PreslavNakov + YichenShen + LiJing + 7975–7984 + 2020.emnlp-main.640 + + + The Importance of Fillers for Text Representations of Speech Transcripts + TanviDinkar + PierreColombo + MatthieuLabeau + ChloéClavel + 7985–7993 + 2020.emnlp-main.641 + + + The Role of Context in Neural Pitch Accent Detection in <fixed-case>E</fixed-case>nglish + ElizabethNielsen + MarkSteedman + SharonGoldwater + 7994–8000 + 2020.emnlp-main.642 + + + <fixed-case>V</fixed-case>ol<fixed-case>TAGE</fixed-case>: Volatility Forecasting via Text-Audio Fusion with Graph Convolution Networks for Earnings Calls + RamitSawhney + PiyushKhanna + ArshiyaAggarwal + TaruJain + PuneetMathur + Rajiv RatnShah + 8001–8013 + 2020.emnlp-main.643 + + + Effectively Pretraining a Speech Translation Decoder with Machine Translation Data + AshkanAlinejad + AnoopSarkar + 8014–8020 + 2020.emnlp-main.644 + + + A Preliminary Exploration of <fixed-case>GAN</fixed-case>s for Keyphrase Generation + AvinashSwaminathan + HaiminZhang + DebanjanMahata + RakeshGosangi + Rajiv RatnShah + AmandaStent + 8021–8030 + 2020.emnlp-main.645 + + + <fixed-case>TESA</fixed-case>: A Task in Entity Semantic Aggregation for Abstractive Summarization + ClémentJumel + AnnieLouis + Jackie Chi KitCheung + 8031–8050 + 2020.emnlp-main.646 + + + <fixed-case>MLSUM</fixed-case>: The Multilingual Summarization Corpus + ThomasScialom + Paul-AlexisDray + SylvainLamprier + BenjaminPiwowarski + JacopoStaiano + 8051–8067 + 2020.emnlp-main.647 + + + Multi-<fixed-case>XS</fixed-case>cience: A Large-scale Dataset for Extreme Multi-document Summarization of Scientific Articles + YaoLu + YueDong + LaurentCharlin + 8068–8074 + 2020.emnlp-main.648 + + + Intrinsic Evaluation of Summarization Datasets + RishiBommasani + ClaireCardie + 8075–8096 + 2020.emnlp-main.649 + + + Iterative Feature Mining for 
Constraint-Based Data Collection to Increase Data Diversity and Model Robustness + StefanLarson + AnthonyZheng + AnishMahendran + RishiTekriwal + AdrianCheung + EricGuldan + KevinLeach + Jonathan K.Kummerfeld + 8097–8106 + 2020.emnlp-main.650 + 2020.emnlp-main.650.OptionalSupplementaryMaterial.zip + + + Conversational Semantic Parsing for Dialog State Tracking + JianpengCheng + DevangAgrawal + HéctorMartínez Alonso + ShrutiBhargava + JorisDriesen + FedericoFlego + DainKaplan + DimitriKartsaklis + LinLi + DhivyaPiraviperumal + Jason D.Williams + HongYu + DiarmuidÓ Séaghdha + AndersJohannsen + 8107–8117 + 2020.emnlp-main.651 + + + <fixed-case>D</fixed-case>oc2<fixed-case>D</fixed-case>ial: A Goal-Oriented Document-Grounded Dialogue Dataset + SongFeng + HuiWan + ChulakaGunasekara + SivaPatel + SachindraJoshi + LuisLastras + 8118–8128 + 2020.emnlp-main.652 + + + Interview: Large-scale Modeling of Media Dialog with Discourse Patterns and Knowledge Grounding + Bodhisattwa PrasadMajumder + ShuyangLi + JianmoNi + JulianMcAuley + 8129–8141 + 2020.emnlp-main.653 + + + <fixed-case>INSPIRED</fixed-case>: Toward Sociable Recommendation Dialog Systems + Shirley AnugrahHayati + DongyeopKang + QingxiaoyangZhu + WeiyanShi + ZhouYu + 8142–8152 + 2020.emnlp-main.654 + 2020.emnlp-main.654.OptionalSupplementaryMaterial.zip + + + Information Seeking in the Spirit of Learning: A Dataset for Conversational Curiosity + PedroRodriguez + PaulCrook + SeungwhanMoon + ZhiguangWang + 8153–8172 + 2020.emnlp-main.655 + + + Queens Are Powerful Too: Mitigating Gender Bias in Dialogue Generation + EmilyDinan + AngelaFan + AdinaWilliams + JackUrbanek + DouweKiela + JasonWeston + 8173–8188 + 2020.emnlp-main.656 + + + Discriminatively-Tuned Generative Classifiers for Robust Natural Language Inference + XiaoanDing + TianyuLiu + BaobaoChang + ZhifangSui + KevinGimpel + 8189–8202 + 2020.emnlp-main.657 + + + Collecting Entailment Data for Pretraining: New Protocols and Negative Results + Samuel R.Bowman + 
JennimariaPalomaki + LivioBaldini Soares + EmilyPitler + 8203–8214 + 2020.emnlp-main.658 + + + The Curse of Performance Instability in Analysis Datasets: Consequences, Source, and Suggestions + XiangZhou + YixinNie + HaoTan + MohitBansal + 8215–8228 + 2020.emnlp-main.659 + + + Universal Natural Language Processing with Limited Annotations: Try Few-shot Textual Entailment as a Start + WenpengYin + Nazneen FatemaRajani + DragomirRadev + RichardSocher + CaimingXiong + 8229–8239 + 2020.emnlp-main.660 + + + <fixed-case>C</fixed-case>onj<fixed-case>NLI</fixed-case>: Natural Language Inference over Conjunctive Sentences + SwarnadeepSaha + YixinNie + MohitBansal + 8240–8252 + 2020.emnlp-main.661 + + + Data and Representation for <fixed-case>T</fixed-case>urkish Natural Language Inference + EmrahBudur + RızaÖzçelik + TungaGungor + ChristopherPotts + 8253–8267 + 2020.emnlp-main.662 + + + Multitask Learning for Cross-Lingual Transfer of Broad-coverage Semantic Dependencies + MaryamAminian + Mohammad SadeghRasooli + MonaDiab + 8268–8274 + 2020.emnlp-main.663 + + + Precise Task Formalization Matters in <fixed-case>W</fixed-case>inograd Schema Evaluations + HaokunLiu + WilliamHuang + DharaMungra + Samuel R.Bowman + 8275–8280 + 2020.emnlp-main.664 + + + Gone at Last: Removing the Hypothesis-Only Bias in Natural Language Inference via Ensemble Adversarial Training + JoeStacey + PasqualeMinervini + HaimDubossarsky + SebastianRiedel + TimRocktäschel + 8281–8291 + 2020.emnlp-main.665 + 2020.emnlp-main.665.OptionalSupplementaryMaterial.zip + + + <fixed-case>S</fixed-case>yn<fixed-case>S</fixed-case>et<fixed-case>E</fixed-case>xpan: An Iterative Framework for Joint Entity Set Expansion and Synonym Discovery + JiamingShen + WendaQiu + JingboShang + MichelleVanni + XiangRen + JiaweiHan + 8292–8307 + 2020.emnlp-main.666 + + + Evaluating the Calibration of Knowledge Graph Embeddings for Trustworthy Link Prediction + TaraSafavi + DanaiKoutra + EdgarMeij + 8308–8321 + 2020.emnlp-main.667 + + 
+ Text Graph Transformer for Document Classification + HaopengZhang + JiaweiZhang + 8322–8327 + 2020.emnlp-main.668 + + + <fixed-case>C</fixed-case>o<fixed-case>DE</fixed-case>x: A Comprehensive Knowledge Graph Completion Benchmark + TaraSafavi + DanaiKoutra + 8328–8350 + 2020.emnlp-main.669 + + + <fixed-case>META</fixed-case>: Metadata-Empowered Weak Supervision for Text Classification + DheerajMekala + XinyangZhang + JingboShang + 8351–8361 + 2020.emnlp-main.670 + + + Towards More Accurate Uncertainty Estimation in Text Classification + JianfengHe + XuchaoZhang + ShuoLei + ZhiqianChen + FanglanChen + AbdulazizAlhamadani + BeiXiao + ChangTienLu + 8362–8372 + 2020.emnlp-main.671 + 2020.emnlp-main.671.OptionalSupplementaryMaterial.zip + + + Chapter Captor: Text Segmentation in Novels + CharutaPethe + AllenKim + SteveSkiena + 8373–8383 + 2020.emnlp-main.672 + + + Authorship Attribution for Neural Text Generation + AdakuUchendu + ThaiLe + KaiShu + DongwonLee + 8384–8395 + 2020.emnlp-main.673 + + + <fixed-case>N</fixed-case>w<fixed-case>QM</fixed-case>: A Neural Quality Assessment Framework for <fixed-case>W</fixed-case>ikipedia + Bhanu Prakash ReddyGuda + Sasi BhushanSeelaboyina + SoumyaSarkar + AnimeshMukherjee + 8396–8406 + 2020.emnlp-main.674 + + + Towards Modeling Revision Requirements in wiki<fixed-case>H</fixed-case>ow Instructions + IrshadBhat + TalitaAnthonio + MichaelRoth + 8407–8414 + 2020.emnlp-main.675 + + + Deep Attentive Learning for Stock Movement Prediction from Social Media Text and Company Correlations + RamitSawhney + ShivamAgarwal + ArnavWadhwa + Rajiv RatnShah + 8415–8426 + 2020.emnlp-main.676 + + + Natural Language Processing for Achieving Sustainable Development: The Case of Neural Labelling to Enhance Community Profiling + CostanzaConforti + StephanieHirmer + DaiMorgan + MarcoBasaldella + YauBen Or + 8427–8444 + 2020.emnlp-main.677 + + + To Schedule or Not to Schedule: Extracting Task Specific Temporal Entities and Associated Negation 
Constraints + BarunPatra + ChalaFufa + PamelaBhattacharya + CharlesLee + 8445–8455 + 2020.emnlp-main.678 + + + Competence-Level Prediction and Resume-<fixed-case>J</fixed-case>ob_<fixed-case>D</fixed-case>escription Matching Using Context-Aware Transformer Models + ChangmaoLi + ElaineFisher + RebeccaThomas + StevePittard + VickiHertzberg + Jinho D.Choi + 8456–8466 + 2020.emnlp-main.679 + + + Grammatical Error Correction in Low Error Density Domains: A New Benchmark and Analyses + SimonFlachs + OphélieLacroix + HelenYannakoudakis + MarekRei + AndersSøgaard + 8467–8478 + 2020.emnlp-main.680 + + + Deconstructing Word Embedding Algorithms + KianKenyon-Dean + EdwardNewell + Jackie Chi KitCheung + 8479–8484 + 2020.emnlp-main.681 + + + Sequential Modelling of the Evolution of Word Representations for Semantic Change Detection + AdamTsakalidis + MariaLiakata + 8485–8497 + 2020.emnlp-main.682 + + + Sparsity Makes Sense: Word Sense Disambiguation Using Sparse Contextualized Word Representations + GáborBerend + 8498–8508 + 2020.emnlp-main.683 + + + Exploring Semantic Capacity of Terms + JieHuang + ZilongWang + KevinChang + Wen-meiHwu + JinJunXiong + 8509–8518 + 2020.emnlp-main.684 + + + Learning to Ignore: Long Document Coreference with Bounded Memory Neural Networks + ShubhamToshniwal + SamWiseman + AllysonEttinger + KarenLivescu + KevinGimpel + 8519–8526 + 2020.emnlp-main.685 + + + Revealing the Myth of Higher-Order Inference in Coreference Resolution + LiyanXu + Jinho D.Choi + 8527–8533 + 2020.emnlp-main.686 + + + Pre-training of Mention Representations in Coreference Models + YuvalVarkel + AmirGloberson + 8534–8540 + 2020.emnlp-main.687 + + + Learning Collaborative Agents with Rule Guidance for Knowledge Graph Reasoning + DerenLei + GangrongJiang + XiaotaoGu + KexuanSun + YuningMao + XiangRen + 8541–8547 + 2020.emnlp-main.688 + + + Exploring Contextualized Neural Language Models for Temporal Dependency Parsing + HayleyRoss + JonathonCai + BonanMin + 8548–8553 + 
2020.emnlp-main.689 + + + Systematic Comparison of Neural Architectures and Training Approaches for Open Information Extraction + PatrickHohenecker + FrankMtumbuka + VidKocijan + ThomasLukasiewicz + 8554–8565 + 2020.emnlp-main.690 + + + <fixed-case>S</fixed-case>eq<fixed-case>M</fixed-case>ix: Augmenting Active Sequence Labeling via Sequence Mixup + RongzhiZhang + YueYu + ChaoZhang + 8566–8579 + 2020.emnlp-main.691 + 2020.emnlp-main.691.OptionalSupplementaryMaterial.zip + + + <fixed-case>A</fixed-case>x<fixed-case>C</fixed-case>ell: Automatic Extraction of Results from Machine Learning Papers + MarcinKardas + PiotrCzapla + PontusStenetorp + SebastianRuder + SebastianRiedel + RossTaylor + RobertStojnic + 8580–8594 + 2020.emnlp-main.692 + + + Knowledge-guided Open Attribute Value Extraction with Reinforcement Learning + YeLiu + ShengZhang + RuiSong + SuoFeng + YanghuaXiao + 8595–8604 + 2020.emnlp-main.693 + + + <fixed-case>D</fixed-case>ual<fixed-case>TKB</fixed-case>: A Dual Learning Bridge between Text and Knowledge Base + PierreDognin + IgorMelnyk + InkitPadhi + CiceroNogueira dos Santos + PayelDas + 8605–8616 + 2020.emnlp-main.694 + + + Incremental Neural Coreference Resolution in Constant Memory + PatrickXia + JoãoSedoc + BenjaminVan Durme + 8617–8624 + 2020.emnlp-main.695 + + + Improving Low Compute Language Modeling with In-Domain Embedding Initialisation + CharlesWelch + RadaMihalcea + Jonathan K.Kummerfeld + 8625–8634 + 2020.emnlp-main.696 + 2020.emnlp-main.696.OptionalSupplementaryMaterial.zip + + + <fixed-case>KGLM</fixed-case>: Pretrained Knowledge-Grounded Language Model for Data-to-Text Generation + WenhuChen + YuSu + XifengYan + William YangWang + 8635–8648 + 2020.emnlp-main.697 + + + Pointer: Constrained Text Generation via Insertion-based Generative Pre-training + YizheZhang + GuoyinWang + ChunyuanLi + ZheGan + ChrisBrockett + BillDolan + 8649–8670 + 2020.emnlp-main.698 + + + Unsupervised Text Style Transfer with Masked Language Models + EricMalmi + 
AliakseiSeveryn + SaschaRothe + 8671–8680 + 2020.emnlp-main.699 + + + <fixed-case>PALM</fixed-case>: Pre-training an Autoencoding&autoregressive Language Model for Context-conditioned Generation + BinBi + ChenliangLi + ChenWu + MingYan + WeiWang + SongfangHuang + FeiHuang + LuoSi + 8681–8691 + 2020.emnlp-main.700 + + + Gradient-guided Unsupervised Lexically Constrained Text Generation + LeiSha + 8692–8703 + 2020.emnlp-main.701 + + + <fixed-case>T</fixed-case>ea<fixed-case>F</fixed-case>or<fixed-case>N</fixed-case>: Teacher-Forcing with N-grams + SebastianGoodman + NanDing + RaduSoricut + 8704–8717 + 2020.emnlp-main.702 + + + Experience Grounds Language + YonatanBisk + AriHoltzman + JesseThomason + JacobAndreas + YoshuaBengio + JoyceChai + MirellaLapata + AngelikiLazaridou + JonathanMay + AleksandrNisnevich + NicolasPinto + JosephTurian + 8718–8735 + 2020.emnlp-main.703 + + + Keep <fixed-case>CALM</fixed-case> and Explore: Language Models for Action Generation in Text-based Games + ShunyuYao + RohanRao + MatthewHausknecht + KarthikNarasimhan + 8736–8754 + 2020.emnlp-main.704 + + + <fixed-case>C</fixed-case>ap<fixed-case>WAP</fixed-case>: Captioning with a Purpose + AdamFisch + KentonLee + Ming-WeiChang + JonathanClark + ReginaBarzilay + 8755–8768 + 2020.emnlp-main.705 + + + What Is More Likely to Happen Next? 
Video-and-Language Future Event Prediction + JieLei + LichengYu + TamaraBerg + MohitBansal + 8769–8784 + 2020.emnlp-main.706 + + + <fixed-case>X</fixed-case>-<fixed-case>LXMERT</fixed-case>: Paint, Caption and Answer Questions with Multi-Modal Transformers + JaeminCho + JiasenLu + DustinSchwenk + HannanehHajishirzi + AniruddhaKembhavi + 8785–8805 + 2020.emnlp-main.707 + + + Towards Understanding Sample Variance in Visually Grounded Language Generation: Evaluations and Observations + WanrongZhu + XinWang + PradyumnaNarayana + KazooSone + SugatoBasu + William YangWang + 8806–8811 + 2020.emnlp-main.708 + + + Beyond Instructional Videos: Probing for More Diverse Visual-Textual Grounding on <fixed-case>Y</fixed-case>ou<fixed-case>T</fixed-case>ube + JackHessel + ZhenhaiZhu + BoPang + RaduSoricut + 8812–8822 + 2020.emnlp-main.709 + + + Hierarchical Graph Network for Multi-hop Question Answering + YuweiFang + SiqiSun + ZheGan + RohitPillai + ShuohangWang + JingjingLiu + 8823–8838 + 2020.emnlp-main.710 + + + A Simple Yet Strong Pipeline for <fixed-case>H</fixed-case>otpot<fixed-case>QA</fixed-case> + DirkGroeneveld + TusharKhot + Mausam + AshishSabharwal + 8839–8845 + 2020.emnlp-main.711 + + + Is Multihop <fixed-case>QA</fixed-case> in <fixed-case>D</fixed-case>i<fixed-case>R</fixed-case>e Condition? 
Measuring and Reducing Disconnected Reasoning + HarshTrivedi + NiranjanBalasubramanian + TusharKhot + AshishSabharwal + 8846–8863 + 2020.emnlp-main.712 + + + Unsupervised Question Decomposition for Question Answering + EthanPerez + PatrickLewis + Wen-tauYih + KyunghyunCho + DouweKiela + 8864–8880 + 2020.emnlp-main.713 + + + <fixed-case>SRLGRN</fixed-case>: Semantic Role Labeling Graph Reasoning Network + ChenZheng + ParisaKordjamshidi + 8881–8891 + 2020.emnlp-main.714 + + + <fixed-case>C</fixed-case>ancer<fixed-case>E</fixed-case>mo: A Dataset for Fine-Grained Emotion Detection + TiberiuSosea + CorneliaCaragea + 8892–8904 + 2020.emnlp-main.715 + + + Exploring the Role of Argument Structure in Online Debate Persuasion + JialuLi + EsinDurmus + ClaireCardie + 8905–8912 + 2020.emnlp-main.716 + + + Zero-Shot Stance Detection: A Dataset and Model Using Generalized Topic Representations + EmilyAllaway + KathleenMcKeown + 8913–8931 + 2020.emnlp-main.717 + + + Sentiment Analysis of Tweets Using Heterogeneous Multi-layer Network Representation and Embedding + LoitongbamGyanendro Singh + AnasuaMitra + SanasamRanbir Singh + 8932–8946 + 2020.emnlp-main.718 + + + Introducing Syntactic Structures into Target Opinion Word Extraction with Deep Learning + AmirPouran Ben Veyseh + NasimNouri + FranckDernoncourt + DejingDou + Thien HuuNguyen + 8947–8956 + 2020.emnlp-main.719 + + + Can Emojis Convey Human Emotions? 
A Study to Understand the Association between Emojis and Emotions + Abu Awal MdShoeb + Gerardde Melo + 8957–8967 + 2020.emnlp-main.720 + + + <fixed-case>MIME</fixed-case>: <fixed-case>MIM</fixed-case>icking Emotions for Empathetic Response Generation + NavonilMajumder + PengfeiHong + ShanshanPeng + JiankunLu + DeepanwayGhosal + AlexanderGelbukh + RadaMihalcea + SoujanyaPoria + 8968–8979 + 2020.emnlp-main.721 + + + Exploiting Structured Knowledge in Text via Graph-Guided Representation Learning + TaoShen + YiMao + PengchengHe + GuodongLong + AdamTrischler + WeizhuChen + 8980–8994 + 2020.emnlp-main.722 + + + Named Entity Recognition Only from Word Embeddings + YingLuo + HaiZhao + JunlangZhan + 8995–9005 + 2020.emnlp-main.723 + + + Weakly-Supervised Text Classification Using Label Names Only + YuMeng + YunyiZhang + JiaxinHuang + ChenyanXiong + HengJi + ChaoZhang + JiaweiHan + 9006–9017 + 2020.emnlp-main.724 + + + Neural Topic Modeling with Cycle-Consistent Adversarial Training + XuemengHu + RuiWang + DeyuZhou + YuxuanXiong + 9018–9030 + 2020.emnlp-main.725 + + + Data Boost: Text Data Augmentation through Reinforcement Learning Guided Conditional Generation + RuiboLiu + GuangxuanXu + ChenyanJia + WeichengMa + LiliWang + SoroushVosoughi + 9031–9041 + 2020.emnlp-main.726 + + + A State-independent and Time-evolving Network with Applications to Early Rumor Detection + RuiXia + KaizhouXuan + JianfeiYu + 9042–9051 + 2020.emnlp-main.727 + + + <fixed-case>P</fixed-case>y<fixed-case>MT</fixed-case>5: Multi-mode Translation of Natural Language and Python Code with Transformers + ColinClement + DawnDrain + JonathanTimcheck + AlexeySvyatkovskiy + NeelSundaresan + 9052–9065 + 2020.emnlp-main.728 + + + <fixed-case>P</fixed-case>ath<fixed-case>QG</fixed-case>: Neural Question Generation from Facts + SiyuanWang + ZhongyuWei + ZhihaoFan + ZengfengHuang + WeijianSun + QiZhang + XuanjingHuang + 9066–9075 + 2020.emnlp-main.729 + 2020.emnlp-main.729.OptionalSupplementaryMaterial.zip + + + 
What Time Is It? Temporal Analysis of Novels + AllenKim + CharutaPethe + SteveSkiena + 9076–9086 + 2020.emnlp-main.730 + + + <fixed-case>COGS</fixed-case>: A Compositional Generalization Challenge Based on Semantic Interpretation + NajoungKim + TalLinzen + 9087–9105 + 2020.emnlp-main.731 + + + An Analysis of Natural Language Inference Benchmarks through the Lens of Negation + Md MosharafHossain + VenelinKovatchev + PranoyDutta + TiffanyKao + ElizabethWei + EduardoBlanco + 9106–9118 + 2020.emnlp-main.732 + + + On the Sentence Embeddings from <fixed-case>BERT</fixed-case> for Semantic Textual Similarity + BohanLi + HaoZhou + JunxianHe + MingxuanWang + YimingYang + LeiLi + 9119–9130 + 2020.emnlp-main.733 + + + What Can We Learn from Collective Human Opinions on Natural Language Inference Data? + YixinNie + XiangZhou + MohitBansal + 9131–9143 + 2020.emnlp-main.734 + + + Improving Text Generation with Student-Forcing Optimal Transport + JianqiaoLi + ChunyuanLi + GuoyinWang + HaoFu + YuhchenLin + LiqunChen + YizheZhang + ChenyangTao + RuiyiZhang + WenlinWang + DinghanShen + QianYang + LawrenceCarin + 9144–9156 + 2020.emnlp-main.735 + + + <fixed-case>UNION</fixed-case>: An Unreferenced Metric for Evaluating Open-ended Story Generation + JianGuan + MinlieHuang + 9157–9166 + 2020.emnlp-main.736 + 2020.emnlp-main.736.OptionalSupplementaryMaterial.zip + + + Fˆ2-Softmax: Diversifying Neural Text Generation via Frequency Factorized Softmax + Byung-JuChoi + JiminHong + DavidPark + Sang WanLee + 9167–9182 + 2020.emnlp-main.737 + 2020.emnlp-main.737.OptionalSupplementaryMaterial.zip + + + Partially-Aligned Data-to-Text Generation with Distant Supervision + ZihaoFu + BeiShi + WaiLam + LidongBing + ZhiyuanLiu + 9183–9193 + 2020.emnlp-main.738 + + + Like Hiking? 
You Probably Enjoy Nature: Persona-grounded Dialog with Commonsense Expansions + Bodhisattwa PrasadMajumder + HarshJhamtani + TaylorBerg-Kirkpatrick + JulianMcAuley + 9194–9206 + 2020.emnlp-main.739 + + + A Probabilistic End-To-End Task-Oriented Dialog Model with Latent Belief States towards Semi-Supervised Learning + YichiZhang + ZhijianOu + MinHu + JunlanFeng + 9207–9219 + 2020.emnlp-main.740 + + + The World Is Not Binary: Learning to Rank with Grayscale Data for Dialogue Response Selection + ZiboLin + DengCai + YanWang + XiaojiangLiu + HaitaoZheng + ShumingShi + 9220–9229 + 2020.emnlp-main.741 + + + <fixed-case>GRADE</fixed-case>: Automatic Graph-Enhanced Coherence Metric for Evaluating Open-Domain Dialogue Systems + LishanHuang + ZhengYe + JinghuiQin + LiangLin + XiaodanLiang + 9230–9240 + 2020.emnlp-main.742 + + + <fixed-case>M</fixed-case>ed<fixed-case>D</fixed-case>ialog: A Large-scale Medical Dialogue Dataset + GuangtaoZeng + WenmianYang + ZeqianJu + YueYang + SichengWang + RuisiZhang + MengZhou + JiaqiZeng + XiangyuDong + RuoyuZhang + HongchaoFang + PenghuiZhu + ShuChen + PengtaoXie + 9241–9250 + 2020.emnlp-main.743 + + + An Information Theoretic View on Selecting Linguistic Probes + ZiningZhu + FrankRudzicz + 9251–9262 + 2020.emnlp-main.744 + + + With Little Power Comes Great Responsibility + DallasCard + PeterHenderson + UrvashiKhandelwal + RobinJia + KyleMahowald + DanJurafsky + 9263–9274 + 2020.emnlp-main.745 + 2020.emnlp-main.745.OptionalSupplementaryMaterial.zip + + + Dataset Cartography: Mapping and Diagnosing Datasets with Training Dynamics + SwabhaSwayamdipta + RoySchwartz + NicholasLourie + YizhongWang + HannanehHajishirzi + Noah A.Smith + YejinChoi + 9275–9293 + 2020.emnlp-main.746 + + + Evaluating and Characterizing Human Rationales + SamuelCarton + AnirudhRathore + ChenhaoTan + 9294–9307 + 2020.emnlp-main.747 + + + On Extractive and Abstractive Neural Document Summarization with Transformer Language Models + JonathanPilault + RaymondLi + 
SandeepSubramanian + ChrisPal + 9308–9319 + 2020.emnlp-main.748 + + + Multi-Fact Correction in Abstractive Text Summarization + YueDong + ShuohangWang + ZheGan + YuCheng + Jackie Chi KitCheung + JingjingLiu + 9320–9331 + 2020.emnlp-main.749 + + + Evaluating the Factual Consistency of Abstractive Text Summarization + WojciechKryscinski + BryanMcCann + CaimingXiong + RichardSocher + 9332–9346 + 2020.emnlp-main.750 + + + Re-evaluating Evaluation in Text Summarization + ManikBhandari + Pranav NarayanGour + AtabakAshfaq + PengfeiLiu + GrahamNeubig + 9347–9359 + 2020.emnlp-main.751 + + + <fixed-case>VMSMO</fixed-case>: Learning to Generate Multimodal Summary for Video-based News Articles + MingzheLi + XiuyingChen + ShenGao + ZhangmingChan + DongyanZhao + RuiYan + 9360–9369 + 2020.emnlp-main.752 + +
+ + + Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations + QunLiu + DavidSchlangen + Association for Computational Linguistics +
Online
+ October + 2020 + + + 2020.emnlp-demos.0 + + + <fixed-case>O</fixed-case>pen<fixed-case>UE</fixed-case>: An Open Toolkit of Universal Extraction from Text + NingyuZhang + ShuminDeng + ZhenBi + HaiyangYu + JiachengYang + MoshaChen + FeiHuang + WeiZhang + HuajunChen + 1–8 + Natural language processing covers a wide variety of tasks with token-level or sentence-level understandings. In this paper, we provide a simple insight that most tasks can be represented in a single universal extraction format. We introduce a prototype model and provide an open-source and extensible toolkit called OpenUE for various extraction tasks. OpenUE allows developers to train custom models to extract information from the text and supports quick model validation for researchers. Besides, OpenUE provides various functional modules to maintain sufficient modularity and extensibility. Except for the toolkit, we also deploy an online demo with restful APIs to support real-time extraction without training and deploying. Additionally, the online system can extract information in various tasks, including relational triple extraction, slot & intent detection, event extraction, and so on. We release the source code, datasets, and pre-trained models to promote future researches in http://github.com/zjunlp/openue. + 2020.emnlp-demos.1 + + + <fixed-case>BERT</fixed-case>weet: A pre-trained language model for <fixed-case>E</fixed-case>nglish Tweets + Dat QuocNguyen + ThanhVu + AnhTuan Nguyen + 9–14 + We present BERTweet, the first public large-scale pre-trained language model for English Tweets. Our BERTweet, having the same architecture as BERT-base (Devlin et al., 2019), is trained using the RoBERTa pre-training procedure (Liu et al., 2019). 
Experiments show that BERTweet outperforms strong baselines RoBERTa-base and XLM-R-base (Conneau et al., 2020), producing better performance results than the previous state-of-the-art models on three Tweet NLP tasks: Part-of-speech tagging, Named-entity recognition and text classification. We release BERTweet under the MIT License to facilitate future research and applications on Tweet data. Our BERTweet is available at https://github.com/VinAIResearch/BERTweet + 2020.emnlp-demos.2 + + + <fixed-case>N</fixed-case>eural<fixed-case>QA</fixed-case>: A Usable Library for Question Answering (Contextual Query Expansion + <fixed-case>BERT</fixed-case>) on Large Datasets + VictorDibia + 15–22 + Existing tools for Question Answering (QA) have challenges that limit their use in practice. They can be complex to set up or integrate with existing infrastructure, do not offer configurable interactive interfaces, and do not cover the full set of subtasks that frequently comprise the QA pipeline (query expansion, retrieval, reading, and explanation/sensemaking). To help address these issues, we introduce NeuralQA - a usable library for QA on large datasets. NeuralQA integrates well with existing infrastructure (e.g., ElasticSearch instances and reader models trained with the HuggingFace Transformers API) and offers helpful defaults for QA subtasks. It introduces and implements contextual query expansion (CQE) using a masked language model (MLM) as well as relevant snippets (RelSnip) - a method for condensing large documents into smaller passages that can be speedily processed by a document reader model. Finally, it offers a flexible user interface to support workflows for research explorations (e.g., visualization of gradient-based explanations to support qualitative inspection of model behaviour) and large scale search deployment. Code and documentation for NeuralQA is available as open source on Github. 
+ 2020.emnlp-demos.3 + + + <fixed-case>W</fixed-case>ikipedia2<fixed-case>V</fixed-case>ec: An Efficient Toolkit for Learning and Visualizing the Embeddings of Words and Entities from <fixed-case>W</fixed-case>ikipedia + IkuyaYamada + AkariAsai + JinSakuma + HiroyukiShindo + HideakiTakeda + YoshiyasuTakefuji + YujiMatsumoto + 23–30 + The embeddings of entities in a large knowledge base (e.g., Wikipedia) are highly beneficial for solving various natural language tasks that involve real world knowledge. In this paper, we present Wikipedia2Vec, a Python-based open-source tool for learning the embeddings of words and entities from Wikipedia. The proposed tool enables users to learn the embeddings efficiently by issuing a single command with a Wikipedia dump file as an argument. We also introduce a web-based demonstration of our tool that allows users to visualize and explore the learned embeddings. In our experiments, our tool achieved a state-of-the-art result on the KORE entity relatedness dataset, and competitive results on various standard benchmark datasets. Furthermore, our tool has been used as a key component in various recent studies. We publicize the source code, demonstration, and the pretrained embeddings for 12 languages at https://wikipedia2vec.github.io/. + 2020.emnlp-demos.4 + + + <fixed-case>ARES</fixed-case>: A Reading Comprehension Ensembling Service + AnthonyFerritto + LinPan + RishavChakravarti + SalimRoukos + RaduFlorian + J. WilliamMurdock + AviSil + 31–37 + We introduce ARES (A Reading Comprehension Ensembling Service): a novel Machine Reading Comprehension (MRC) demonstration system which utilizes an ensemble of models to increase F1 by 2.3 points. While many of the top leaderboard submissions in popular MRC benchmarks such as the Stanford Question Answering Dataset (SQuAD) and Natural Questions (NQ) use model ensembles, the accompanying papers do not publish their ensembling strategies. 
In this work, we detail and evaluate various ensembling strategies using the NQ dataset. ARES leverages the CFO (Chakravarti et al., 2019) and ReactJS distributed frameworks to provide a scalable interactive Question Answering experience that capitalizes on the agreement (or lack thereof) between models to improve the answer visualization experience. + 2020.emnlp-demos.5 + + + Transformers: State-of-the-Art Natural Language Processing + ThomasWolf + JulienChaumond + LysandreDebut + VictorSanh + ClementDelangue + AnthonyMoi + PierricCistac + MorganFuntowicz + JoeDavison + SamShleifer + RemiLouf + Patrickvon Platen + TimRault + YacineJernite + TevenLe Scao + SylvainGugger + JulienPlu + ClaraMa + CanweiShen + MariamaDrame + QuentinLhoest + AlexanderRush + 38–45 + Recent progress in natural language processing has been driven by advances in both model architecture and model pretraining. Transformer architectures have facilitated building higher-capacity models and pretraining has made it possible to effectively utilize this capacity for a wide variety of tasks. Transformers is an open-source library with the goal of opening up these advances to the wider machine learning community. The library consists of carefully engineered state-of-the art Transformer architectures under a unified API. Backing this library is a curated collection of pretrained models made by and available for the community. Transformers is designed to be extensible by researchers, simple for practitioners, and fast and robust in industrial deployments. The library is available at https://github.com/huggingface/transformers. 
+ 2020.emnlp-demos.6 + + + <fixed-case>A</fixed-case>dapter<fixed-case>H</fixed-case>ub: A Framework for Adapting Transformers + JonasPfeiffer + AndreasRücklé + CliftonPoth + AishwaryaKamath + IvanVulić + SebastianRuder + KyunghyunCho + IrynaGurevych + 46–54 + The current modus operandi in NLP involves downloading and fine-tuning pre-trained models consisting of millions or billions of parameters. Storing and sharing such large trained models is expensive, slow, and time-consuming, which impedes progress towards more general and versatile NLP methods that learn from and for many tasks. Adapters—small learnt bottleneck layers inserted within each layer of a pre-trained model— ameliorate this issue by avoiding full fine-tuning of the entire model. However, sharing and integrating adapter layers is not straightforward. We propose AdapterHub, a framework that allows dynamic “stiching-in” of pre-trained adapters for different tasks and languages. The framework, built on top of the popular HuggingFace Transformers library, enables extremely easy and quick adaptations of state-of-the-art pre-trained models (e.g., BERT, RoBERTa, XLM-R) across tasks and languages. Downloading, sharing, and training adapters is as seamless as possible using minimal changes to the training scripts and a specialized infrastructure. Our framework enables scalable and easy access to sharing of task-specific models, particularly in low-resource scenarios. AdapterHub includes all recent adapter architectures and can be found at AdapterHub.ml + 2020.emnlp-demos.7 + 2020.emnlp-demos.7.OptionalSupplementaryMaterial.zip + + + <fixed-case>HUMAN</fixed-case>: Hierarchical Universal Modular <fixed-case>AN</fixed-case>notator + MoritzWolf + DanaRuiter + Ashwin GeetD’Sa + LianeReiners + JanAlexandersson + DietrichKlakow + 55–61 + A lot of real-world phenomena are complex and cannot be captured by single task annotations. 
This causes a need for subsequent annotations, with interdependent questions and answers describing the nature of the subject at hand. Even in the case a phenomenon is easily captured by a single task, the high specialisation of most annotation tools can result in having to switch to another tool if the task only slightly changes. We introduce HUMAN, a novel web-based annotation tool that addresses the above problems by a) covering a variety of annotation tasks on both textual and image data, and b) the usage of an internal deterministic state machine, allowing the researcher to chain different annotation tasks in an interdependent manner. Further, the modular nature of the tool makes it easy to define new annotation tasks and integrate machine learning algorithms e.g., for active learning. HUMAN comes with an easy-to-use graphical user interface that simplifies the annotation task and management. + 2020.emnlp-demos.8 + + + <fixed-case>D</fixed-case>eezy<fixed-case>M</fixed-case>atch: A Flexible Deep Learning Approach to Fuzzy String Matching + KasraHosseini + FedericoNanni + MarionaColl Ardanuy + 62–69 + We present DeezyMatch, a free, open-source software library written in Python for fuzzy string matching and candidate ranking. Its pair classifier supports various deep neural network architectures for training new classifiers and for fine-tuning a pretrained model, which paves the way for transfer learning in fuzzy string matching. This approach is especially useful where only limited training examples are available. The learned DeezyMatch models can be used to generate rich vector representations from string inputs. The candidate ranker component in DeezyMatch uses these vector representations to find, for a given query, the best matching candidates in a knowledge base. It uses an adaptive searching algorithm applicable to large knowledge bases and query sets. 
We describe DeezyMatch’s functionality, design and implementation, accompanied by a use case in toponym matching and candidate ranking in realistic noisy datasets. + 2020.emnlp-demos.9 + + + <fixed-case>C</fixed-case>o<fixed-case>S</fixed-case>a<fixed-case>T</fixed-case>a: A Constraint Satisfaction Solver and Interpreted Language for Semi-Structured Tables of Sentences + PeterJansen + 70–76 + This work presents CoSaTa, an intuitive constraint satisfaction solver and interpreted language for knowledge bases of semi-structured tables expressed as text. The stand-alone CoSaTa solver allows easily expressing complex compositional “inference patterns” for how knowledge from different tables tends to connect to support inference and explanation construction in question answering and other downstream tasks, while including advanced declarative features and the ability to operate over multiple representations of text (words, lemmas, or part-of-speech tags). CoSaTa also includes a hybrid imperative/declarative interpreted language for expressing simple models through minimally-specified simulations grounded in constraint patterns, helping bridge the gap between question answering, question explanation, and model simulation. The solver and interpreter are released as open source. Screencast Demo: https://youtu.be/t93Acsz7LyE + 2020.emnlp-demos.10 + + + <fixed-case>I</fixed-case>n<fixed-case>V</fixed-case>e<fixed-case>R</fixed-case>o: Making Semantic Role Labeling Accessible with Intelligible Verbs and Roles + SimoneConia + FabrizioBrignone + DavideZanfardino + RobertoNavigli + 77–84 + Semantic Role Labeling (SRL) is deeply dependent on complex linguistic resources and sophisticated neural models, which makes the task difficult to approach for non-experts. To address this issue we present a new platform named Intelligible Verbs and Roles (InVeRo). 
This platform provides access to a new verb resource, VerbAtlas, and a state-of-the-art pretrained implementation of a neural, span-based architecture for SRL. Both the resource and the system provide human-readable verb sense and semantic role information, with an easy to use Web interface and RESTful APIs available at http://nlp.uniroma1.it/invero. + 2020.emnlp-demos.11 + + + Youling: an <fixed-case>AI</fixed-case>-assisted Lyrics Creation System + RongshengZhang + XiaoxiMao + LeLi + LinJiang + LinChen + ZhiweiHu + YadongXi + ChangjieFan + MinlieHuang + 85–91 + Recently, a variety of neural models have been proposed for lyrics generation. However, most previous work completes the generation process in a single pass with little human intervention. We believe that lyrics creation is a creative process with human intelligence centered. AI should play a role as an assistant in the lyrics creation process, where human interactions are crucial for high-quality creation. This paper demonstrates Youling, an AI-assisted lyrics creation system, designed to collaborate with music creators. In the lyrics generation process, Youling supports traditional one pass full-text generation mode as well as an interactive generation mode, which allows users to select the satisfactory sentences from generated candidates conditioned on preceding context. The system also provides a revision module which enables users to revise undesired sentences or words of lyrics repeatedly. Besides, Youling allows users to use multifaceted attributes to control the content and format of generated lyrics. The demo video of the system is available at https://youtu.be/DFeNpHk0pm4. + 2020.emnlp-demos.12 + + + A Technical Question Answering System with Transfer Learning + WenhaoYu + LingfeiWu + YuDeng + RuchiMahindru + QingkaiZeng + SinemGuven + MengJiang + 92–99 + In recent years, the need for community technical question-answering sites has increased significantly. 
However, it is often expensive for human experts to provide timely and helpful responses on those forums. We develop TransTQA, which is a novel system that offers automatic responses by retrieving proper answers based on correctly answered similar questions in the past. TransTQA is built upon a siamese ALBERT network, which enables it to respond quickly and accurately. Furthermore, TransTQA adopts a standard deep transfer learning strategy to improve its capability of supporting multiple technical domains. + 2020.emnlp-demos.13 + 2020.emnlp-demos.13.OptionalSupplementaryMaterial.zip + + + <fixed-case>ENTYFI</fixed-case>: A System for Fine-grained Entity Typing in Fictional Texts + Cuong XuanChu + SimonRazniewski + GerhardWeikum + 100–106 + Fiction and fantasy are archetypes of long-tail domains that lack suitable NLP methodologies and tools. We present ENTYFI, a web-based system for fine-grained typing of entity mentions in fictional texts. It builds on 205 automatically induced high-quality type systems for popular fictional domains, and provides recommendations towards reference type systems for given input texts. Users can exploit the richness and diversity of these reference type systems for fine-grained supervised typing, in addition, they can choose among and combine four other typing modules: pre-trained real-world models, unsupervised dependency-based typing, knowledge base lookups, and constraint-based candidate consolidation. The demonstrator is available at: https://d5demos.mpi-inf.mpg.de/entyfi. + 2020.emnlp-demos.14 + + + The Language Interpretability Tool: Extensible, Interactive Visualizations and Analysis for <fixed-case>NLP</fixed-case> Models + IanTenney + JamesWexler + JasmijnBastings + TolgaBolukbasi + AndyCoenen + SebastianGehrmann + EllenJiang + MahimaPushkarna + CareyRadebaugh + EmilyReif + AnnYuan + 107–118 + We present the Language Interpretability Tool (LIT), an open-source platform for visualization and understanding of NLP models. 
We focus on core questions about model behavior: Why did my model make this prediction? When does it perform poorly? What happens under a controlled change in the input? LIT integrates local explanations, aggregate analysis, and counterfactual generation into a streamlined, browser-based interface to enable rapid exploration and error analysis. We include case studies for a diverse set of workflows, including exploring counterfactuals for sentiment analysis, measuring gender bias in coreference systems, and exploring local behavior in text generation. LIT supports a wide range of models—including classification, seq2seq, and structured prediction—and is highly extensible through a declarative, framework-agnostic API. LIT is under active development, with code and full documentation available at https://github.com/pair-code/lit. + 2020.emnlp-demos.15 + + + <fixed-case>T</fixed-case>ext<fixed-case>A</fixed-case>ttack: A Framework for Adversarial Attacks, Data Augmentation, and Adversarial Training in <fixed-case>NLP</fixed-case> + JohnMorris + EliLifland + Jin YongYoo + JakeGrigsby + DiJin + YanjunQi + 119–126 + While there has been substantial research using adversarial attacks to analyze NLP models, each attack is implemented in its own code repository. It remains challenging to develop NLP attacks and utilize them to improve model performance. This paper introduces TextAttack, a Python framework for adversarial attacks, data augmentation, and adversarial training in NLP. TextAttack builds attacks from four components: a goal function, a set of constraints, a transformation, and a search method. TextAttack’s modular design enables researchers to easily construct attacks from combinations of novel and existing components. TextAttack provides implementations of 16 adversarial attacks from the literature and supports a variety of models and datasets, including BERT and other transformers, and all GLUE tasks. 
TextAttack also includes data augmentation and adversarial training modules for using components of adversarial attacks to improve model accuracy and robustness.TextAttack is democratizing NLP: anyone can try data augmentation and adversarial training on any model or dataset, with just a few lines of code. Code and tutorials are available at https://github.com/QData/TextAttack. + 2020.emnlp-demos.16 + 2020.emnlp-demos.16.OptionalSupplementaryMaterial.zip + + + Easy, Reproducible and Quality-Controlled Data Collection with <fixed-case>CROWDAQ</fixed-case> + QiangNing + HaoWu + PradeepDasigi + DheeruDua + MattGardner + Robert L.Logan IV + AnaMarasović + ZhenNie + 127–134 + High-quality and large-scale data are key to success for AI systems. However, large-scale data annotation efforts are often confronted with a set of common challenges: (1) designing a user-friendly annotation interface; (2) training enough annotators efficiently; and (3) reproducibility. To address these problems, we introduce CROWDAQ, an open-source platform that standardizes the data collection pipeline with customizable user-interface components, automated annotator qualification, and saved pipelines in a re-usable format. We show that CROWDAQ simplifies data annotation significantly on a diverse set of data collection use cases and we hope it will be a convenient tool for the community. + 2020.emnlp-demos.17 + 2020.emnlp-demos.17.OptionalSupplementaryMaterial.pdf + + + <fixed-case>S</fixed-case>ci<fixed-case>S</fixed-case>ight: Combining faceted navigation and research group detection for <fixed-case>COVID</fixed-case>-19 exploratory scientific search + TomHope + JasonPortenoy + KishoreVasan + JonathanBorchardt + EricHorvitz + DanielWeld + MartiHearst + JevinWest + 135–143 + The COVID-19 pandemic has sparked unprecedented mobilization of scientists, generating a deluge of papers that makes it hard for researchers to keep track and explore new directions. 
Search engines are designed for targeted queries, not for discovery of connections across a corpus. In this paper, we present SciSight, a system for exploratory search of COVID-19 research integrating two key capabilities: first, exploring associations between biomedical facets automatically extracted from papers (e.g., genes, drugs, diseases, patient outcomes); second, combining textual and network information to search and visualize groups of researchers and their ties. SciSight has so far served over 15K users with over 42K page views and 13% returns. + 2020.emnlp-demos.18 + + + <fixed-case>SIMULEVAL</fixed-case>: An Evaluation Toolkit for Simultaneous Translation + XutaiMa + Mohammad JavadDousti + ChanghanWang + JiataoGu + JuanPino + 144–150 + Simultaneous translation on both text and speech focuses on a real-time and low-latency scenario where the model starts translating before reading the complete source input. Evaluating simultaneous translation models is more complex than offline models because the latency is another factor to consider in addition to translation quality. The research community, despite its growing focus on novel modeling approaches to simultaneous translation, currently lacks a universal evaluation procedure. Therefore, we present SimulEval, an easy-to-use and general evaluation toolkit for both simultaneous text and speech translation. A server-client scheme is introduced to create a simultaneous translation scenario, where the server sends source input and receives predictions for evaluation and the client executes customized policies. Given a policy, it automatically performs simultaneous decoding and collectively reports several popular latency metrics. We also adapt latency metrics from text simultaneous translation to the speech task. Additionally, SimulEval is equipped with a visualization interface to provide better understanding of the simultaneous decoding process of a system. 
SimulEval has already been extensively used for the IWSLT 2020 shared task on simultaneous speech translation. Code will be released upon publication. + 2020.emnlp-demos.19 + + + Agent Assist through Conversation Analysis + KshitijFadnis + NathanielMills + JatinGanhotra + HaggaiRoitman + GauravPandey + DoronCohen + YosiMass + ShaiErera + ChulakaGunasekara + DanishContractor + SivaPatel + Q. VeraLiao + SachindraJoshi + LuisLastras + DavidKonopnicki + 151–157 + Customer support agents play a crucial role as an interface between an organization and its end-users. We propose CAIRAA: Conversational Approach to Information Retrieval for Agent Assistance, to reduce the cognitive workload of support agents who engage with users through conversation systems. CAIRAA monitors an evolving conversation and recommends both responses and URLs of documents the agent can use in replies to their client. We combine traditional information retrieval (IR) approaches with more recent Deep Learning (DL) models to ensure high accuracy and efficient run-time performance in the deployed system. Here, we describe the CAIRAA system and demonstrate its effectiveness in a pilot study via a short video. + 2020.emnlp-demos.20 + + + <fixed-case>N</fixed-case>eu<fixed-case>S</fixed-case>pell: A Neural Spelling Correction Toolkit + Sai MuralidharJayanthi + DanishPruthi + GrahamNeubig + 158–164 + We introduce NeuSpell, an open-source toolkit for spelling correction in English. Our toolkit comprises ten different models, and benchmarks them on naturally occurring misspellings from multiple sources. We find that many systems do not adequately leverage the context around the misspelt token. To remedy this, (i) we train neural models using spelling errors in context, synthetically constructed by reverse engineering isolated misspellings; and (ii) use richer representations of the context. 
By training on our synthetic examples, correction rates improve by 9% (absolute) compared to the case when models are trained on randomly sampled character perturbations. Using richer contextual representations boosts the correction rate by another 3%. Our toolkit enables practitioners to use our proposed and existing spelling correction systems, both via a simple unified command line, as well as a web interface. Among many potential applications, we demonstrate the utility of our spell-checkers in combating adversarial misspellings. The toolkit can be accessed at neuspell.github.io. + 2020.emnlp-demos.21 + + + <fixed-case>L</fixed-case>ib<fixed-case>KGE</fixed-case> - A knowledge graph embedding library for reproducible research + SamuelBroscheit + DanielRuffinelli + AdrianKochsiek + PatrickBetz + RainerGemulla + 165–174 + LibKGE ( https://github.com/uma-pi1/kge ) is an open-source PyTorch-based library for training, hyperparameter optimization, and evaluation of knowledge graph embedding models for link prediction. The key goals of LibKGE are to enable reproducible research, to provide a framework for comprehensive experimental studies, and to facilitate analyzing the contributions of individual components of training methods, model architectures, and evaluation methods. LibKGE is highly configurable and every experiment can be fully reproduced with a single configuration file. Individual components are decoupled to the extent possible so that they can be mixed and matched with each other. Implementations in LibKGE aim to be as efficient as possible without leaving the scope of Python/Numpy/PyTorch. A comprehensive logging mechanism and tooling facilitates in-depth analysis. LibKGE provides implementations of common knowledge graph embedding models and training methods, and new ones can be easily added. 
A comparative study (Ruffinelli et al., 2020) showed that LibKGE reaches competitive to state-of-the-art performance for many models with a modest amount of automatic hyperparameter tuning. + 2020.emnlp-demos.22 + + + <fixed-case>W</fixed-case>ant<fixed-case>W</fixed-case>ords: An Open-source Online Reverse Dictionary System + FanchaoQi + LeiZhang + YanhuiYang + ZhiyuanLiu + MaosongSun + 175–181 + A reverse dictionary takes descriptions of words as input and outputs words semantically matching the input descriptions. Reverse dictionaries have great practical value such as solving the tip-of-the-tongue problem and helping new language learners. There have been some online reverse dictionary systems, but they support English reverse dictionary queries only and their performance is far from perfect. In this paper, we present a new open-source online reverse dictionary system named WantWords (https://wantwords.thunlp.org/). It not only significantly outperforms other reverse dictionary systems on English reverse dictionary performance, but also supports Chinese and English-Chinese as well as Chinese-English cross-lingual reverse dictionary queries for the first time. Moreover, it has user-friendly front-end design which can help users find the words they need quickly and easily. All the code and data are available at https://github.com/thunlp/WantWords. + 2020.emnlp-demos.23 + + + <fixed-case>BENNERD</fixed-case>: A Neural Named Entity Linking System for <fixed-case>COVID</fixed-case>-19 + Mohammad GolamSohrab + KhoaDuong + MakotoMiwa + GoranTopić + IkedaMasami + TakamuraHiroya + 182–188 + We present a biomedical entity linking (EL) system BENNERD that detects named entities in text and links them to the unified medical language system (UMLS) knowledge base (KB) entries to facilitate the corona virus disease 2019 (COVID-19) research. 
BENNERD mainly covers biomedical domain, especially new entity types (e.g., coronavirus, viral proteins, immune responses) by addressing CORD-NER dataset. It includes several NLP tools to process biomedical texts including tokenization, flat and nested entity recognition, and candidate generation and ranking for EL that have been pre-trained using the CORD-NER corpus. To the best of our knowledge, this is the first attempt that addresses NER and EL on COVID-19-related entities, such as COVID-19 virus, potential vaccines, and spreading mechanism, that may benefit research on COVID-19. We release an online system to enable real-time entity annotation with linking for end users. We also release the manually annotated test set and CORD-NERD dataset for leveraging EL task. The BENNERD system is available at https://aistairc.github.io/BENNERD/. + 2020.emnlp-demos.24 + + + <fixed-case>R</fixed-case>o<fixed-case>FT</fixed-case>: A Tool for Evaluating Human Detection of Machine-Generated Text + LiamDugan + DaphneIppolito + ArunKirubarajan + ChrisCallison-Burch + 189–196 + In recent years, large neural networks for natural language generation (NLG) have made leaps and bounds in their ability to generate fluent text. However, the tasks of evaluating quality differences between NLG systems and understanding how humans perceive the generated text remain both crucial and difficult. In this system demonstration, we present Real or Fake Text (RoFT), a website that tackles both of these challenges by inviting users to try their hand at detecting machine-generated text in a variety of domains. We introduce a novel evaluation task based on detecting the boundary at which a text passage that starts off human-written transitions to being machine-generated. We show preliminary results of using RoFT to evaluate detection of machine-generated news articles. 
+ 2020.emnlp-demos.25 + + + A Data-Centric Framework for Composable <fixed-case>NLP</fixed-case> Workflows + ZhengzhongLiu + GuanxiongDing + AvinashBukkittu + MansiGupta + PengzhiGao + AtifAhmed + ShikunZhang + XinGao + SwapnilSinghavi + LinweiLi + WeiWei + ZecongHu + HaoranShi + XiaodanLiang + TerukoMitamura + EricXing + ZhitingHu + 197–204 + Empirical natural language processing (NLP) systems in application domains (e.g., healthcare, finance, education) involve interoperation among multiple components, ranging from data ingestion, human annotation, to text retrieval, analysis, generation, and visualization. We establish a unified open-source framework to support fast development of such sophisticated NLP workflows in a composable manner. The framework introduces a uniform data representation to encode heterogeneous results by a wide range of NLP tasks. It offers a large repository of processors for NLP tasks, visualization, and annotation, which can be easily assembled with full interoperability under the unified representation. The highly extensible framework allows plugging in custom processors from external off-the-shelf NLP and deep learning libraries. The whole framework is delivered through two modularized yet integratable open-source projects, namely Forte (for workflow infrastructure and NLP function processors) and Stave (for user interaction, visualization, and annotation). + 2020.emnlp-demos.26 + + + <fixed-case>C</fixed-case>o<fixed-case>R</fixed-case>efi: A Crowd Sourcing Suite for Coreference Annotation + AriBornstein + ArieCattan + IdoDagan + 205–215 + Coreference annotation is an important, yet expensive and time consuming, task, which often involved expert annotators trained on complex decision guidelines. To enable cheaper and more efficient annotation, we present CoRefi, a web-based coreference annotation suite, oriented for crowdsourcing. 
Beyond the core coreference annotation tool, CoRefi provides guided onboarding for the task as well as a novel algorithm for a reviewing phase. CoRefi is open source and directly embeds into any website, including popular crowdsourcing platforms. CoRefi Demo: aka.ms/corefi Video Tour: aka.ms/corefivideo Github Repo: https://github.com/aribornstein/corefi + 2020.emnlp-demos.27 + + + Langsmith: An Interactive Academic Text Revision System + TakumiIto + TatsukiKuribayashi + MasatoshiHidaka + JunSuzuki + KentaroInui + 216–226 + Despite the current diversity and inclusion initiatives in the academic community, researchers with a non-native command of English still face significant obstacles when writing papers in English. This paper presents the Langsmith editor, which assists inexperienced, non-native researchers to write English papers, especially in the natural language processing (NLP) field. Our system can suggest fluent, academic-style sentences to writers based on their rough, incomplete phrases or sentences. The system also encourages interaction between human writers and the computerized revision system. The experimental results demonstrated that Langsmith helps non-native English-speaker students write papers in English. The system is available at https://emnlp-demo.editor.langsmith.co.jp/. + 2020.emnlp-demos.28 + + + <fixed-case>I</fixed-case>s<fixed-case>OBS</fixed-case>: An Information System for Oracle Bone Script + XuHan + YuzhuoBai + KeyueQiu + ZhiyuanLiu + MaosongSun + 227–233 + Oracle bone script (OBS) is the earliest known ancient Chinese writing system and the ancestor of modern Chinese. As the Chinese writing system is the oldest continuously-used system in the world, the study of OBS plays an important role in both linguistic and historical research. 
In order to utilize advanced machine learning methods to automatically process OBS, we construct an information system for OBS (IsOBS) to symbolize, serialize, and store OBS data at the character-level, based on efficient databases and retrieval modules. Moreover, we also apply few-shot learning methods to build an effective OBS character recognition module, which can recognize a large number of OBS characters (especially those characters with a handful of examples) and make the system easy to use. The demo system of IsOBS can be found from http://isobs.thunlp.org/. In the future, we will add more OBS data to the system, and hopefully our IsOBS can support further efforts in automatically processing OBS and advance the scientific progress in this field. + 2020.emnlp-demos.29 + +
+ + + Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: Tutorial Abstracts + AlineVillavicencio + BenjaminVan Durme + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.emnlp-tutorials.0 + + + Machine Reasoning: Technology, Dilemma and Future + NanDuan + DuyuTang + MingZhou + 1–6 + Machine reasoning research aims to build interpretable AI systems that can solve problems or draw conclusions from what they are told (i.e. facts and observations) and already know (i.e. models, common sense and knowledge) under certain constraints. In this tutorial, we will (1) describe the motivation of this tutorial and give our definition on machine reasoning; (2) introduce typical machine reasoning frameworks, including symbolic reasoning, probabilistic reasoning, neural-symbolic reasoning and neural-evidence reasoning, and show their successful applications in real-world scenarios; (3) talk about the dilemma between black-box neural networks with state-of-the-art performance and machine reasoning approaches with better interpretability; (4) summarize the content of this tutorial and discuss possible future directions. + 2020.emnlp-tutorials.1 + + + Fact-Checking, Fake News, Propaganda, and Media Bias: Truth Seeking in the Post-Truth Era + PreslavNakov + GiovanniDa San Martino + 7–19 + The rise of social media has democratized content creation and has made it easy for everybody to share and spread information online. On the positive side, this has given rise to citizen journalism, thus enabling much faster dissemination of information compared to what was possible with newspapers, radio, and TV. On the negative side, stripping traditional media from their gate-keeping role has left the public unprotected against the spread of misinformation, which could now travel at breaking-news speed over the same democratic channel. This has given rise to the proliferation of false information specifically created to affect individual people’s beliefs, and ultimately to influence major events such as political elections. 
There are strong indications that false information was weaponized at an unprecedented scale during Brexit and the 2016 U.S. presidential elections. “Fake news,” which can be defined as fabricated information that mimics news media content in form but not in organizational process or intent, became the Word of the Year for 2017, according to Collins Dictionary. Thus, limiting the spread of “fake news” and its impact has become a major focus for computer scientists, journalists, social media companies, and regulatory authorities. The tutorial will offer an overview of the broad and emerging research area of disinformation, with focus on the latest developments and research directions. + 2020.emnlp-tutorials.2 + + + Interpreting Predictions of <fixed-case>NLP</fixed-case> Models + EricWallace + MattGardner + SameerSingh + 20–23 + Although neural NLP models are highly expressive and empirically successful, they also systematically fail in counterintuitive ways and are opaque in their decision-making process. This tutorial will provide a background on interpretation techniques, i.e., methods for explaining the predictions of NLP models. We will first situate example-specific interpretations in the context of other ways to understand models (e.g., probing, dataset analyses). Next, we will present a thorough study of example-specific interpretations, including saliency maps, input perturbations (e.g., LIME, input reduction), adversarial attacks, and influence functions. Alongside these descriptions, we will walk through source code that creates and visualizes interpretations for a diverse set of NLP tasks. Finally, we will discuss open problems in the field, e.g., evaluating, extending, and improving interpretation methods. 
+ 2020.emnlp-tutorials.3 + + + High Performance Natural Language Processing + GabrielIlharco + CesarIlharco + IuliaTurc + TimDettmers + FelipeFerreira + KentonLee + 24–27 + Scale has played a central role in the rapid progress natural language processing has enjoyed in recent years. While benchmarks are dominated by ever larger models, efficient hardware use is critical for their widespread adoption and further progress in the field. In this cutting-edge tutorial, we will recapitulate the state-of-the-art in natural language processing with scale in perspective. After establishing these foundations, we will cover a wide range of techniques for improving efficiency, including knowledge distillation, quantization, pruning, more efficient architectures, along with case studies and practical implementation tricks. + 2020.emnlp-tutorials.4 + + + Representation, Learning and Reasoning on Spatial Language for Downstream <fixed-case>NLP</fixed-case> Tasks + ParisaKordjamshidi + JamesPustejovsky + Marie-FrancineMoens + 28–33 + Understanding spatial semantics expressed in natural language can become highly complex in real-world applications. This includes applications of language grounding, navigation, visual question answering, and more generic human-machine interaction and dialogue systems. In many of such downstream tasks, explicit representation of spatial concepts and relationships can improve the capabilities of machine learning models in reasoning and deep language understanding. In this tutorial, we overview the cutting-edge research results and existing challenges related to spatial language understanding including semantic annotations, existing corpora, symbolic and sub-symbolic representations, qualitative spatial reasoning, spatial common sense, deep and structured learning models. We discuss the recent results on the above-mentioned applications – that need spatial language learning and reasoning – and highlight the research gaps and future directions. 
+ 2020.emnlp-tutorials.5 + + + Simultaneous Translation + LiangHuang + ColinCherry + MingboMa + NaveenArivazhagan + ZhongjunHe + 34–36 + Simultaneous translation, which performs translation concurrently with the source speech, is widely useful in many scenarios such as international conferences, negotiations, press releases, legal proceedings, and medicine. This problem has long been considered one of the hardest problems in AI and one of its holy grails. Recently, with rapid improvements in machine translation, speech recognition, and speech synthesis, there has been exciting progress towards simultaneous translation. This tutorial will focus on the design and evaluation of policies for simultaneous translation, to leave attendees with a deep technical understanding of the history, the recent advances, and the remaining challenges in this field. + 2020.emnlp-tutorials.6 + + + The Amazing World of Neural Language Generation + YangfengJi + AntoineBosselut + ThomasWolf + AsliCelikyilmaz + 37–42 + Neural Language Generation (NLG) – using neural network models to generate coherent text – is among the most promising methods for automated text creation. Recent years have seen a paradigm shift in neural text generation, caused by the advances in deep contextual language modeling (e.g., LSTMs, GPT, GPT2) and transfer learning (e.g., ELMo, BERT). While these tools have dramatically improved the state of NLG, particularly for low resources tasks, state-of-the-art NLG models still face many challenges: a lack of diversity in generated text, commonsense violations in depicted situations, difficulties in making use of factual information, and difficulties in designing reliable evaluation metrics. In this tutorial, we will present an overview of the current state-of-the-art in neural network architectures, and how they shaped recent research directions in text generation. 
We will discuss how and why these models succeed/fail at generating coherent text, and provide insights on several applications. + 2020.emnlp-tutorials.7 + +
+
diff --git a/data/xml/2020.eval4nlp.xml b/data/xml/2020.eval4nlp.xml new file mode 100644 index 0000000000..d18d9e3e71 --- /dev/null +++ b/data/xml/2020.eval4nlp.xml @@ -0,0 +1,181 @@ + + + + + Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems + SteffenEger + YangGao + MaximePeyrard + WeiZhao + EduardHovy + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.eval4nlp-1.0 + + + Truth or Error? Towards systematic analysis of factual errors in abstractive summaries + Klaus-MichaelLux + MayaSappelli + MarthaLarson + 1–10 + This paper presents a typology of errors produced by automatic summarization systems. The typology was created by manually analyzing the output of four recent neural summarization systems. Our work is motivated by the growing awareness of the need for better summary evaluation methods that go beyond conventional overlap-based metrics. Our typology is structured into two dimensions. First, the Mapping Dimension describes surface-level errors and provides insight into word-sequence transformation issues. Second, the Meaning Dimension describes issues related to interpretation and provides insight into breakdowns in truth, i.e., factual faithfulness to the original text. Comparative analysis revealed that two neural summarization systems leveraging pre-trained models have an advantage in decreasing grammaticality errors, but not necessarily factual errors. We also discuss the importance of ensuring that summary length and abstractiveness do not interfere with evaluating summary quality. + 2020.eval4nlp-1.1 + + + Fill in the <fixed-case>BLANC</fixed-case>: Human-free quality estimation of document summaries + OlegVasilyev + VedantDharnidharka + JohnBohannon + 11–20 + We present BLANC, a new approach to the automatic estimation of document summary quality. Our goal is to measure the functional performance of a summary with an objective, reproducible, and fully automated method. Our approach achieves this by measuring the performance boost gained by a pre-trained language model with access to a document summary while carrying out its language understanding task on the document’s text. We present evidence that BLANC scores have as good correlation with human evaluations as do the ROUGE family of summary quality measurements. 
And unlike ROUGE, the BLANC method does not require human-written reference summaries, allowing for fully human-free summary quality estimation. + 2020.eval4nlp-1.2 + + + Item Response Theory for Efficient Human Evaluation of Chatbots + JoãoSedoc + LyleUngar + 21–33 + Conversational agent quality is currently assessed using human evaluation, and often requires an exorbitant number of comparisons to achieve statistical significance. In this paper, we introduce Item Response Theory (IRT) for chatbot evaluation, using a paired comparison in which annotators judge which system responds better to the next turn of a conversation. IRT is widely used in educational testing for simultaneously assessing the ability of test takers and the quality of test questions. It is similarly well suited for chatbot evaluation since it allows the assessment of both models and the prompts used to evaluate them. We use IRT to efficiently assess chatbots, and show that different examples from the evaluation set are better suited for comparing high-quality (nearer to human performance) than low-quality systems. Finally, we use IRT to reduce the number of evaluation examples assessed by human annotators while retaining discriminative power. + 2020.eval4nlp-1.3 + 2020.eval4nlp-1.3.OptionalSupplementaryMaterial.pdf + + + <fixed-case>V</fixed-case>i<fixed-case>LBERTS</fixed-case>core: Evaluating Image Caption Using Vision-and-Language <fixed-case>BERT</fixed-case> + HwanheeLee + SeunghyunYoon + FranckDernoncourt + Doo SoonKim + TrungBui + KyominJung + 34–39 + In this paper, we propose an evaluation metric for image captioning systems using both image and text information. Unlike the previous methods that rely on textual representations in evaluating the caption, our approach uses visiolinguistic representations. The proposed method generates image-conditioned embeddings for each token using ViLBERT from both generated and reference texts. 
Then, these contextual embeddings from each of the two sentence-pair are compared to compute the similarity score. Experimental results on three benchmark datasets show that our method correlates significantly better with human judgments than all existing metrics. + 2020.eval4nlp-1.4 + + + <fixed-case>BLEU</fixed-case> Neighbors: A Reference-less Approach to Automatic Evaluation + KawinEthayarajh + DorsaSadigh + 40–50 + Evaluation is a bottleneck in the development of natural language generation (NLG) models. Automatic metrics such as BLEU rely on references, but for tasks such as open-ended generation, there are no references to draw upon. Although language diversity can be estimated using statistical measures such as perplexity, measuring language quality requires human evaluation. However, because human evaluation at scale is slow and expensive, it is used sparingly; it cannot be used to rapidly iterate on NLG models, in the way BLEU is used for machine translation. To this end, we propose BLEU Neighbors, a nearest neighbors model for estimating language quality by using the BLEU score as a kernel function. On existing datasets for chitchat dialogue and open-ended sentence generation, we find that – on average – the quality estimation from a BLEU Neighbors model has a lower mean squared error and higher Spearman correlation with the ground truth than individual human annotators. Despite its simplicity, BLEU Neighbors even outperforms state-of-the-art models on automatically grading essays, including models that have access to a gold-standard reference essay. + 2020.eval4nlp-1.5 + + + Improving Text Generation Evaluation with Batch Centering and Tempered Word Mover Distance + XiChen + NanDing + TomerLevinboim + RaduSoricut + 51–59 + Recent advances in automatic evaluation metrics for text have shown that deep contextualized word representations, such as those generated by BERT encoders, are helpful for designing metrics that correlate well with human judgements. 
At the same time, it has been argued that contextualized word representations exhibit sub-optimal statistical properties for encoding the true similarity between words or sentences. In this paper, we present two techniques for improving encoding representations for similarity metrics: a batch-mean centering strategy that improves statistical properties; and a computationally efficient tempered Word Mover Distance, for better fusion of the information in the contextualized word representations. We conduct numerical experiments that demonstrate the robustness of our techniques, reporting results over various BERT-backbone learned metrics and achieving state of the art correlation with human ratings on several benchmarks. + 2020.eval4nlp-1.6 + 2020.eval4nlp-1.6.OptionalSupplementaryMaterial.zip + + + On the Evaluation of Machine Translation n-best Lists + JacobBremerman + HudaKhayrallah + DouglasOard + MattPost + 60–68 + The standard machine translation evaluation framework measures the single-best output of machine translation systems. There are, however, many situations where n-best lists are needed, yet there is no established way of evaluating them. This paper establishes a framework for addressing n-best evaluation by outlining three different questions one could consider when determining how one would define a ‘good’ n-best list and proposing evaluation measures for each question. The first and principal contribution is an evaluation measure that characterizes the translation quality of an entire n-best list by asking whether many of the valid translations are placed near the top of the list. The second is a measure that uses gold translations with preference annotations to ask to what degree systems can produce ranked lists in preference order. The third is a measure that rewards partial matches, evaluating the closeness of the many items in an n-best list to a set of many valid references. 
These three perspectives make clear that having access to many references can be useful when n-best evaluation is the goal. + 2020.eval4nlp-1.7 + + + Artemis: A Novel Annotation Methodology for Indicative Single Document Summarization + RahulJha + KepingBi + YangLi + MahdiPakdaman + AsliCelikyilmaz + IvanZhiboedov + KieranMcDonald + 69–78 + We describe Artemis (Annotation methodology for Rich, Tractable, Extractive, Multi-domain, Indicative Summarization), a novel hierarchical annotation process that produces indicative summaries for documents from multiple domains. Current summarization evaluation datasets are single-domain and focused on a few domains for which naturally occurring summaries can be easily found, such as news and scientific articles. These are not sufficient for training and evaluation of summarization models for use in document management and information retrieval systems, which need to deal with documents from multiple domains. Compared to other annotation methods such as Relative Utility and Pyramid, Artemis is more tractable because judges don’t need to look at all the sentences in a document when making an importance judgment for one of the sentences, while providing similarly rich sentence importance annotations. We describe the annotation process in detail and compare it with other similar evaluation systems. We also present analysis and experimental results over a sample set of 532 annotated documents. + 2020.eval4nlp-1.8 + + + Probabilistic Extension of Precision, Recall, and F1 Score for More Thorough Evaluation of Classification Models + RedaYacouby + DustinAxman + 79–91 + In pursuit of the perfect supervised NLP classifier, razor thin margins and low-resource test sets can make modeling decisions difficult. Popular metrics such as Accuracy, Precision, and Recall are often insufficient as they fail to give a complete picture of the model’s behavior. 
We present a probabilistic extension of Precision, Recall, and F1 score, which we refer to as confidence-Precision (cPrecision), confidence-Recall (cRecall), and confidence-F1 (cF1) respectively. The proposed metrics address some of the challenges faced when evaluating large-scale NLP systems, specifically when the model’s confidence score assignments have an impact on the system’s behavior. We describe four key benefits of our proposed metrics as compared to their threshold-based counterparts. Two of these benefits, which we refer to as robustness to missing values and sensitivity to model confidence score assignments are self-evident from the metrics’ definitions; the remaining benefits, generalization, and functional consistency are demonstrated empirically. + 2020.eval4nlp-1.9 + + + A survey on Recognizing Textual Entailment as an <fixed-case>NLP</fixed-case> Evaluation + AdamPoliak + 92–109 + Recognizing Textual Entailment (RTE) was proposed as a unified evaluation framework to compare semantic understanding of different NLP systems. In this survey paper, we provide an overview of different approaches for evaluating and understanding the reasoning capabilities of NLP systems. We then focus our discussion on RTE by highlighting prominent RTE datasets as well as advances in RTE dataset that focus on specific linguistic phenomena that can be used to evaluate NLP systems on a fine-grained level. We conclude by arguing that when evaluating NLP systems, the community should utilize newly introduced RTE datasets that focus on specific linguistic phenomena. 
+ 2020.eval4nlp-1.10 + + + Grammaticality and Language Modelling + JingchengNiu + GeraldPenn + 110–119 + Ever since Pereira (2000) provided evidence against Chomsky’s (1957) conjecture that statistical language modelling is incommensurable with the aims of grammaticality prediction as a research enterprise, a new area of research has emerged that regards statistical language models as “psycholinguistic subjects” and probes their ability to acquire syntactic knowledge. The advent of The Corpus of Linguistic Acceptability (CoLA) (Warstadt et al., 2019) has earned a spot on the leaderboard for acceptability judgements, and the polemic between Lau et al. (2017) and Sprouse et al. (2018) has raised fundamental questions about the nature of grammaticality and how acceptability judgements should be elicited. All the while, we are told that neural language models continue to improve. That is not an easy claim to test at present, however, because there is almost no agreement on how to measure their improvement when it comes to grammaticality and acceptability judgements. The GLUE leaderboard bundles CoLA together with a Matthews correlation coefficient (MCC), although probably because CoLA’s seminal publication was using it to compute inter-rater reliabilities. Researchers working in this area have used other accuracy and correlation scores, often driven by a need to reconcile and compare various discrete and continuous variables with each other. The score that we will advocate for in this paper, the point biserial correlation, in fact compares a discrete variable (for us, acceptability judgements) to a continuous variable (for us, neural language model probabilities). The only previous work in this area to choose the PBC that we are aware of is Sprouse et al. (2018a), and that paper actually applied it backwards (with some justification) so that the language model probability was treated as the discrete binary variable by setting a threshold. 
With the PBC in mind, we will first reappraise some recent work in syntactically targeted linguistic evaluations (Hu et al., 2020), arguing that while their experimental design sets a new high watermark for this topic, their results may not prove what they have claimed. We then turn to the task-independent assessment of language models as grammaticality classifiers. Prior to the introduction of the GLUE leaderboard, the vast majority of this assessment was essentially anecdotal, and we find the use of the MCC in this regard to be problematic. We conduct several studies with PBCs to compare several popular language models. We also study the effects of several variables such as normalization and data homogeneity on PBC. + 2020.eval4nlp-1.11 + + + One of these words is not like the other: a reproduction of outlier identification using non-contextual word representations + JesperBrink Andersen + MikkelBak Bertelsen + MikkelHørby Schou + Manuel R.Ciosici + IraAssent + 120–130 + Word embeddings are an active topic in the NLP research community. State-of-the-art neural models achieve high performance on downstream tasks, albeit at the cost of computationally expensive training. Cost aware solutions require cheaper models that still achieve good performance. We present several reproduction studies of intrinsic evaluation tasks that evaluate non-contextual word representations in multiple languages. Furthermore, we present 50-8-8, a new data set for the outlier identification task, which avoids limitations of the original data set, such as ambiguous words, infrequent words, and multi-word tokens, while increasing the number of test cases. The data set is expanded to contain semantic and syntactic tests and is multilingual (English, German, and Italian). We provide an in-depth analysis of word embedding models with a range of hyper-parameters. 
Our analysis shows the suitability of different models and hyper-parameters for different tasks and the greater difficulty of representing German and Italian languages. + 2020.eval4nlp-1.12 + 2020.eval4nlp-1.12.OptionalSupplementaryMaterial.pdf + + + Are Some Words Worth More than Others? + ShiranDudy + StevenBedrick + 131–142 + Current evaluation metrics for language modeling and generation rely heavily on the accuracy of predicted (or generated) words as compared to a reference ground truth. While important, token-level accuracy only captures one aspect of a language model’s behavior, and ignores linguistic properties of words that may allow some mis-predicted tokens to be useful in practice. Furthermore, statistics directly tied to prediction accuracy (including perplexity) may be confounded by the Zipfian nature of written language, as the majority of the prediction attempts will occur with frequently-occurring types. A model’s performance may vary greatly between high- and low-frequency words, which in practice could lead to failure modes such as repetitive and dull generated text being produced by a downstream consumer of a language model. To address this, we propose two new intrinsic evaluation measures within the framework of a simple word prediction task that are designed to give a more holistic picture of a language model’s performance. We evaluate several commonly-used large English language models using our proposed metrics, and demonstrate that our approach reveals functional differences in performance between the models that are obscured by more traditional metrics. 
+ 2020.eval4nlp-1.13 + + + On Aligning <fixed-case>O</fixed-case>pen<fixed-case>IE</fixed-case> Extractions with Knowledge Bases: A Case Study + KirilGashteovski + RainerGemulla + BhushanKotnis + SvenHertling + ChristianMeilicke + 143–154 + Open information extraction (OIE) is the task of extracting relations and their corresponding arguments from a natural language text in unsupervised manner. Outputs of such systems are used for downstream tasks such as question answering and automatic knowledge base (KB) construction. Many of these downstream tasks rely on aligning OIE triples with reference KBs. Such alignments are usually evaluated w.r.t. a specific downstream task and, to date, no direct manual evaluation of such alignments has been performed. In this paper, we directly evaluate how OIE triples from the OPIEC corpus are related to the DBpedia KB w.r.t. information content. First, we investigate OPIEC triples and DBpedia facts having the same arguments by comparing the information on the OIE surface relation with the KB relation. Second, we evaluate the expressibility of general OPIEC triples in DBpedia. We investigate whether—and, if so, how—a given OIE triple can be mapped to a single KB fact. We found that such mappings are not always possible because the information in the OIE triples tends to be more specific. Our evaluation suggests, however, that significant part of OIE triples can be expressed by means of KB formulas instead of individual facts. + 2020.eval4nlp-1.14 + + + <fixed-case>C</fixed-case>luster<fixed-case>D</fixed-case>ata<fixed-case>S</fixed-case>plit: Exploring Challenging Clustering-Based Data Splits for Model Performance Evaluation + HannaWecker + AnnemarieFriedrich + HeikeAdel + 155–163 + This paper adds to the ongoing discussion in the natural language processing community on how to choose a good development set. 
Motivated by the real-life necessity of applying machine learning models to different data distributions, we propose a clustering-based data splitting algorithm. It creates development (or test) sets which are lexically different from the training data while ensuring similar label distributions. Hence, we are able to create challenging cross-validation evaluation setups while abstracting away from performance differences resulting from label distribution shifts between training and test data. In addition, we present a Python-based tool for analyzing and visualizing data split characteristics and model performance. We illustrate the workings and results of our approach using a sentiment analysis and a patent classification task. + 2020.eval4nlp-1.15 + + + Best Practices for Crowd-based Evaluation of <fixed-case>G</fixed-case>erman Summarization: Comparing Crowd, Expert and Automatic Evaluation + NeslihanIskender + TimPolzehl + SebastianMöller + 164–175 + One of the main challenges in the development of summarization tools is summarization quality evaluation. On the one hand, the human assessment of summarization quality conducted by linguistic experts is slow, expensive, and still not a standardized procedure. On the other hand, the automatic assessment metrics are reported not to correlate high enough with human quality ratings. As a solution, we propose crowdsourcing as a fast, scalable, and cost-effective alternative to expert evaluations to assess the intrinsic and extrinsic quality of summarization by comparing crowd ratings with expert ratings and automatic metrics such as ROUGE, BLEU, or BertScore on a German summarization data set. 
Our results provide a basis for best practices for crowd-based summarization evaluation regarding major influential factors such as the best annotation aggregation method, the influence of readability and reading effort on summarization evaluation, and the optimal number of crowd workers to achieve comparable results to experts, especially when determining factors such as overall quality, grammaticality, referential clarity, focus, structure & coherence, summary usefulness, and summary informativeness. + 2020.eval4nlp-1.16 + + + Evaluating Word Embeddings on Low-Resource Languages + NathanStringham + MikeIzbicki + 176–186 + The analogy task introduced by Mikolov et al. (2013) has become the standard metric for tuning the hyperparameters of word embedding models. In this paper, however, we argue that the analogy task is unsuitable for low-resource languages for two reasons: (1) it requires that word embeddings be trained on large amounts of text, and (2) analogies may not be well-defined in some low-resource settings. We solve these problems by introducing the OddOneOut and Topk tasks, which are specifically designed for model selection in the low-resource setting. We use these metrics to successfully tune hyperparameters for a low-resource emoji embedding task and word embeddings on 16 extinct languages. The largest of these languages (Ancient Hebrew) has a 41 million token dataset, and the smallest (Old Gujarati) has only a 1813 token dataset. + 2020.eval4nlp-1.17 + +
+
diff --git a/data/xml/2020.figlang.xml b/data/xml/2020.figlang.xml index b24073c393..e34b431186 100644 --- a/data/xml/2020.figlang.xml +++ b/data/xml/2020.figlang.xml @@ -84,7 +84,7 @@
Sarcasm Detection using Context Separators in Online Discourse - TANVIDADU + TanviDadu KartikeyPant 51–55 Sarcasm is an intricate form of speech, where meaning is conveyed implicitly. Being a convoluted form of expression, detecting sarcasm is an assiduous problem. The difficulty in recognition of sarcasm has many pitfalls, including misunderstandings in everyday communications, which leads us to an increasing focus on automated sarcasm detection. In the second edition of the Figurative Language Processing (FigLang 2020) workshop, the shared task of sarcasm detection released two datasets, containing responses along with their context sampled from Twitter and Reddit. In this work, we use RoBERTa_{large} to detect sarcasm in both the datasets. We further assert the importance of context in improving the performance of contextual word embedding based models by using three different types of inputs - Response-only, Context-Response, and Context-Response (Separated). We show that our proposed architecture performs competitively for both the datasets. We also show that the addition of a separation token between context and target response results in an improvement of 5.13% in the F1-score in the Reddit dataset. @@ -189,7 +189,7 @@ Using Conceptual Norms for Metaphor Detection - MingyuWAN + MingyuWan KathleenAhrens EmmanueleChersoni MenghanJiang diff --git a/data/xml/2020.findings.xml b/data/xml/2020.findings.xml new file mode 100644 index 0000000000..99d8c862bb --- /dev/null +++ b/data/xml/2020.findings.xml @@ -0,0 +1,4807 @@ + + + + + Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: Findings + TrevorCohn + YulanHe + YangLiu + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.findings-emnlp.0 + + + Fully Quantized Transformer for Machine Translation + GabrielePrato + EllaCharlaix + MehdiRezagholizadeh + 1–14 + State-of-the-art neural machine translation methods employ massive amounts of parameters. Drastically reducing computational costs of such methods without affecting performance has been up to this point unsuccessful. To this end, we propose FullyQT: an all-inclusive quantization strategy for the Transformer. To the best of our knowledge, we are the first to show that it is possible to avoid any loss in translation quality with a fully quantized Transformer. Indeed, compared to full-precision, our 8-bit models score greater or equal BLEU on most tasks. Comparing ourselves to all previously proposed methods, we achieve state-of-the-art quantization results. + 2020.findings-emnlp.1 + + + Summarizing <fixed-case>C</fixed-case>hinese Medical Answer with Graph Convolution Networks and Question-focused Dual Attention + NingyuZhang + ShuminDeng + JuanLi + XiChen + WeiZhang + HuajunChen + 15–24 + Online search engines are a popular source of medical information for users, where users can enter questions and obtain relevant answers. It is desirable to generate answer summaries for online search engines, particularly summaries that can reveal direct answers to questions. Moreover, answer summaries are expected to reveal the most relevant information in response to questions; hence, the summaries should be generated with a focus on the question, which is a challenging topic-focused summarization task. In this paper, we propose an approach that utilizes graph convolution networks and question-focused dual attention for Chinese medical answer summarization. We first organize the original long answer text into a medical concept graph with graph convolution networks to better understand the internal structure of the text and the correlation between medical concepts. 
Then, we introduce a question-focused dual attention mechanism to generate summaries relevant to questions. Experimental results demonstrate that the proposed model can generate more coherent and informative summaries compared with baseline models. + 2020.findings-emnlp.2 + + + Stay Hungry, Stay Focused: Generating Informative and Specific Questions in Information-Seeking Conversations + PengQi + YuhaoZhang + Christopher D.Manning + 25–40 + We investigate the problem of generating informative questions in information-asymmetric conversations. Unlike previous work on question generation which largely assumes knowledge of what the answer might be, we are interested in the scenario where the questioner is not given the context from which answers are drawn, but must reason pragmatically about how to acquire new information, given the shared conversation history. We identify two core challenges: (1) formally defining the informativeness of potential questions, and (2) exploring the prohibitively large space of potential questions to find the good candidates. To generate pragmatic questions, we use reinforcement learning to optimize an informativeness metric we propose, combined with a reward function designed to promote more specific questions. We demonstrate that the resulting pragmatic questioner substantially improves the informativeness and specificity of questions generated over a baseline model, as evaluated by our metrics as well as humans. + 2020.findings-emnlp.3 + + + Adapting <fixed-case>BERT</fixed-case> for Word Sense Disambiguation with Gloss Selection Objective and Example Sentences + Boon PengYap + AndrewKoh + Eng SiongChng + 41–46 + Domain adaptation or transfer learning using pre-trained language models such as BERT has proven to be an effective approach for many natural language processing tasks. 
In this work, we propose to formulate word sense disambiguation as a relevance ranking task, and fine-tune BERT on sequence-pair ranking task to select the most probable sense definition given a context sentence and a list of candidate sense definitions. We also introduce a data augmentation technique for WSD using existing example sentences from WordNet. Using the proposed training objective and data augmentation technique, our models are able to achieve state-of-the-art results on the English all-words benchmark datasets. + 2020.findings-emnlp.4 + + + Adversarial Text Generation via Sequence Contrast Discrimination + KeWang + XiaojunWan + 47–53 + In this paper, we propose a sequence contrast loss driven text generation framework, which learns the difference between real texts and generated texts and uses that difference. Specifically, our discriminator contains a discriminative sequence generator instead of a binary classifier, and measures the ‘relative realism’ of generated texts against real texts by making use of them simultaneously. Moreover, our generator uses discriminative sequences to directly improve itself, which not only replaces the gradient propagation process from the discriminator to the generator, but also avoids the time-consuming sampling process of estimating rewards in some previous methods. We conduct extensive experiments with various metrics, substantiating that our framework brings improvements in terms of training stability and the quality of generated texts. + 2020.findings-emnlp.5 + 2020.findings-emnlp.5.OptionalSupplementaryMaterial.zip + + + <fixed-case>GRACE</fixed-case>: Gradient Harmonized and Cascaded Labeling for Aspect-based Sentiment Analysis + HuaishaoLuo + LeiJi + TianruiLi + DaxinJiang + NanDuan + 54–64 + In this paper, we focus on the imbalance issue, which is rarely studied in aspect term extraction and aspect sentiment classification when regarding them as sequence labeling tasks. 
Besides, previous works usually ignore the interaction between aspect terms when labeling polarities. We propose a GRadient hArmonized and CascadEd labeling model (GRACE) to solve these problems. Specifically, a cascaded labeling module is developed to enhance the interchange between aspect terms and improve the attention of sentiment tokens when labeling sentiment polarities. The polarities sequence is designed to depend on the generated aspect terms labels. To alleviate the imbalance issue, we extend the gradient harmonized mechanism used in object detection to the aspect-based sentiment analysis by adjusting the weight of each label dynamically. The proposed GRACE adopts a post-pretraining BERT as its backbone. Experimental results demonstrate that the proposed model achieves consistency improvement on multiple benchmark datasets and generates state-of-the-art results. + 2020.findings-emnlp.6 + + + Reducing Sentiment Bias in Language Models via Counterfactual Evaluation + Po-SenHuang + HuanZhang + RayJiang + RobertStanforth + JohannesWelbl + JackRae + VishalMaini + DaniYogatama + PushmeetKohli + 65–83 + Advances in language modeling architectures and the availability of large text corpora have driven progress in automatic text generation. While this results in models capable of generating coherent texts, it also prompts models to internalize social biases present in the training corpus. This paper aims to quantify and reduce a particular type of bias exhibited by language models: bias in the sentiment of generated text. Given a conditioning context (e.g., a writing prompt) and a language model, we analyze if (and how) the sentiment of the generated text is affected by changes in values of sensitive attributes (e.g., country names, occupations, genders) in the conditioning context using a form of counterfactual evaluation. 
We quantify sentiment bias by adopting individual and group fairness metrics from the fair machine learning literature, and demonstrate that large-scale models trained on two different corpora (news articles, and Wikipedia) exhibit considerable levels of bias. We then propose embedding and sentiment prediction-derived regularization on the language model’s latent representations. The regularizations improve fairness metrics while retaining comparable levels of perplexity and semantic similarity. + 2020.findings-emnlp.7 + + + Improving Text Understanding via Deep Syntax-Semantics Communication + HaoFei + YafengRen + DonghongJi + 84–93 + Recent studies show that integrating syntactic tree models with sequential semantic models can bring improved task performance, while these methods mostly employ shallow integration of syntax and semantics. In this paper, we propose a deep neural communication model between syntax and semantics to improve the performance of text understanding. Local communication is performed between syntactic tree encoder and sequential semantic encoder for mutual learning of information exchange. Global communication can further ensure comprehensive information propagation. Results on multiple syntax-dependent tasks show that our model outperforms strong baselines by a large margin. In-depth analysis indicates that our method is highly effective in composing sentence semantics. + 2020.findings-emnlp.8 + + + <fixed-case>GRUEN</fixed-case> for Evaluating Linguistic Quality of Generated Text + WanzhengZhu + SumaBhat + 94–108 + Automatic evaluation metrics are indispensable for evaluating generated text. To date, these metrics have focused almost exclusively on the content selection aspect of the system output, ignoring the linguistic quality aspect altogether. We bridge this gap by proposing GRUEN for evaluating Grammaticality, non-Redundancy, focUs, structure and coherENce of generated text. 
GRUEN utilizes a BERT-based model and a class of syntactic, semantic, and contextual features to examine the system output. Unlike most existing evaluation metrics which require human references as an input, GRUEN is reference-less and requires only the system output. Besides, it has the advantage of being unsupervised, deterministic, and adaptable to various tasks. Experiments on seven datasets over four language generation tasks show that the proposed metric correlates highly with human judgments. + 2020.findings-emnlp.9 + + + A Greedy Bit-flip Training Algorithm for Binarized Knowledge Graph Embeddings + KatsuhikoHayashi + KokiKishimoto + MasashiShimbo + 109–114 + This paper presents a simple and effective discrete optimization method for training binarized knowledge graph embedding model B-CP. Unlike the prior work using a SGD-based method and quantization of real-valued vectors, the proposed method directly optimizes binary embedding vectors by a series of bit flipping operations. On the standard knowledge graph completion tasks, the B-CP model trained with the proposed method achieved comparable performance with that trained with SGD as well as state-of-the-art real-valued models with similar embedding dimensions. + 2020.findings-emnlp.10 + + + Difference-aware Knowledge Selection for Knowledge-grounded Conversation Generation + ChujieZheng + YunboCao + DaxinJiang + MinlieHuang + 115–125 + In a multi-turn knowledge-grounded dialog, the difference between the knowledge selected at different turns usually provides potential clues to knowledge selection, which has been largely neglected in previous research. In this paper, we propose a difference-aware knowledge selection method. It first computes the difference between the candidate knowledge sentences provided at the current turn and those chosen in the previous turns. Then, the differential information is fused with or disentangled from the contextual information to facilitate final knowledge selection. 
Automatic, human observational, and interactive evaluation shows that our method is able to select knowledge more accurately and generate more informative responses, significantly outperforming the state-of-the-art baselines. + 2020.findings-emnlp.11 + + + An Attentive Recurrent Model for Incremental Prediction of Sentence-final Verbs + WenyanLi + AlvinGrissom II + JordanBoyd-Graber + 126–136 + Verb prediction is important for understanding human processing of verb-final languages, with practical applications to real-time simultaneous interpretation from verb-final to verb-medial languages. While previous approaches use classical statistical models, we introduce an attention-based neural model to incrementally predict final verbs on incomplete sentences in Japanese and German SOV sentences. To offer flexibility to the model, we further incorporate synonym awareness. Our approach both better predicts the final verbs in Japanese and German and provides more interpretable explanations of why those verbs are selected. + 2020.findings-emnlp.12 + 2020.findings-emnlp.12.OptionalSupplementaryMaterial.zip + + + Transformer-<fixed-case>GCRF</fixed-case>: Recovering <fixed-case>C</fixed-case>hinese Dropped Pronouns with General Conditional Random Fields + JingxuanYang + KeruiXu + JunXu + SiLi + ShengGao + JunGuo + Ji-RongWen + NianwenXue + 137–147 + Pronouns are often dropped in Chinese conversations and recovering the dropped pronouns is important for NLP applications such as Machine Translation. Existing approaches usually formulate this as a sequence labeling task of predicting whether there is a dropped pronoun before each token and its type. Each utterance is considered to be a sequence and labeled independently. Although these approaches have shown promise, labeling each utterance independently ignores the dependencies between pronouns in neighboring utterances. Modeling these dependencies is critical to improving the performance of dropped pronoun recovery. 
In this paper, we present a novel framework that combines the strength of Transformer network with General Conditional Random Fields (GCRF) to model the dependencies between pronouns in neighboring utterances. Results on three Chinese conversation datasets show that the Transformer-GCRF model outperforms the state-of-the-art dropped pronoun recovery models. Exploratory analysis also demonstrates that the GCRF did help to capture the dependencies between pronouns in neighboring utterances, thus contributes to the performance improvements. + 2020.findings-emnlp.13 + + + Neural Speed Reading Audited + AndersSøgaard + 148–153 + Several approaches to neural speed reading have been presented at major NLP and machine learning conferences in 2017–20; i.e., “human-inspired” recurrent network architectures that learn to “read” text faster by skipping irrelevant words, typically optimizing the joint objective of minimizing classification error rate and FLOPs used at inference time. This paper reflects on the meaningfulness of the speed reading task, showing that (a) better and faster approaches to, say, document classification, already exist, which also learn to ignore part of the input (I give an example with 7% error reduction and a 136x speed-up over the state of the art in neural speed reading); and that (b) any claims that neural speed reading is “human-inspired”, are ill-founded. + 2020.findings-emnlp.14 + + + Converting the Point of View of Message Spoken to Virtual Assistants + GunheeLee + VeraZu + Sai SrujanaBuddi + DennisLiang + PurvaKulkarni + JackFitzGerald + 154–163 + Virtual Assistants can be quite literal at times. 
If the user says “tell Bob I love him,” most virtual assistants will extract the message “I love him” and send it to the user’s contact named Bob, rather than properly converting the message to “I love you.” We designed a system to allow virtual assistants to take a voice message from one user, convert the point of view of the message, and then deliver the result to its target user. We developed a rule-based model, which integrates a linear text classification model, part-of-speech tagging, and constituency parsing with rule-based transformation methods. We also investigated Neural Machine Translation (NMT) approaches, including LSTMs, CopyNet, and T5. We explored 5 metrics to gauge both naturalness and faithfulness automatically, and we chose to use BLEU plus METEOR for faithfulness and relative perplexity using a separately trained language model (GPT) for naturalness. Transformer-Copynet and T5 performed similarly on faithfulness metrics, with T5 achieving slight edge, a BLEU score of 63.8 and a METEOR score of 83.0. CopyNet was the most natural, with a relative perplexity of 1.59. CopyNet also has 37 times fewer parameters than T5. We have publicly released our dataset, which is composed of 46,565 crowd-sourced samples. + 2020.findings-emnlp.15 + + + Robustness to Modification with Shared Words in Paraphrase Identification + ZhouxingShi + MinlieHuang + 164–171 + Revealing the robustness issues of natural language processing models and improving their robustness is important to their performance under difficult situations. In this paper, we study the robustness of paraphrase identification models from a new perspective – via modification with shared words, and we show that the models have significant robustness issues when facing such modifications. To modify an example consisting of a sentence pair, we either replace some words shared by both sentences or introduce new shared words. 
We aim to construct a valid new example such that a target model makes a wrong prediction. To find a modification solution, we use beam search constrained by heuristic rules, and we leverage a BERT masked language model for generating substitution words compatible with the context. Experiments show that the performance of the target models has a dramatic drop on the modified examples, thereby revealing the robustness issue. We also show that adversarial training can mitigate this issue. + 2020.findings-emnlp.16 + + + Few-shot Natural Language Generation for Task-Oriented Dialog + BaolinPeng + ChenguangZhu + ChunyuanLi + XiujunLi + JinchaoLi + MichaelZeng + JianfengGao + 172–182 + As a crucial component in task-oriented dialog systems, the Natural Language Generation (NLG) module converts a dialog act represented in a semantic form into a response in natural language. The success of traditional template-based or statistical models typically relies on heavily annotated data, which is infeasible for new domains. Therefore, it is pivotal for an NLG system to generalize well with limited labelled data in real applications. To this end, we present FewshotWOZ, the first NLG benchmark to simulate the few-shot learning setting in task-oriented dialog systems. Further, we develop the SC-GPT model. It is pre-trained on a large set of annotated NLG corpus to acquire the controllable generation ability, and fine-tuned with only a few domain-specific labels to adapt to new domains. Experiments on FewshotWOZ and the large Multi-Domain-WOZ datasets show that the proposed SC-GPT significantly outperforms existing methods, measured by various automatic metrics and human evaluations. 
+ 2020.findings-emnlp.17 + + + Mimic and Conquer: Heterogeneous Tree Structure Distillation for Syntactic <fixed-case>NLP</fixed-case> + HaoFei + YafengRen + DonghongJi + 183–193 + Syntax has been shown useful for various NLP tasks, while existing work mostly encodes singleton syntactic tree using one hierarchical neural network. In this paper, we investigate a simple and effective method, Knowledge Distillation, to integrate heterogeneous structure knowledge into a unified sequential LSTM encoder. Experimental results on four typical syntax-dependent tasks show that our method outperforms tree encoders by effectively integrating rich heterogeneous structure syntax, meanwhile reducing error propagation, and also outperforms ensemble methods, in terms of both the efficiency and accuracy. + 2020.findings-emnlp.18 + + + A Hierarchical Network for Abstractive Meeting Summarization with Cross-Domain Pretraining + ChenguangZhu + RuochenXu + MichaelZeng + XuedongHuang + 194–203 + With the abundance of automatic meeting transcripts, meeting summarization is of great interest to both participants and other parties. Traditional methods of summarizing meetings depend on complex multi-step pipelines that make joint optimization intractable. Meanwhile, there are a handful of deep neural models for text summarization and dialogue systems. However, the semantic structure and styles of meeting transcripts are quite different from articles and conversations. In this paper, we propose a novel abstractive summary network that adapts to the meeting scenario. We design a hierarchical structure to accommodate long meeting transcripts and a role vector to depict the difference among speakers. Furthermore, due to the inadequacy of meeting summary data, we pretrain the model on large-scale news summary data. Empirical results show that our model outperforms previous approaches in both automatic metrics and human evaluation. 
For example, on ICSI dataset, the ROUGE-1 score increases from 34.66% to 46.28%. + 2020.findings-emnlp.19 + + + Active Testing: An Unbiased Evaluation Method for Distantly Supervised Relation Extraction + PengshuaiLi + XinsongZhang + WeijiaJia + WeiZhao + 204–211 + Distant supervision has been a widely used method for neural relation extraction for its convenience of automatically labeling datasets. However, existing works on distantly supervised relation extraction suffer from the low quality of test set, which leads to considerable biased performance evaluation. These biases not only result in unfair evaluations but also mislead the optimization of neural relation extraction. To mitigate this problem, we propose a novel evaluation method named active testing through utilizing both the noisy test set and a few manual annotations. Experiments on a widely used benchmark show that our proposed approach can yield approximately unbiased evaluations for distantly supervised relation extractors. + 2020.findings-emnlp.20 + 2020.findings-emnlp.20.OptionalSupplementaryMaterial.pdf + + + Semantic Matching via Optimal Partial Transport + RuiyiZhang + ChangyouChen + XinyuanZhang + KeBai + LawrenceCarin + 212–222 + In sequence-to-sequence models, classical optimal transport (OT) can be applied to semantically match generated sentences with target sentences. However, in non-parallel settings, target sentences are usually unavailable. To tackle this issue without losing the benefits of classical OT, we present a semantic matching scheme based on the Optimal Partial Transport (OPT). Specifically, our approach partially matches semantically meaningful words between source and partial target sequences. To overcome the difficulty of detecting active regions in OPT (corresponding to the words needed to be matched), we further exploit prior knowledge to perform partial matching. 
Extensive experiments are conducted to evaluate the proposed approach, showing consistent improvements over sequence-to-sequence tasks. + 2020.findings-emnlp.21 + 2020.findings-emnlp.21.OptionalSupplementaryMaterial.bbl + + + How Decoding Strategies Affect the Verifiability of Generated Text + LucaMassarelli + FabioPetroni + AleksandraPiktus + MyleOtt + TimRocktäschel + VassilisPlachouras + FabrizioSilvestri + SebastianRiedel + 223–235 + Recent progress in pre-trained language models led to systems that are able to generate text of an increasingly high quality. While several works have investigated the fluency and grammatical correctness of such models, it is still unclear to which extent the generated text is consistent with factual world knowledge. Here, we go beyond fluency and also investigate the verifiability of text generated by state-of-the-art pre-trained language models. A generated sentence is verifiable if it can be corroborated or disproved by Wikipedia, and we find that the verifiability of generated text strongly depends on the decoding strategy. In particular, we discover a tradeoff between factuality (i.e., the ability of generating Wikipedia corroborated text) and repetitiveness. While decoding strategies such as top-k and nucleus sampling lead to less repetitive generations, they also produce less verifiable text. Based on these finding, we introduce a simple and effective decoding strategy which, in comparison to previously used decoding strategies, produces less repetitive and more verifiable text. + 2020.findings-emnlp.22 + + + Minimize Exposure Bias of <fixed-case>S</fixed-case>eq2<fixed-case>S</fixed-case>eq Models in Joint Entity and Relation Extraction + Ranran HaoranZhang + QianyingLiu + Aysa XuemoFan + HengJi + DaojianZeng + FeiCheng + DaisukeKawahara + SadaoKurohashi + 236–246 + Joint entity and relation extraction aims to extract relation triplets from plain text directly. 
Prior work leverages Sequence-to-Sequence (Seq2Seq) models for triplet sequence generation. However, Seq2Seq enforces an unnecessary order on the unordered triplets and involves a large decoding length associated with error accumulation. These methods introduce exposure bias, which may cause the models overfit to the frequent label combination, thus limiting the generalization ability. We propose a novel Sequence-to-Unordered-Multi-Tree (Seq2UMTree) model to minimize the effects of exposure bias by limiting the decoding length to three within a triplet and removing the order among triplets. We evaluate our model on two datasets, DuIE and NYT, and systematically study how exposure bias alters the performance of Seq2Seq models. Experiments show that the state-of-the-art Seq2Seq model overfits to both datasets while Seq2UMTree shows significantly better generalization. Our code is available at https://github.com/WindChimeRan/OpenJERE. + 2020.findings-emnlp.23 + + + Gradient-based Analysis of <fixed-case>NLP</fixed-case> Models is Manipulable + JunlinWang + JensTuyls + EricWallace + SameerSingh + 247–258 + Gradient-based analysis methods, such as saliency map visualizations and adversarial input perturbations, have found widespread use in interpreting neural NLP models due to their simplicity, flexibility, and most importantly, the fact that they directly reflect the model internals. In this paper, however, we demonstrate that the gradients of a model are easily manipulable, and thus bring into question the reliability of gradient-based analyses. In particular, we merge the layers of a target model with a Facade Model that overwhelms the gradients without affecting the predictions. This Facade Model can be trained to have gradients that are misleading and irrelevant to the task, such as focusing only on the stop words in the input. 
On a variety of NLP tasks (sentiment analysis, NLI, and QA), we show that the merged model effectively fools different analysis tools: saliency maps differ significantly from the original model’s, input reduction keeps more irrelevant input tokens, and adversarial perturbations identify unimportant tokens as being highly important. + 2020.findings-emnlp.24 + 2020.findings-emnlp.24.OptionalSupplementaryMaterial.zip + + + Pretrain-<fixed-case>KGE</fixed-case>: Learning Knowledge Representation from Pretrained Language Models + ZhiyuanZhang + XiaoqianLiu + YiZhang + QiSu + XuSun + BinHe + 259–266 + Conventional knowledge graph embedding (KGE) often suffers from limited knowledge representation, leading to performance degradation especially on the low-resource problem. To remedy this, we propose to enrich knowledge representation via pretrained language models by leveraging world knowledge from pretrained models. Specifically, we present a universal training framework named Pretrain-KGE consisting of three phases: semantic-based fine-tuning phase, knowledge extracting phase and KGE training phase. Extensive experiments show that our proposed Pretrain-KGE can improve results over KGE models, especially on solving the low-resource problem. + 2020.findings-emnlp.25 + + + A Self-Refinement Strategy for Noise Reduction in Grammatical Error Correction + MasatoMita + ShunKiyono + MasahiroKaneko + JunSuzuki + KentaroInui + 267–280 + Existing approaches for grammatical error correction (GEC) largely rely on supervised learning with manually created GEC datasets. However, there has been little focus on verifying and ensuring the quality of the datasets, and on how lower-quality data might affect GEC performance. We indeed found that there is a non-negligible amount of “noise” where errors were inappropriately edited or left uncorrected. 
To address this, we designed a self-refinement method where the key idea is to denoise these datasets by leveraging the prediction consistency of existing models, and outperformed strong denoising baseline methods. We further applied task-specific techniques and achieved state-of-the-art performance on the CoNLL-2014, JFLEG, and BEA-2019 benchmarks. We then analyzed the effect of the proposed denoising method, and found that our approach leads to improved coverage of corrections and facilitated fluency edits which are reflected in higher recall and overall performance. + 2020.findings-emnlp.26 + + + Understanding tables with intermediate pre-training + JulianEisenschlos + SyrineKrichene + ThomasMüller + 281–296 + Table entailment, the binary classification task of finding if a sentence is supported or refuted by the content of a table, requires parsing language and table structure as well as numerical and discrete reasoning. While there is extensive work on textual entailment, table entailment is less well studied. We adapt TAPAS (Herzig et al., 2020), a table-based BERT model, to recognize entailment. Motivated by the benefits of data augmentation, we create a balanced dataset of millions of automatically created training examples which are learned in an intermediate step prior to fine-tuning. This new data is not only useful for table entailment, but also for SQA (Iyyer et al., 2017), a sequential table QA task. To be able to use long examples as input of BERT models, we evaluate table pruning techniques as a pre-processing step to drastically improve the training and prediction efficiency at a moderate drop in accuracy. The different methods set the new state-of-the-art on the TabFact (Chen et al., 2020) and SQA datasets. 
+ 2020.findings-emnlp.27 + 2020.findings-emnlp.27.OptionalSupplementaryMaterial.pdf + + + Enhance Robustness of Sequence Labelling with Masked Adversarial Training + LuoxinChen + XinyueLiu + WeitongRuan + JianhuaLu + 297–302 + Adversarial training (AT) has shown strong regularization effects on deep learning algorithms by introducing small input perturbations to improve model robustness. In language tasks, adversarial training brings word-level robustness by adding input noise, which is beneficial for text classification. However, it lacks sufficient contextual information enhancement and thus is less useful for sequence labelling tasks such as chunking and named entity recognition (NER). To address this limitation, we propose masked adversarial training (MAT) to improve robustness from contextual information in sequence labelling. MAT masks or replaces some words in the sentence when computing adversarial loss from perturbed inputs and consequently enhances model robustness using more context-level information. In our experiments, our method shows significant improvements on accuracy and robustness of sequence labelling. By further incorporating with ELMo embeddings, our model achieves better or comparable results to state-of-the-art on CoNLL 2000 and 2003 benchmarks using much less parameters. + 2020.findings-emnlp.28 + + + Multilingual Argument Mining: Datasets and Analysis + OrithToledo-Ronen + MatanOrbach + YonatanBilu + ArtemSpector + NoamSlonim + 303–317 + The growing interest in argument mining and computational argumentation brings with it a plethora of Natural Language Understanding (NLU) tasks and corresponding datasets. However, as with many other NLU tasks, the dominant language is English, with resources in other languages being few and far between. 
In this work, we explore the potential of transfer learning using the multilingual BERT model to address argument mining tasks in non-English languages, based on English datasets and the use of machine translation. We show that such methods are well suited for classifying the stance of arguments and detecting evidence, but less so for assessing the quality of arguments, presumably because quality is harder to preserve under translation. In addition, focusing on the translate-train approach, we show how the choice of languages for translation, and the relations among them, affect the accuracy of the resultant model. Finally, to facilitate evaluation of transfer learning on argument mining tasks, we provide a human-generated dataset with more than 10k arguments in multiple languages, as well as machine translation of the English datasets. + 2020.findings-emnlp.29 + + + Improving Grammatical Error Correction with Machine Translation Pairs + WangchunshuZhou + TaoGe + ChangMu + KeXu + FuruWei + MingZhou + 318–328 + We propose a novel data synthesis method to generate diverse error-corrected sentence pairs for improving grammatical error correction, which is based on a pair of machine translation models (e.g., Chinese to English) of different qualities (i.e., poor and good). The poor translation model can resemble the ESL (English as a second language) learner and tends to generate translations of low quality in terms of fluency and grammaticality, while the good translation model generally generates fluent and grammatically correct translations. With the pair of translation models, we can generate unlimited numbers of poor to good English sentence pairs from text in the source language (e.g., Chinese) of the translators. Our approach can generate various error-corrected patterns and nicely complement the other data synthesis approaches for GEC. 
Experimental results demonstrate the data generated by our approach can effectively help a GEC model to improve the performance and achieve the state-of-the-art single-model performance in BEA-19 and CoNLL-14 benchmark datasets. + 2020.findings-emnlp.30 + + + Machines Getting with the Program: Understanding Intent Arguments of Non-Canonical Directives + Won IkCho + YoungkiMoon + SangwhanMoon + Seok MinKim + Nam SooKim + 329–339 + Modern dialog managers face the challenge of having to fulfill human-level conversational skills as part of common user expectations, including but not limited to discourse with no clear objective. Along with these requirements, agents are expected to extrapolate intent from the user’s dialogue even when subjected to non-canonical forms of speech. This depends on the agent’s comprehension of paraphrased forms of such utterances. Especially in low-resource languages, the lack of data is a bottleneck that prevents advancements of the comprehension performance for these types of agents. In this regard, here we demonstrate the necessity of extracting the intent argument of non-canonical directives in a natural language format, which may yield more accurate parsing, and suggest guidelines for building a parallel corpus for this purpose. Following the guidelines, we construct a Korean corpus of 50K instances of question/command-intent pairs, including the labels for classification of the utterance type. We also propose a method for mitigating class imbalance, demonstrating the potential applications of the corpus generation method and its multilingual extensibility. + 2020.findings-emnlp.31 + + + The <fixed-case>RELX</fixed-case> Dataset and Matching the Multilingual Blanks for Cross-lingual Relation Classification + AbdullatifKöksal + ArzucanÖzgür + 340–350 + Relation classification is one of the key topics in information extraction, which can be used to construct knowledge bases or to provide useful information for question answering. 
Current approaches for relation classification are mainly focused on the English language and require lots of training data with human annotations. Creating and annotating a large amount of training data for low-resource languages is impractical and expensive. To overcome this issue, we propose two cross-lingual relation classification models: a baseline model based on Multilingual BERT and a new multilingual pretraining setup, which significantly improves the baseline with distant supervision. For evaluation, we introduce a new public benchmark dataset for cross-lingual relation classification in English, French, German, Spanish, and Turkish, called RELX. We also provide the RELX-Distant dataset, which includes hundreds of thousands of sentences with relations from Wikipedia and Wikidata collected by distant supervision for these languages. Our code and data are available at: https://github.com/boun-tabi/RELX + 2020.findings-emnlp.32 + + + Control, Generate, Augment: A Scalable Framework for Multi-Attribute Text Generation + GiuseppeRusso + NoraHollenstein + Claudiu CristianMusat + CeZhang + 351–366 + We introduce CGA, a conditional VAE architecture, to control, generate, and augment text. CGA is able to generate natural English sentences controlling multiple semantic and syntactic attributes by combining adversarial learning with a context-aware loss and a cyclical word dropout routine. We demonstrate the value of the individual model components in an ablation study. The scalability of our approach is ensured through a single discriminator, independently of the number of attributes. We show high quality, diversity and attribute control in the generated sentences through a series of automatic and human assessments. As the main application of our work, we test the potential of this new NLG model in a data augmentation scenario. 
In a downstream NLP task, the sentences generated by our CGA model show significant improvements over a strong baseline, and a classification performance often comparable to adding same amount of additional real data. + 2020.findings-emnlp.33 + + + Open-Ended Visual Question Answering by Multi-Modal Domain Adaptation + YimingXu + LinChen + ZhongweiCheng + LixinDuan + JieboLuo + 367–376 + We study the problem of visual question answering (VQA) in images by exploiting supervised domain adaptation, where there is a large amount of labeled data in the source domain but only limited labeled data in the target domain, with the goal to train a good target model. A straightforward solution is to fine-tune a pre-trained source model by using those limited labeled target data, but it usually cannot work well due to the considerable difference between the data distributions of the source and target domains. Moreover, the availability of multiple modalities (i.e., images, questions and answers) in VQA poses further challenges in modeling the transferability between various modalities. In this paper, we address the above issues by proposing a novel supervised multi-modal domain adaptation method for VQA to learn joint feature embeddings across different domains and modalities. Specifically, we align the data distributions of the source and target domains by considering those modalities both jointly and separately. Extensive experiments on the benchmark VQA 2.0 and VizWiz datasets demonstrate that our proposed method outperforms the existing state-of-the-art baselines for open-ended VQA in this challenging domain adaptation setting. + 2020.findings-emnlp.34 + + + Dual Low-Rank Multimodal Fusion + TaoJin + SiyuHuang + YingmingLi + ZhongfeiZhang + 377–387 + Tensor-based fusion methods have been proven effective in multimodal fusion tasks. However, existing tensor-based methods make a poor use of the fine-grained temporal dynamics of multimodal sequential features. 
Motivated by this observation, this paper proposes a novel multimodal fusion method called Fine-Grained Temporal Low-Rank Multimodal Fusion (FT-LMF). FT-LMF correlates the features of individual time steps between multiple modalities, while it involves multiplications of high-order tensors in its calculation. This paper further proposes Dual Low-Rank Multimodal Fusion (Dual-LMF) to reduce the computational complexity of FT-LMF through low-rank tensor approximation along dual dimensions of input features. Dual-LMF is conceptually simple and practically effective and efficient. Empirical studies on benchmark multimodal analysis tasks show that our proposed methods outperform the state-of-the-art tensor-based fusion methods with a similar computational complexity. + 2020.findings-emnlp.35 + 2020.findings-emnlp.35.OptionalSupplementaryMaterial.zip + + + Contextual Modulation for Relation-Level Metaphor Identification + OmniaZayed + John P.McCrae + PaulBuitelaar + 388–406 + Identifying metaphors in text is very challenging and requires comprehending the underlying comparison. The automation of this cognitive process has gained wide attention lately. However, the majority of existing approaches concentrate on word-level identification by treating the task as either single-word classification or sequential labelling without explicitly modelling the interaction between the metaphor components. On the other hand, while existing relation-level approaches implicitly model this interaction, they ignore the context where the metaphor occurs. In this work, we address these limitations by introducing a novel architecture for identifying relation-level metaphoric expressions of certain grammatical relations based on contextual modulation. In a methodology inspired by works in visual reasoning, our approach is based on conditioning the neural network computation on the deep contextualised features of the candidate expressions using feature-wise linear modulation. 
We demonstrate that the proposed architecture achieves state-of-the-art results on benchmark datasets. The proposed methodology is generic and could be applied to other textual classification problems that benefit from contextual interaction. + 2020.findings-emnlp.36 + + + Context-aware Stand-alone Neural Spelling Correction + XiangciLi + HairongLiu + LiangHuang + 407–414 + Existing natural language processing systems are vulnerable to noisy inputs resulting from misspellings. On the contrary, humans can easily infer the corresponding correct words from their misspellings and surrounding context. Inspired by this, we address the stand-alone spelling correction problem, which only corrects the spelling of each token without additional token insertion or deletion, by utilizing both spelling information and global context representations. We present a simple yet powerful solution that jointly detects and corrects misspellings as a sequence labeling task by fine-turning a pre-trained language model. Our solution outperform the previous state-of-the-art result by 12.8% absolute F0.5 score. + 2020.findings-emnlp.37 + + + A Novel Workflow for Accurately and Efficiently Crowdsourcing Predicate Senses and Argument Labels + YouxuanJiang + HuaiyuZhu + Jonathan K.Kummerfeld + YunyaoLi + WalterLasecki + 415–421 + Resources for Semantic Role Labeling (SRL) are typically annotated by experts at great expense. Prior attempts to develop crowdsourcing methods have either had low accuracy or required substantial expert annotation. We propose a new multi-stage crowd workflow that substantially reduces expert involvement without sacrificing accuracy. In particular, we introduce a unique filter stage based on the key observation that crowd workers are able to almost perfectly filter out incorrect options for labels. Our three-stage workflow produces annotations with 95% accuracy for predicate labels and 93% for argument labels, which is comparable to expert agreement. 
Compared to prior work on crowdsourcing for SRL, we decrease expert effort by 4x, from 56% to 14% of cases. Our approach enables more scalable annotation of SRL, and could enable annotation of NLP tasks that have previously been considered too complex to effectively crowdsource. + 2020.findings-emnlp.38 + 2020.findings-emnlp.38.OptionalSupplementaryMaterial.zip + + + <fixed-case>K</fixed-case>or<fixed-case>NLI</fixed-case> and <fixed-case>K</fixed-case>or<fixed-case>STS</fixed-case>: New Benchmark Datasets for <fixed-case>K</fixed-case>orean Natural Language Understanding + JiyeonHam + Yo JoongChoe + KyubyongPark + IljiChoi + HyungjoonSoh + 422–430 + Natural language inference (NLI) and semantic textual similarity (STS) are key tasks in natural language understanding (NLU). Although several benchmark datasets for those tasks have been released in English and a few other languages, there are no publicly available NLI or STS datasets in the Korean language. Motivated by this, we construct and release new datasets for Korean NLI and STS, dubbed KorNLI and KorSTS, respectively. Following previous approaches, we machine-translate existing English training sets and manually translate development and test sets into Korean. To accelerate research on Korean NLU, we also establish baselines on KorNLI and KorSTS. Our datasets are publicly available at https://github.com/kakaobrain/KorNLUDatasets. + 2020.findings-emnlp.39 + + + Dialogue Generation on Infrequent Sentence Functions via Structured Meta-Learning + YifanGao + PijiLi + WeiBi + XiaojiangLiu + MichaelLyu + IrwinKing + 431–440 + Sentence function is an important linguistic feature indicating the communicative purpose in uttering a sentence. Incorporating sentence functions into conversations has shown improvements in the quality of generated responses. However, the number of utterances for different types of fine-grained sentence functions is extremely imbalanced. 
Besides a small number of high-resource sentence functions, a large portion of sentence functions is infrequent. Consequently, dialogue generation conditioned on these infrequent sentence functions suffers from data deficiency. In this paper, we investigate a structured meta-learning (SML) approach for dialogue generation on infrequent sentence functions. We treat dialogue generation conditioned on different sentence functions as separate tasks, and apply model-agnostic meta-learning to high-resource sentence functions data. Furthermore, SML enhances meta-learning effectiveness by promoting knowledge customization among different sentence functions but simultaneously preserving knowledge generalization for similar sentence functions. Experimental results demonstrate that SML not only improves the informativeness and relevance of generated responses, but also can generate responses consistent with the target sentence functions. Code will be public to facilitate the research along this line. + 2020.findings-emnlp.40 + + + Exploring Versatile Generative Language Model Via Parameter-Efficient Transfer Learning + ZhaojiangLin + AndreaMadotto + PascaleFung + 441–459 + Fine-tuning pre-trained generative language models to down-stream language generation tasks has shown promising results. However, this comes with the cost of having a single, large model for each task, which is not ideal in low-memory/power scenarios (e.g., mobile). In this paper, we propose an effective way to fine-tune multiple down-stream generation tasks simultaneously using a single, large pretrained model. The experiments on five diverse language generation tasks show that by just using an additional 2-3% parameters for each task, our model can maintain or even improve the performance of fine-tuning the whole model. 
+ 2020.findings-emnlp.41 + 2020.findings-emnlp.41.OptionalSupplementaryMaterial.pdf + + + A Fully Hyperbolic Neural Model for Hierarchical Multi-class Classification + FedericoLópez + MichaelStrube + 460–475 + Label inventories for fine-grained entity typing have grown in size and complexity. Nonetheless, they exhibit a hierarchical structure. Hyperbolic spaces offer a mathematically appealing approach for learning hierarchical representations of symbolic data. However, it is not clear how to integrate hyperbolic components into downstream tasks. This is the first work that proposes a fully hyperbolic model for multi-class multi-label classification, which performs all operations in hyperbolic space. We evaluate the proposed model on two challenging datasets and compare to different baselines that operate under Euclidean assumptions. Our hyperbolic model infers the latent hierarchy from the class distribution, captures implicit hyponymic relations in the inventory, and shows performance on par with state-of-the-art methods on fine-grained classification with remarkable reduction of the parameter size. A thorough analysis sheds light on the impact of each component in the final prediction and showcases its ease of integration with Euclidean layers. + 2020.findings-emnlp.42 + + + Claim Check-Worthiness Detection as Positive Unlabelled Learning + DustinWright + IsabelleAugenstein + 476–488 + As the first step of automatic fact checking, claim check-worthiness detection is a critical component of fact checking systems. There are multiple lines of research which study this problem: check-worthiness ranking from political speeches and debates, rumour detection on Twitter, and citation needed detection from Wikipedia. To date, there has been no structured comparison of these various tasks to understand their relatedness, and no investigation into whether or not a unified approach to all of them is achievable. 
In this work, we illuminate a central challenge in claim check-worthiness detection underlying all of these tasks, being that they hinge upon detecting both how factual a sentence is, as well as how likely a sentence is to be believed without verification. As such, annotators only mark those instances they judge to be clear-cut check-worthy. Our best performing method is a unified approach which automatically corrects for this using a variant of positive unlabelled learning that finds instances which were incorrectly labelled as not check-worthy. In applying this, we out-perform the state of the art in two of the three tasks studied for claim check-worthiness detection in English. + 2020.findings-emnlp.43 + + + <fixed-case>C</fixed-case>oncept<fixed-case>B</fixed-case>ert: Concept-Aware Representation for Visual Question Answering + FrançoisGardères + MaryamZiaeefard + BaptisteAbeloos + FreddyLecue + 489–498 + Visual Question Answering (VQA) is a challenging task that has received increasing attention from both the computer vision and the natural language processing communities. A VQA model combines visual and textual features in order to answer questions grounded in an image. Current works in VQA focus on questions which are answerable by direct analysis of the question and image alone. We present a concept-aware algorithm, ConceptBert, for questions which require common sense, or basic factual knowledge from external structured content. Given an image and a question in natural language, ConceptBert requires visual elements of the image and a Knowledge Graph (KG) to infer the correct answer. We introduce a multi-modal representation which learns a joint Concept-Vision-Language embedding inspired by the popular BERT architecture. We exploit ConceptNet KG for encoding the common sense knowledge and evaluate our methodology on the Outside Knowledge-VQA (OK-VQA) and VQA datasets. 
+ 2020.findings-emnlp.44 + + + Bootstrapping a Crosslingual Semantic Parser + TomSherborne + YumoXu + MirellaLapata + 499–517 + Recent progress in semantic parsing scarcely considers languages other than English but professional translation can be prohibitively expensive. We adapt a semantic parser trained on a single language, such as English, to new languages and multiple domains with minimal annotation. We query if machine translation is an adequate substitute for training data, and extend this to investigate bootstrapping using joint training with English, paraphrasing, and multilingual pre-trained models. We develop a Transformer-based parser combining paraphrases by ensembling attention over multiple encoders and present new versions of ATIS and Overnight in German and Chinese for evaluation. Experimental results indicate that MT can approximate training data in a new language for accurate parsing when augmented with paraphrasing through multiple MT engines. Considering when MT is inadequate, we also find that using our approach achieves parsing accuracy within 2% of complete translation using only 50% of training data. + 2020.findings-emnlp.45 + + + Revisiting Representation Degeneration Problem in Language Modeling + ZhongZhang + ChongmingGao + CongXu + RuiMiao + QinliYang + JunmingShao + 518–527 + Weight tying is now a common setting in many language generation tasks such as language modeling and machine translation. However, a recent study reveals that there is a potential flaw in weight tying. They find that the learned word embeddings are likely to degenerate and lie in a narrow cone when training a language model. They call it the representation degeneration problem and propose a cosine regularization to solve it. Nevertheless, we prove that the cosine regularization is insufficient to solve the problem, as the degeneration is still likely to happen under certain conditions. 
In this paper, we revisit the representation degeneration problem and theoretically analyze the limitations of the previously proposed solution. Afterward, we propose an alternative regularization method called Laplacian regularization to tackle the problem. Experiments on language modeling demonstrate the effectiveness of the proposed Laplacian regularization. + 2020.findings-emnlp.46 + + + The workweek is the best time to start a family – A Study of <fixed-case>GPT</fixed-case>-2 Based Claim Generation + ShaiGretz + YonatanBilu + EdoCohen-Karlik + NoamSlonim + 528–544 + Argument generation is a challenging task whose research is timely considering its potential impact on social media and the dissemination of information. Here we suggest a pipeline based on GPT-2 for generating coherent claims, and explore the types of claims that it produces, and their veracity, using an array of manual and automatic assessments. In addition, we explore the interplay between this task and the task of Claim Retrieval, showing how they can complement one another. + 2020.findings-emnlp.47 + + + Dynamic Data Selection for Curriculum Learning via Ability Estimation + John P.Lalor + HongYu + 545–555 + Curriculum learning methods typically rely on heuristics to estimate the difficulty of training examples or the ability of the model. In this work, we propose replacing difficulty heuristics with learned difficulty parameters. We also propose Dynamic Data selection for Curriculum Learning via Ability Estimation (DDaCLAE), a strategy that probes model ability at each training epoch to select the best training examples at that point. We show that models using learned difficulty and/or ability outperform heuristic-based curriculum learning models on the GLUE classification tasks. 
+ 2020.findings-emnlp.48 + + + Fixed Encoder Self-Attention Patterns in Transformer-Based Machine Translation + AlessandroRaganato + YvesScherrer + JörgTiedemann + 556–568 + Transformer-based models have brought a radical change to neural machine translation. A key feature of the Transformer architecture is the so-called multi-head attention mechanism, which allows the model to focus simultaneously on different parts of the input. However, recent works have shown that most attention heads learn simple, and often redundant, positional patterns. In this paper, we propose to replace all but one attention head of each encoder layer with simple fixed – non-learnable – attentive patterns that are solely based on position and do not require any external knowledge. Our experiments with different data sizes and multiple language pairs show that fixing the attention heads on the encoder side of the Transformer at training time does not impact the translation quality and even increases BLEU scores by up to 3 points in low-resource scenarios. + 2020.findings-emnlp.49 + + + <fixed-case>ZEST</fixed-case>: Zero-shot Learning from Text Descriptions using Textual Similarity and Visual Summarization + TzufPaz-Argaman + ReutTsarfaty + GalChechik + YuvalAtzmon + 569–579 + We study the problem of recognizing visual entities from the textual descriptions of their classes. Specifically, given birds’ images with free-text descriptions of their species, we learn to classify images of previously-unseen species based on specie descriptions. This setup has been studied in the vision community under the name zero-shot learning from text, focusing on learning to transfer knowledge about visual aspects of birds from seen classes to previously-unseen ones. Here, we suggest focusing on the textual description and distilling from the description the most relevant information to effectively match visual features to the parts of the text that discuss them. 
Specifically, (1) we propose to leverage the similarity between species, reflected in the similarity between text descriptions of the species. (2) we derive visual summaries of the texts, i.e., extractive summaries that focus on the visual features that tend to be reflected in images. We propose a simple attention-based model augmented with the similarity and visual summaries components. Our empirical results consistently and significantly outperform the state-of-the-art on the largest benchmarks for text-based zero-shot learning, illustrating the critical importance of texts for zero-shot image-recognition. + 2020.findings-emnlp.50 + + + Few-Shot Multi-Hop Relation Reasoning over Knowledge Bases + ChuxuZhang + LuYu + MandanaSaebi + MengJiang + NiteshChawla + 580–585 + Multi-hop relation reasoning over knowledge base is to generate effective and interpretable relation prediction through reasoning paths. The current methods usually require sufficient training data (fact triples) for each query relation, impairing their performances over few-shot relations (with limited triples) which are common in knowledge base. To this end, we propose FIRE, a novel few-shot multi-hop relation learning model. FIRE applies reinforcement learning to model the sequential steps of multi-hop reasoning, besides performs heterogeneous structure encoding and knowledge-aware search space pruning. The meta-learning technique is employed to optimize model parameters that could quickly adapt to few-shot relations. Empirical study on two datasets demonstrate that FIRE outperforms state-of-the-art methods. + 2020.findings-emnlp.51 + + + Sentiment Analysis with Weighted Graph Convolutional Networks + FanyuMeng + JunlanFeng + DanpingYin + SiChen + MinHu + 586–595 + Syntactic information is essential for both sentiment analysis (SA) and aspect-based sentiment analysis (ABSA).
Previous work has already achieved great progress utilizing Graph Convolutional Network (GCN) over dependency tree of a sentence. However, these models do not fully exploit the syntactic information obtained from dependency parsing such as the diversified types of dependency relations. The message passing process of GCN should be distinguished based on these syntactic information. To tackle this problem, we design a novel weighted graph convolutional network (WGCN) which can exploit rich syntactic information based on the feature combination. Furthermore, we utilize BERT instead of Bi-LSTM to generate contextualized representations as inputs for GCN and present an alignment method to keep word-level dependencies consistent with wordpiece unit of BERT. With our proposal, we are able to improve the state-of-the-art on four ABSA tasks out of six and two SA tasks out of three. + 2020.findings-emnlp.52 + + + <fixed-case>PB</fixed-case>o<fixed-case>S</fixed-case>: Probabilistic Bag-of-Subwords for Generalizing Word Embedding + ZhaoJinman + ShawnZhong + XiaominZhang + YingyuLiang + 596–611 + We look into the task of generalizing word embeddings: given a set of pre-trained word vectors over a finite vocabulary, the goal is to predict embedding vectors for out-of-vocabulary words, without extra contextual information. We rely solely on the spellings of words and propose a model, along with an efficient algorithm, that simultaneously models subword segmentation and computes subword-based compositional word embedding. We call the model probabilistic bag-of-subwords (PBoS), as it applies bag-of-subwords for all possible segmentations based on their likelihood. Inspections and affix prediction experiment show that PBoS is able to produce meaningful subword segmentations and subword rankings without any source of explicit morphological knowledge.
Word similarity and POS tagging experiments show clear advantages of PBoS over previous subword-level models in the quality of generated word embeddings across languages. + 2020.findings-emnlp.53 + + + Interpretable Entity Representations through Large-Scale Typing + YasumasaOnoe + GregDurrett + 612–624 + In standard methodology for natural language processing, entities in text are typically embedded in dense vector spaces with pre-trained models. The embeddings produced this way are effective when fed into downstream models, but they require end-task fine-tuning and are fundamentally difficult to interpret. In this paper, we present an approach to creating entity representations that are human readable and achieve high performance on entity-related tasks out of the box. Our representations are vectors whose values correspond to posterior probabilities over fine-grained entity types, indicating the confidence of a typing model’s decision that the entity belongs to the corresponding type. We obtain these representations using a fine-grained entity typing model, trained either on supervised ultra-fine entity typing data (Choi et al. 2018) or distantly-supervised examples from Wikipedia. On entity probing tasks involving recognizing entity identity, our embeddings used in parameter-free downstream models achieve competitive performance with ELMo- and BERT-based embeddings in trained models. We also show that it is possible to reduce the size of our type set in a learning-based way for particular domains. Finally, we show that these embeddings can be post-hoc modified through a small number of rules to incorporate domain knowledge and improve performance. 
+ 2020.findings-emnlp.54 + + + Empirical Studies of Institutional Federated Learning For Natural Language Processing + XinghuaZhu + JianzongWang + ZhenhouHong + JingXiao + 625–634 + Federated learning has sparkled new interests in the deep learning society to make use of isolated data sources from independent institutes. With the development of novel training tools, we have successfully deployed federated natural language processing networks on GPU-enabled server clusters. This paper demonstrates federated training of a popular NLP model, TextCNN, with applications in sentence intent classification. Furthermore, differential privacy is introduced to protect participants in the training process, in a manageable manner. Distinguished from previous client-level privacy protection schemes, the proposed differentially private federated learning procedure is defined in the dataset sample level, inherent with the applications among institutions instead of individual users. Optimal settings of hyper-parameters for the federated TextCNN model are studied through comprehensive experiments. We also evaluated the performance of federated TextCNN model under imbalanced data load configuration. Experiments show that, the sampling ratio has a large impact on the performance of the FL models, causing up to 38.4% decrease in the test accuracy, while they are robust to different noise multiplier levels, with less than 3% variance in the test accuracy. It is also found that the FL models are sensitive to data load balancedness among client datasets. When the data load is imbalanced, model performance dropped by up to 10%. 
+ 2020.findings-emnlp.55 + + + <fixed-case>N</fixed-case>eu<fixed-case>R</fixed-case>educe: Reducing Mixed <fixed-case>B</fixed-case>oolean-Arithmetic Expressions by Recurrent Neural Network + WeijieFeng + BinbinLiu + DongpengXu + QilongZheng + YunXu + 635–644 + Mixed Boolean-Arithmetic (MBA) expressions involve both arithmetic calculation (e.g., plus, minus, multiply) and bitwise computation (e.g., and, or, negate, xor). MBA expressions have been widely applied in software obfuscation, transforming programs from a simple form to a complex form. MBA expressions are challenging to be simplified, because the interleaving bitwise and arithmetic operations cause mathematical reduction laws to be ineffective. Our goal is to recover the original, simple form from an obfuscated MBA expression. In this paper, we first propose NeuReduce, a string to string method based on neural networks to automatically learn and reduce complex MBA expressions. We develop a comprehensive MBA dataset, including one million diversified MBA expression samples and corresponding simplified forms. After training on the dataset, NeuReduce can reduce MBA rules to homelier but mathematically equivalent forms. By comparing with three state-of-the-art MBA reduction methods, our evaluation result shows that NeuReduce outperforms all other tools in terms of accuracy, solving time, and performance overhead. + 2020.findings-emnlp.56 + + + From Language to Language-ish: How Brain-Like is an <fixed-case>LSTM</fixed-case>’s Representation of Atypical Language Stimuli? + MaryamHashemzadeh + GretaKaufeld + MarthaWhite + Andrea E.Martin + AlonaFyshe + 645–656 + The representations generated by many models of language (word embeddings, recurrent neural networks and transformers) correlate to brain activity recorded while people read. However, these decoding results are usually based on the brain’s reaction to syntactically and semantically sound language stimuli.
In this study, we asked: how does an LSTM (long short term memory) language model, trained (by and large) on semantically and syntactically intact language, represent a language sample with degraded semantic or syntactic information? Does the LSTM representation still resemble the brain’s reaction? We found that, even for some kinds of nonsensical language, there is a statistically significant relationship between the brain’s activity and the representations of an LSTM. This indicates that, at least in some instances, LSTMs and the human brain handle nonsensical data similarly. + 2020.findings-emnlp.57 + + + Revisiting Pre-Trained Models for <fixed-case>C</fixed-case>hinese Natural Language Processing + YimingCui + WanxiangChe + TingLiu + BingQin + ShijinWang + GuopingHu + 657–668 + Bidirectional Encoder Representations from Transformers (BERT) has shown marvelous improvements across various NLP tasks, and consecutive variants have been proposed to further improve the performance of the pre-trained language models. In this paper, we target on revisiting Chinese pre-trained language models to examine their effectiveness in a non-English language and release the Chinese pre-trained language model series to the community. We also propose a simple but effective model called MacBERT, which improves upon RoBERTa in several ways, especially the masking strategy that adopts MLM as correction (Mac). We carried out extensive experiments on eight Chinese NLP tasks to revisit the existing pre-trained language models as well as the proposed MacBERT. Experimental results show that MacBERT could achieve state-of-the-art performances on many NLP tasks, and we also ablate details with several findings that may help future research. 
https://github.com/ymcui/MacBERT + 2020.findings-emnlp.58 + + + Cascaded Semantic and Positional Self-Attention Network for Document Classification + JuyongJiang + JieZhang + KaiZhang + 669–677 + Transformers have shown great success in learning representations for language modelling. However, an open challenge still remains on how to systematically aggregate semantic information (word embedding) with positional (or temporal) information (word orders). In this work, we propose a new architecture to aggregate the two sources of information using cascaded semantic and positional self-attention network (CSPAN) in the context of document classification. The CSPAN uses a semantic self-attention layer cascaded with Bi-LSTM to process the semantic and positional information in a sequential manner, and then adaptively combine them together through a residue connection. Compared with commonly used positional encoding schemes, CSPAN can exploit the interaction between semantics and word positions in a more interpretable and adaptive manner, and the classification performance can be notably improved while simultaneously preserving a compact model size and high convergence rate. We evaluate the CSPAN model on several benchmark data sets for document classification with careful ablation studies, and demonstrate the encouraging results compared with state of the art. + 2020.findings-emnlp.59 + + + Toward Recognizing More Entity Types in <fixed-case>NER</fixed-case>: An Efficient Implementation using Only Entity Lexicons + MinlongPeng + RuotianMa + QiZhang + LujunZhao + MengxiWei + ChanglongSun + XuanjingHuang + 678–688 + In this work, we explore the way to quickly adjust an existing named entity recognition (NER) system to make it capable of recognizing entity types not defined in the system. As an illustrative example, consider the case that a NER system has been built to recognize person and organization names, and now it requires to additionally recognize job titles. 
Such a situation is common in the industrial areas, where the entity types required to recognize vary a lot in different products and keep changing. To avoid laborious data labeling and achieve fast adaptation, we propose to adjust the existing NER system using the previously labeled data and entity lexicons of the newly introduced entity types. We formulate such a task as a partially supervised learning problem and accordingly propose an effective algorithm to solve the problem. Comprehensive experimental studies on several public NER datasets validate the effectiveness of our method. + 2020.findings-emnlp.60 + + + From Disjoint Sets to Parallel Data to Train <fixed-case>S</fixed-case>eq2<fixed-case>S</fixed-case>eq Models for Sentiment Transfer + PauloCavalin + MarisaVasconcelos + MarceloGrave + ClaudioPinhanez + Victor HenriqueAlves Ribeiro + 689–698 + We present a method for creating parallel data to train Seq2Seq neural networks for sentiment transfer. Most systems for this task, which can be viewed as monolingual machine translation (MT), have relied on unsupervised methods, such as Generative Adversarial Networks (GANs)-inspired approaches, for coping with the lack of parallel corpora. Given that the literature shows that Seq2Seq methods have been consistently outperforming unsupervised methods in MT-related tasks, in this work we exploit the use of semantic similarity computation for converting non-parallel data onto a parallel corpus. That allows us to train a transformer neural network for the sentiment transfer task, and compare its performance against unsupervised approaches. With experiments conducted on two well-known public datasets, i.e. Yelp and Amazon, we demonstrate that the proposed methodology outperforms existing unsupervised methods very consistently in fluency, and presents competitive results in terms of sentiment conversion and content preservation. 
We believe that this work opens up an opportunity for seq2seq neural networks to be better exploited in problems for which they have not been applied owing to the lack of parallel training data. + 2020.findings-emnlp.61 + + + Learning to Stop: A Simple yet Effective Approach to Urban Vision-Language Navigation + JiannanXiang + XinWang + William YangWang + 699–707 + Vision-and-Language Navigation (VLN) is a natural language grounding task where an agent learns to follow language instructions and navigate to specified destinations in real-world environments. A key challenge is to recognize and stop at the correct location, especially for complicated outdoor environments. Existing methods treat the STOP action equally as other actions, which results in undesirable behaviors that the agent often fails to stop at the destination even though it might be on the right path. Therefore, we propose Learning to Stop (L2Stop), a simple yet effective policy module that differentiates STOP and other actions. Our approach achieves the new state of the art on a challenging urban VLN dataset Touchdown, outperforming the baseline by 6.89% (absolute improvement) on Success weighted by Edit Distance (SED). + 2020.findings-emnlp.62 + 2020.findings-emnlp.62.OptionalSupplementaryMaterial.zip + + + Document Ranking with a Pretrained Sequence-to-Sequence Model + RodrigoNogueira + ZhiyingJiang + RonakPradeep + JimmyLin + 708–718 + This work proposes the use of a pretrained sequence-to-sequence model for document ranking. Our approach is fundamentally different from a commonly adopted classification-based formulation based on encoder-only pretrained transformer architectures such as BERT. We show how a sequence-to-sequence model can be trained to generate relevance labels as “target tokens”, and how the underlying logits of these target tokens can be interpreted as relevance probabilities for ranking.
Experimental results on the MS MARCO passage ranking task show that our ranking approach is superior to strong encoder-only models. On three other document retrieval test collections, we demonstrate a zero-shot transfer-based approach that outperforms previous state-of-the-art models requiring in-domain cross-validation. Furthermore, we find that our approach significantly outperforms an encoder-only architecture in a data-poor setting. We investigate this observation in more detail by varying target tokens to probe the model’s use of latent knowledge. Surprisingly, we find that the choice of target tokens impacts effectiveness, even for words that are closely related semantically. This finding sheds some light on why our sequence-to-sequence formulation for document ranking is effective. Code and models are available at pygaggle.ai. + 2020.findings-emnlp.63 + + + Pruning Redundant Mappings in Transformer Models via Spectral-Normalized Identity Prior + ZiLin + JeremiahLiu + ZiYang + NanHua + DanRoth + 719–730 + Traditional (unstructured) pruning methods for a Transformer model focus on regularizing the individual weights by penalizing them toward zero. In this work, we explore spectral-normalized identity priors (SNIP), a structured pruning approach which penalizes an entire residual module in a Transformer model toward an identity mapping. Our method identifies and discards unimportant non-linear mappings in the residual connections by applying a thresholding operator on the function norm, and is applicable to any structured module including a single attention head, an entire attention blocks, or a feed-forward subnetwork. Furthermore, we introduce spectral normalization to stabilize the distribution of the post-activation values of the Transformer layers, further improving the pruning effectiveness of the proposed methodology. 
We conduct experiments with BERT on 5 GLUE benchmark tasks to demonstrate that SNIP achieves effective pruning results while maintaining comparable performance. Specifically, we improve the performance over the state-of-the-art by 0.5 to 1.0% on average at 50% compression ratio. + 2020.findings-emnlp.64 + + + Rethinking Self-Attention: Towards Interpretability in Neural Parsing + KhalilMrini + FranckDernoncourt + Quan HungTran + TrungBui + WalterChang + NdapaNakashole + 731–742 + Attention mechanisms have improved the performance of NLP tasks while allowing models to remain explainable. Self-attention is currently widely used, however interpretability is difficult due to the numerous attention distributions. Recent work has shown that model representations can benefit from label-specific information, while facilitating interpretation of predictions. We introduce the Label Attention Layer: a new form of self-attention where attention heads represent labels. We test our novel layer by running constituency and dependency parsing experiments and show our new model obtains new state-of-the-art results for both tasks on both the Penn Treebank (PTB) and Chinese Treebank. Additionally, our model requires fewer self-attention layers compared to existing work. Finally, we find that the Label Attention heads learn relations between syntactic categories and show pathways to analyze errors. + 2020.findings-emnlp.65 + + + <fixed-case>P</fixed-case>olicy<fixed-case>QA</fixed-case>: A Reading Comprehension Dataset for Privacy Policies + WasiAhmad + JianfengChi + YuanTian + Kai-WeiChang + 743–749 + Privacy policy documents are long and verbose. A question answering (QA) system can assist users in finding the information that is relevant and important to them. Prior studies in this domain frame the QA task as retrieving the most relevant text segment or a list of sentences from the policy document given a question. 
On the contrary, we argue that providing users with a short text span from policy documents reduces the burden of searching the target information from a lengthy text segment. In this paper, we present PolicyQA, a dataset that contains 25,017 reading comprehension style examples curated from an existing corpus of 115 website privacy policies. PolicyQA provides 714 human-annotated questions written for a wide range of privacy practices. We evaluate two existing neural QA models and perform rigorous analysis to reveal the advantages and challenges offered by PolicyQA. + 2020.findings-emnlp.66 + + + A Linguistic Analysis of Visually Grounded Dialogues Based on Spatial Expressions + TakumaUdagawa + TakatoYamazaki + AkikoAizawa + 750–765 + Recent models achieve promising results in visually grounded dialogues. However, existing datasets often contain undesirable biases and lack sophisticated linguistic analyses, which make it difficult to understand how well current models recognize their precise linguistic structures. To address this problem, we make two design choices: first, we focus on OneCommon Corpus (CITATION), a simple yet challenging common grounding dataset which contains minimal bias by design. Second, we analyze their linguistic structures based on spatial expressions and provide comprehensive and reliable annotation for 600 dialogues. We show that our annotation captures important linguistic structures including predicate-argument structure, modification and ellipsis. In our experiments, we assess the model’s understanding of these structures through reference resolution. We demonstrate that our annotation can reveal both the strengths and weaknesses of baseline models in essential levels of detail. Overall, we propose a novel framework and resource for investigating fine-grained language understanding in visually grounded dialogues. 
+ 2020.findings-emnlp.67 + + + Efficient Context and Schema Fusion Networks for Multi-Domain Dialogue State Tracking + SuZhu + JieyuLi + LuChen + KaiYu + 766–781 + Dialogue state tracking (DST) aims at estimating the current dialogue state given all the preceding conversation. For multi-domain DST, the data sparsity problem is a major obstacle due to increased numbers of state candidates and dialogue lengths. To encode the dialogue context efficiently, we utilize the previous dialogue state (predicted) and the current dialogue utterance as the input for DST. To consider relations among different domain-slots, the schema graph involving prior knowledge is exploited. In this paper, a novel context and schema fusion network is proposed to encode the dialogue context and schema graph by using internal and external attention mechanisms. Experiment results show that our approach can outperform strong baselines, and the previous state-of-the-art method (SOM-DST) can also be improved by our proposed schema graph. + 2020.findings-emnlp.68 + + + Syntactic and Semantic-driven Learning for Open Information Extraction + JialongTang + YaojieLu + HongyuLin + XianpeiHan + LeSun + XinyanXiao + HuaWu + 782–792 + One of the biggest bottlenecks in building accurate, high coverage neural open IE systems is the need for large labelled corpora. The diversity of open domain corpora and the variety of natural language expressions further exacerbate this problem. In this paper, we propose a syntactic and semantic-driven learning approach, which can learn neural open IE models without any human-labelled data by leveraging syntactic and semantic knowledge as noisier, higher-level supervision. Specifically, we first employ syntactic patterns as data labelling functions and pretrain a base model using the generated labels. Then we propose a syntactic and semantic-driven reinforcement learning algorithm, which can effectively generalize the base model to open situations with high accuracy. 
Experimental results show that our approach significantly outperforms the supervised counterparts, and can even achieve competitive performance to supervised state-of-the-art (SoA) model. + 2020.findings-emnlp.69 + 2020.findings-emnlp.69.OptionalSupplementaryMaterial.zip + + + Group-wise Contrastive Learning for Neural Dialogue Generation + HengyiCai + HongshenChen + YonghaoSong + ZhuoyeDing + YongjunBao + WeipengYan + XiaofangZhao + 793–802 + Neural dialogue response generation has gained much popularity in recent years. Maximum Likelihood Estimation (MLE) objective is widely adopted in existing dialogue model learning. However, models trained with MLE objective function are plagued by the low-diversity issue when it comes to the open-domain conversational setting. Inspired by the observation that humans not only learn from the positive signals but also benefit from correcting behaviors of undesirable actions, in this work, we introduce contrastive learning into dialogue generation, where the model explicitly perceives the difference between the well-chosen positive and negative utterances. Specifically, we employ a pretrained baseline model as a reference. During contrastive learning, the target dialogue model is trained to give higher conditional probabilities for the positive samples, and lower conditional probabilities for those negative samples, compared to the reference model. To manage the multi-mapping relations prevalent in human conversation, we augment contrastive dialogue learning with group-wise dual sampling. Extensive experimental results show that the proposed group-wise contrastive learning framework is suited for training a wide range of neural dialogue generation models with very favorable performance over the baseline training approaches. 
+ 2020.findings-emnlp.70 + + + <fixed-case>E</fixed-case>-<fixed-case>BERT</fixed-case>: Efficient-Yet-Effective Entity Embeddings for <fixed-case>BERT</fixed-case> + NinaPoerner + UlliWaltinger + HinrichSchütze + 803–818 + We present a novel way of injecting factual knowledge about entities into the pretrained BERT model (Devlin et al., 2019): We align Wikipedia2Vec entity vectors (Yamada et al., 2016) with BERT’s native wordpiece vector space and use the aligned entity vectors as if they were wordpiece vectors. The resulting entity-enhanced version of BERT (called E-BERT) is similar in spirit to ERNIE (Zhang et al., 2019) and KnowBert (Peters et al., 2019), but it requires no expensive further pre-training of the BERT encoder. We evaluate E-BERT on unsupervised question answering (QA), supervised relation classification (RC) and entity linking (EL). On all three tasks, E-BERT outperforms BERT and other baselines. We also show quantitatively that the original BERT model is overly reliant on the surface form of entity names (e.g., guessing that someone with an Italian-sounding name speaks Italian), and that E-BERT mitigates this problem. + 2020.findings-emnlp.71 + 2020.findings-emnlp.71.OptionalSupplementaryMaterial.pdf + + + A Multi-task Learning Framework for Opinion Triplet Extraction + ChenZhang + QiuchiLi + DaweiSong + BenyouWang + 819–828 + The state-of-the-art Aspect-based Sentiment Analysis (ABSA) approaches are mainly based on either detecting aspect terms and their corresponding sentiment polarities, or co-extracting aspect and opinion terms. However, the extraction of aspect-sentiment pairs lacks opinion terms as a reference, while co-extraction of aspect and opinion terms would not lead to meaningful pairs without determining their sentiment dependencies. 
To address the issue, we present a novel view of ABSA as an opinion triplet extraction task, and propose a multi-task learning framework to jointly extract aspect terms and opinion terms, and simultaneously parse sentiment dependencies between them with a biaffine scorer. At inference phase, the extraction of triplets is facilitated by a triplet decoding method based on the above outputs. We evaluate the proposed framework on four SemEval benchmarks for ABSA. The results demonstrate that our approach significantly outperforms a range of strong baselines and state-of-the-art approaches. + 2020.findings-emnlp.72 + + + Event Extraction as Multi-turn Question Answering + FayuanLi + WeihuaPeng + YuguangChen + QuanWang + LuPan + YajuanLyu + YongZhu + 829–838 + Event extraction, which aims to identify event triggers of pre-defined event types and their arguments of specific roles, is a challenging task in NLP. Most traditional approaches formulate this task as classification problems, with event types or argument roles taken as golden labels. Such approaches fail to model rich interactions among event types and arguments of different roles, and cannot generalize to new types or roles. This work proposes a new paradigm that formulates event extraction as multi-turn question answering. Our approach, MQAEE, casts the extraction task into a series of reading comprehension problems, by which it extracts triggers and arguments successively from a given sentence. A history answer embedding strategy is further adopted to model question answering history in the multi-turn process. By this new formulation, MQAEE makes full use of dependency among arguments and event types, and generalizes well to new types with new argument roles. Empirical results on ACE 2005 show that MQAEE outperforms current state-of-the-art, pushing the final F1 of argument extraction to 53.4% (+2.0%).
And it also has a good generalization ability, achieving competitive performance on 13 new event types even if trained only with a few samples of them. + 2020.findings-emnlp.73 + + + Improving <fixed-case>QA</fixed-case> Generalization by Concurrent Modeling of Multiple Biases + MingzhuWu + Nafise SadatMoosavi + AndreasRücklé + IrynaGurevych + 839–853 + Existing NLP datasets contain various biases that models can easily exploit to achieve high performances on the corresponding evaluation sets. However, focusing on dataset-specific biases limits their ability to learn more generalizable knowledge about the task from more general data patterns. In this paper, we investigate the impact of debiasing methods for improving generalization and propose a general framework for improving the performance on both in-domain and out-of-domain datasets by concurrent modeling of multiple biases in the training data. Our framework weights each example based on the biases it contains and the strength of those biases in the training data. It then uses these weights in the training objective so that the model relies less on examples with high bias weights. We extensively evaluate our framework on extractive question answering with training data from various domains with multiple biases of different strengths. We perform the evaluations in two different settings, in which the model is trained on a single domain or multiple domains simultaneously, and show its effectiveness in both settings compared to state-of-the-art debiasing methods. + 2020.findings-emnlp.74 + + + Actor-Double-Critic: Incorporating Model-Based Critic for Task-Oriented Dialogue Systems + Yen-chenWu + Bo-HsiangTseng + MilicaGasic + 854–863 + In order to improve the sample-efficiency of deep reinforcement learning (DRL), we implemented imagination augmented agent (I2A) in spoken dialogue systems (SDS). 
Although I2A achieves a higher success rate than baselines by augmenting predicted future into a policy network, its complicated architecture introduces unwanted instability. In this work, we propose actor-double-critic (ADC) to improve the stability and overall performance of I2A. ADC simplifies the architecture of I2A to reduce excessive parameters and hyper-parameters. More importantly, a separate model-based critic shares parameters between actions and makes back-propagation explicit. In our experiments on Cambridge Restaurant Booking task, ADC enhances success rates considerably and shows robustness to imperfect environment models. In addition, ADC exhibits the stability and sample-efficiency as significantly reducing the baseline standard deviation of success rates and reaching the 80% success rate with half training data. + 2020.findings-emnlp.75 + + + Controlled Hallucinations: Learning to Generate Faithfully from Noisy Data + KatjaFilippova + 864–870 + Neural text generation (data- or text-to-text) demonstrates remarkable performance when training data is abundant which for many applications is not the case. To collect a large corpus of parallel data, heuristic rules are often used but they inevitably let noise into the data, such as phrases in the output which cannot be explained by the input. Consequently, models pick up on the noise and may hallucinate–generate fluent but unsupported text. Our contribution is a simple but powerful technique to treat such hallucinations as a controllable aspect of the generated text, without dismissing any input and without modifying the model architecture. On the WikiBio corpus (Lebret et al., 2016), a particularly noisy dataset, we demonstrate the efficacy of the technique both in an automatic and in a human evaluation. 
+ 2020.findings-emnlp.76 + + + Sequential Span Classification with Neural Semi-<fixed-case>M</fixed-case>arkov <fixed-case>CRF</fixed-case>s for Biomedical Abstracts + KosukeYamada + TsutomuHirao + RyoheiSasano + KoichiTakeda + MasaakiNagata + 871–877 + Dividing biomedical abstracts into several segments with rhetorical roles is essential for supporting researchers’ information access in the biomedical domain. Conventional methods have regarded the task as a sequence labeling task based on sequential sentence classification, i.e., they assign a rhetorical label to each sentence by considering the context in the abstract. However, these methods have a critical problem: they are prone to mislabel longer continuous sentences with the same rhetorical label. To tackle the problem, we propose sequential span classification that assigns a rhetorical label, not to a single sentence but to a span that consists of continuous sentences. Accordingly, we introduce Neural Semi-Markov Conditional Random Fields to assign the labels to such spans by considering all possible spans of various lengths. Experimental results obtained from PubMed 20k RCT and NICTA-PIBOSO datasets demonstrate that our proposed method achieved the best micro sentence-F1 score as well as the best micro span-F1 score. + 2020.findings-emnlp.77 + 2020.findings-emnlp.77.OptionalSupplementaryMaterial.zip + + + Where to Submit? Helping Researchers to Choose the Right Venue + KonstantinKobs + TobiasKoopmann + AlbinZehe + DavidFernes + PhilippKrop + AndreasHotho + 878–883 + Whenever researchers write a paper, the same question occurs: “Where to submit?” In this work, we introduce WTS, an open and interpretable NLP system that recommends conferences and journals to researchers based on the title, abstract, and/or keywords of a given paper. 
We adapt the TextCNN architecture and automatically analyze its predictions using the Integrated Gradients method to highlight words and phrases that led to the recommendation of a scientific venue. We train and test our method on publications from the fields of artificial intelligence (AI) and medicine, both derived from the Semantic Scholar dataset. WTS achieves an Accuracy@5 of approximately 83% for AI papers and 95% in the field of medicine. It is open source and available for testing on https://wheretosubmit.ml. + 2020.findings-emnlp.78 + + + <fixed-case>A</fixed-case>ir<fixed-case>C</fixed-case>oncierge: Generating Task-Oriented Dialogue via Efficient Large-Scale Knowledge Retrieval + Chieh-YangChen + Pei-HsinWang + Shih-ChiehChang + Da-ChengJuan + WeiWei + Jia-YuPan + 884–897 + Despite recent success in neural task-oriented dialogue systems, developing such a real-world system involves accessing large-scale knowledge bases (KBs), which cannot be simply encoded by neural approaches, such as memory network mechanisms. To alleviate the above problem, we propose AirConcierge, an end-to-end trainable text-to-SQL guided framework to learn a neural agent that interacts with KBs using the generated SQL queries. Specifically, the neural agent first learns to ask and confirm the customer’s intent during the multi-turn interactions, then dynamically determines when to ground the user constraints into executable SQL queries so as to fetch relevant information from KBs. With the help of our method, the agent can use less but more accurate fetched results to generate useful responses efficiently, instead of incorporating the entire KBs. We evaluate the proposed method on the AirDialogue dataset, a large corpus released by Google, containing the conversations of customers booking flight tickets from the agent. 
The experimental results show that AirConcierge significantly improves over previous work in terms of accuracy and the BLEU score, which demonstrates not only the ability to achieve the given task but also the good quality of the generated dialogues. + 2020.findings-emnlp.79 + 2020.findings-emnlp.79.OptionalSupplementaryMaterial.zip + + + <fixed-case>D</fixed-case>oc<fixed-case>S</fixed-case>truct: A Multimodal Method to Extract Hierarchy Structure in Document for General Form Understanding + ZilongWang + MingjieZhan + XueboLiu + DingLiang + 898–908 + Form understanding depends on both textual contents and organizational structure. Although modern OCR performs well, it is still challenging to realize general form understanding because forms are commonly used and of various formats. The table detection and handcrafted features in previous works cannot apply to all forms because of their requirements on formats. Therefore, we concentrate on the most elementary components, the key-value pairs, and adopt multimodal methods to extract features. We consider the form structure as a tree-like or graph-like hierarchy of text fragments. The parent-child relation corresponds to the key-value pairs in forms. We utilize the state-of-the-art models and design targeted extraction modules to extract multimodal features from semantic contents, layout information, and visual images. A hybrid fusion method of concatenation and feature shifting is designed to fuse the heterogeneous features and provide an informative joint representation. We adopt an asymmetric algorithm and negative sampling in our model as well. We validate our method on two benchmarks, MedForm and FUNSD, and extensive experiments demonstrate the effectiveness of our method. 
+ 2020.findings-emnlp.80 + + + Pretrained Language Models for Dialogue Generation with Multiple Input Sources + YuCao + WeiBi + MengFang + DachengTao + 909–917 + Large-scale pretrained language models have achieved outstanding performance on natural language understanding tasks. However, it is still under investigating how to apply them to dialogue generation tasks, especially those with responses conditioned on multiple sources. Previous work simply concatenates all input sources or averages information from different input sources. In this work, we study dialogue models with multiple input sources adapted from the pretrained language model GPT2. We explore various methods to fuse multiple separate attention information corresponding to different sources. Our experimental results show that proper fusion methods deliver higher relevance with dialogue history than simple fusion baselines. + 2020.findings-emnlp.81 + + + A Study in Improving <fixed-case>BLEU</fixed-case> Reference Coverage with Diverse Automatic Paraphrasing + RachelBawden + BiaoZhang + LisaYankovskaya + AndreTättar + MattPost + 918–932 + We investigate a long-perceived shortcoming in the typical use of BLEU: its reliance on a single reference. Using modern neural paraphrasing techniques, we study whether automatically generating additional *diverse* references can provide better coverage of the space of valid translations and thereby improve its correlation with human judgments. Our experiments on the into-English language directions of the WMT19 metrics task (at both the system and sentence level) show that using paraphrased references does generally improve BLEU, and when it does, the more diverse the better. However, we also show that better results could be achieved if those paraphrases were to specifically target the parts of the space most relevant to the MT outputs being evaluated. 
Moreover, the gains remain slight even when human paraphrases are used, suggesting inherent limitations to BLEU’s capacity to correctly exploit multiple references. Surprisingly, we also find that adequacy appears to be less important, as shown by the high results of a strong sampling approach, which even beats human paraphrases when used with sentence-level BLEU. + 2020.findings-emnlp.82 + 2020.findings-emnlp.82.OptionalSupplementaryMaterial.txt + + + Cross-lingual Alignment Methods for Multilingual <fixed-case>BERT</fixed-case>: A Comparative Study + SaurabhKulshreshtha + Jose LuisRedondo Garcia + Ching-YunChang + 933–942 + Multilingual BERT (mBERT) has shown reasonable capability for zero-shot cross-lingual transfer when fine-tuned on downstream tasks. Since mBERT is not pre-trained with explicit cross-lingual supervision, transfer performance can further be improved by aligning mBERT with cross-lingual signal. Prior work proposes several approaches to align contextualised embeddings. In this paper we analyse how different forms of cross-lingual supervision and various alignment methods influence the transfer capability of mBERT in zero-shot setting. Specifically, we compare parallel corpora vs dictionary-based supervision and rotational vs fine-tuning based alignment methods. We evaluate the performance of different alignment methodologies across eight languages on two tasks: Named Entity Recognition and Semantic Slot Filling. In addition, we propose a novel normalisation method which consistently improves the performance of rotation-based alignment including a notable 3% F1 improvement for distant and typologically dissimilar languages. Importantly we identify the biases of the alignment methods to the type of task and proximity to the transfer language. We also find that supervision from parallel corpus is generally superior to dictionary alignments. 
+ 2020.findings-emnlp.83 + + + Hybrid Emoji-Based Masked Language Models for Zero-Shot Abusive Language Detection + MicheleCorazza + StefanoMenini + ElenaCabrio + SaraTonelli + SerenaVillata + 943–949 + Recent studies have demonstrated the effectiveness of cross-lingual language model pre-training on different NLP tasks, such as natural language inference and machine translation. In our work, we test this approach on social media data, which are particularly challenging to process within this framework, since the limited length of the textual messages and the irregularity of the language make it harder to learn meaningful encodings. More specifically, we propose a hybrid emoji-based Masked Language Model (MLM) to leverage the common information conveyed by emojis across different languages and improve the learned cross-lingual representation of short text messages, with the goal to perform zero-shot abusive language detection. We compare the results obtained with the original MLM to the ones obtained by our method, showing improved performance on German, Italian and Spanish. + 2020.findings-emnlp.84 + + + <fixed-case>S</fixed-case>e<fixed-case>N</fixed-case>s<fixed-case>ER</fixed-case>: Learning Cross-Building Sensor Metadata Tagger + YangJiao + JiachengLi + JiamanWu + DezhiHong + RajeshGupta + JingboShang + 950–960 + Sensor metadata tagging, akin to the named entity recognition task, provides key contextual information (e.g., measurement type and location) about sensors for running smart building applications. Unfortunately, sensor metadata in different buildings often follows distinct naming conventions. Therefore, learning a tagger currently requires extensive annotations on a per building basis. In this work, we propose a novel framework, SeNsER, which learns a sensor metadata tagger for a new building based on its raw metadata and some existing fully annotated building. 
It leverages the commonality between different buildings: At the character level, it employs bidirectional neural language models to capture the shared underlying patterns between two buildings and thus regularizes the feature learning process; At the word level, it leverages as features the k-mers existing in the fully annotated building. During inference, we further incorporate the information obtained from sources such as Wikipedia as prior knowledge. As a result, SeNsER shows promising results in extensive experiments on multiple real-world buildings. + 2020.findings-emnlp.85 + + + <fixed-case>P</fixed-case>ersian Ezafe Recognition Using Transformers and Its Role in Part-Of-Speech Tagging + EhsanDoostmohammadi + MinooNassajian + AdelRahimi + 961–971 + Ezafe is a grammatical particle in some Iranian languages that links two words together. Regardless of the important information it conveys, it is almost always not indicated in Persian script, resulting in mistakes in reading complex sentences and errors in natural language processing tasks. In this paper, we experiment with different machine learning methods to achieve state-of-the-art results in the task of ezafe recognition. Transformer-based methods, BERT and XLMRoBERTa, achieve the best results, the latter achieving 2.68% F1-score more than the previous state-of-the-art. We, moreover, use ezafe information to improve Persian part-of-speech tagging results and show that such information will not be useful to transformer-based methods and explain why that might be the case. + 2020.findings-emnlp.86 + + + Scene Graph Modification Based on Natural Language Commands + XuanliHe + Quan HungTran + GholamrezaHaffari + WalterChang + ZheLin + TrungBui + FranckDernoncourt + NhanDam + 972–990 + Structured representations like graphs and parse trees play a crucial role in many Natural Language Processing systems. 
In recent years, the advancements in multi-turn user interfaces necessitate the need for controlling and updating these structured representations given new sources of information. Although there have been many efforts focusing on improving the performance of the parsers that map text to graphs or parse trees, very few have explored the problem of directly manipulating these representations. In this paper, we explore the novel problem of graph modification, where the systems need to learn how to update an existing scene graph given a new user’s command. Our novel models based on graph-based sparse transformer and cross attention information fusion outperform previous systems adapted from the machine translation and graph generation literature. We further contribute our large graph modification datasets to the research community to encourage future research for this new problem. + 2020.findings-emnlp.87 + + + <fixed-case>L</fixed-case>i<fixed-case>M</fixed-case>i<fixed-case>T</fixed-case>: The Literal Motion in Text Dataset + IreneManotas + Ngoc Phuoc AnVo + VadimSheinin + 991–1000 + Motion recognition is one of the basic cognitive capabilities of many life forms, yet identifying motion of physical entities in natural language has not been explored extensively and empirically. We present the Literal-Motion-in-Text (LiMiT) dataset, a large human-annotated collection of English text sentences describing physical occurrence of motion, with annotated physical entities in motion. We describe the annotation process for the dataset, analyze its scale and diversity, and report results of several baseline models. We also present future research directions and applications of the LiMiT dataset and share it publicly as a new resource for the research community. 
+ 2020.findings-emnlp.88 + + + Transition-based Parsing with Stack-Transformers + RamónFernandez Astudillo + MiguelBallesteros + TahiraNaseem + AustinBlodgett + RaduFlorian + 1001–1007 + Modeling the parser state is key to good performance in transition-based parsing. Recurrent Neural Networks considerably improved the performance of transition-based systems by modelling the global state, e.g. stack-LSTM parsers, or local state modeling of contextualized features, e.g. Bi-LSTM parsers. Given the success of Transformer architectures in recent parsing systems, this work explores modifications of the sequence-to-sequence Transformer architecture to model either global or local parser states in transition-based parsing. We show that modifications of the cross attention mechanism of the Transformer considerably strengthen performance both on dependency and Abstract Meaning Representation (AMR) parsing tasks, particularly for smaller models or limited training data. + 2020.findings-emnlp.89 + + + <fixed-case>G</fixed-case>-<fixed-case>DA</fixed-case>ug: Generative Data Augmentation for Commonsense Reasoning + YibenYang + ChaitanyaMalaviya + JaredFernandez + SwabhaSwayamdipta + RonanLe Bras + Ji-PingWang + ChandraBhagavatula + YejinChoi + DougDowney + 1008–1025 + Recent advances in commonsense reasoning depend on large-scale human-annotated training sets to achieve peak performance. However, manual curation of training sets is expensive and has been shown to introduce annotation artifacts that neural models can readily exploit and overfit to. We propose a novel generative data augmentation technique, G-DAUGˆC, that aims to achieve more accurate and robust learning in a low-resource setting. Our approach generates synthetic examples using pretrained language models and selects the most informative and diverse set of examples for data augmentation. 
On experiments with multiple commonsense reasoning benchmarks, G-DAUGˆC consistently outperforms existing data augmentation methods based on back-translation, establishing a new state-of-the-art on WinoGrande, CODAH, and CommonsenseQA, as well as enhances out-of-distribution generalization, proving to be robust against adversaries or perturbations. Our analysis demonstrates that G-DAUGˆC produces a diverse set of fluent training examples, and that its selection and training approaches are important for performance. + 2020.findings-emnlp.90 + 2020.findings-emnlp.90.OptionalSupplementaryMaterial.zip + + + <fixed-case>H</fixed-case>ybrid<fixed-case>QA</fixed-case>: A Dataset of Multi-Hop Question Answering over Tabular and Textual Data + WenhuChen + HanwenZha + ZhiyuChen + WenhanXiong + HongWang + William YangWang + 1026–1036 + Existing question answering datasets focus on dealing with homogeneous information, based either only on text or KB/Table information alone. However, as human knowledge is distributed over heterogeneous forms, using homogeneous information alone might lead to severe coverage problems. To fill in the gap, we present HybridQA, a new large-scale question-answering dataset that requires reasoning on heterogeneous information. Each question is aligned with a Wikipedia table and multiple free-form corpora linked with the entities in the table. The questions are designed to aggregate both tabular information and text information, i.e., lack of either form would render the question unanswerable. We test with three different models: 1) a table-only model. 2) text-only model. 3) a hybrid model that combines heterogeneous information to find the answer. The experimental results show that the EM scores obtained by two baselines are below 20%, while the hybrid model can achieve an EM over 40%. This gap suggests the necessity to aggregate heterogeneous information in HybridQA. However, the hybrid model’s score is still far behind human performance. 
Hence, HybridQA can serve as a challenging benchmark to study question answering with heterogeneous information. + 2020.findings-emnlp.91 + + + <fixed-case>P</fixed-case>ho<fixed-case>BERT</fixed-case>: Pre-trained language models for <fixed-case>V</fixed-case>ietnamese + Dat QuocNguyen + AnhTuan Nguyen + 1037–1042 + We present PhoBERT with two versions, PhoBERT-base and PhoBERT-large, the first public large-scale monolingual language models pre-trained for Vietnamese. Experimental results show that PhoBERT consistently outperforms the recent best pre-trained multilingual model XLM-R (Conneau et al., 2020) and improves the state-of-the-art in multiple Vietnamese-specific NLP tasks including Part-of-speech tagging, Dependency parsing, Named-entity recognition and Natural language inference. We release PhoBERT to facilitate future research and downstream applications for Vietnamese NLP. Our PhoBERT models are available at https://github.com/VinAIResearch/PhoBERT + 2020.findings-emnlp.92 + + + <fixed-case>EST</fixed-case>e<fixed-case>R</fixed-case>: Combining Word Co-occurrences and Word Associations for Unsupervised Emotion Detection + Sujatha DasGollapalli + PolinaRozenshtein + See-KiongNg + 1043–1056 + Accurate detection of emotions in user-generated text was shown to have several applications for e-commerce, public well-being, and disaster management. Currently, the state-of-the-art performance for emotion detection in text is obtained using complex, deep learning models trained on domain-specific, labeled data. In this paper, we propose ESTeR, an unsupervised model for identifying emotions using a novel similarity function based on random walks on graphs. Our model combines large-scale word co-occurrence information with word-associations from lexicons avoiding not only the dependence on labeled datasets, but also an explicit mapping of words to latent spaces used in emotion-enriched word embeddings. Our similarity function can also be computed efficiently. 
We study a range of datasets including recent tweets related to COVID-19 to illustrate the superior performance of our model and report insights on public emotions during the on-going pandemic. + 2020.findings-emnlp.93 + + + Make Templates Smarter: A Template Based <fixed-case>D</fixed-case>ata2<fixed-case>T</fixed-case>ext System Powered by Text Stitch Model + BingfengLuo + ZuoBai + KunfengLai + JianpingShen + 1057–1062 + Neural network (NN) based data2text models achieve state-of-the-art (SOTA) performance in most metrics, but they sometimes drop or modify the information in the input, and it is hard to control the generation contents. Moreover, it requires paired training data that are usually expensive to collect. Template-based methods have good fidelity and controllability but require heavy human involvement. We propose a novel template-based data2text system powered by a text stitch model. It ensures fidelity and controllability by using templates to produce the main contents. In addition, it reduces human involvement in template design by using a text stitch model to automatically stitch adjacent template units, which is a step that usually requires careful template design and limits template reusability. The text stitch model can be trained in self-supervised fashion, which only requires free texts. The experiments on a benchmark dataset show that our system outperforms SOTA NN-based systems in fidelity and surpasses template-based systems in diversity and human involvement. 
+ 2020.findings-emnlp.94 + 2020.findings-emnlp.94.OptionalSupplementaryMaterial.zip + + + <fixed-case>GCDST</fixed-case>: A Graph-based and Copy-augmented Multi-domain Dialogue State Tracking + PengWu + BoweiZou + RidongJiang + AiTiAw + 1063–1073 + As an essential component of task-oriented dialogue systems, Dialogue State Tracking (DST) takes charge of estimating user intentions and requests in dialogue contexts and extracting substantial goals (states) from user utterances to help the downstream modules to determine the next actions of dialogue systems. For practical usages, a major challenge to constructing a robust DST model is to process a conversation with multi-domain states. However, most existing approaches trained DST on a single domain independently, ignoring the information across domains. To tackle the multi-domain DST task, we first construct a dialogue state graph to transfer structured features among related domain-slot pairs across domains. Then, we encode the graph information of dialogue states by graph convolutional networks and utilize a hard copy mechanism to directly copy historical states from the previous conversation. Experimental results show that our model improves the performances of the multi-domain DST baseline (TRADE) with the absolute joint accuracy of 2.0% and 1.0% on the MultiWOZ 2.0 and 2.1 dialogue datasets, respectively. + 2020.findings-emnlp.95 + + + Incorporating Stylistic Lexical Preferences in Generative Language Models + HriturajSingh + GauravVerma + Balaji VasanSrinivasan + 1074–1079 + While recent advances in language modeling has resulted in powerful generation models, their generation style remains implicitly dependent on the training data and can not emulate a specific target style. 
Leveraging the generative capabilities of a transformer-based language models, we present an approach to induce certain target-author attributes by incorporating continuous multi-dimensional lexical preferences of an author into generative language models. We introduce rewarding strategies in a reinforcement learning framework that encourages the use of words across multiple categorical dimensions, to varying extents. Our experiments demonstrate that the proposed approach can generate text that distinctively aligns with a given target author’s lexical style. We conduct quantitative and qualitative comparisons with competitive and relevant baselines to illustrate the benefits of the proposed approach. + 2020.findings-emnlp.96 + + + Why do you think that? Exploring faithful sentence–level rationales without supervision + MaxGlockner + IvanHabernal + IrynaGurevych + 1080–1095 + Evaluating the trustworthiness of a model’s prediction is essential for differentiating between ‘right for the right reasons’ and ‘right for the wrong reasons’. Identifying textual spans that determine the target label, known as faithful rationales, usually relies on pipeline approaches or reinforcement learning. However, such methods either require supervision and thus costly annotation of the rationales or employ non-differentiable models. We propose a differentiable training–framework to create models which output faithful rationales on a sentence level, by solely applying supervision on the target task. To achieve this, our model solves the task based on each rationale individually and learns to assign high scores to those which solved the task best. Our evaluation on three different datasets shows competitive results compared to a standard BERT blackbox while exceeding a pipeline counterpart’s performance in two cases. 
We further exploit the transparent decision–making process of these models to prefer selecting the correct rationales by applying direct supervision, thereby boosting the performance on the rationale–level. + 2020.findings-emnlp.97 + + + Semi-Supervised Learning for Video Captioning + KeLin + ZhuoxinGan + LiweiWang + 1096–1106 + Deep neural networks have made great success on video captioning in supervised learning setting. However, annotating videos with descriptions is very expensive and time-consuming. If the video captioning algorithm can benefit from a large number of unlabeled videos, the cost of annotation can be reduced. In the proposed study, we make the first attempt to train the video captioning model on labeled data and unlabeled data jointly, in a semi-supervised learning manner. For labeled data, we train them with the traditional cross-entropy loss. For unlabeled data, we leverage a self-critical policy gradient method with the difference between the scores obtained by Monte-Carlo sampling and greedy decoding as the reward function, while the scores are the negative K-L divergence between output distributions of original video data and augmented video data. The final loss is the weighted sum of losses obtained by labeled data and unlabeled data. Experiments conducted on VATEX, MSR-VTT and MSVD dataset demonstrate that the introduction of unlabeled data can improve the performance of the video captioning model. The proposed semi-supervised learning algorithm also outperforms several state-of-the-art semi-supervised learning approaches. + 2020.findings-emnlp.98 + + + <fixed-case>M</fixed-case>ultiˆ2<fixed-case>OIE</fixed-case>: Multilingual Open Information Extraction based on Multi-Head Attention with <fixed-case>BERT</fixed-case> + YoungbinRo + YukyungLee + PilsungKang + 1107–1117 + In this paper, we propose Multi^2OIE, which performs open information extraction (open IE) by combining BERT with multi-head attention. 
Our model is a sequence-labeling system with an efficient and effective argument extraction method. We use a query, key, and value setting inspired by the Multimodal Transformer to replace the previously used bidirectional long short-term memory architecture with multi-head attention. Multi^2OIE outperforms existing sequence-labeling systems with high computational efficiency on two benchmark evaluation datasets, Re-OIE2016 and CaRB. Additionally, we apply the proposed method to multilingual open IE using multilingual BERT. Experimental results on new benchmark datasets introduced for two languages (Spanish and Portuguese) demonstrate that our model outperforms other multilingual systems without training data for the target languages. + 2020.findings-emnlp.99 + 2020.findings-emnlp.99.OptionalSupplementaryMaterial.zip + + + <fixed-case>LGPS</fixed-case>olver - Solving Logic Grid Puzzles Automatically + ElgunJabrayilzade + SelmaTekir + 1118–1123 + Logic grid puzzle (LGP) is a type of word problem where the task is to solve a problem in logic. Constraints for the problem are given in the form of textual clues. Once these clues are transformed into formal logic, a deductive reasoning process provides the solution. Solving logic grid puzzles in a fully automatic manner has been a challenge since a precise understanding of clues is necessary to develop the corresponding formal logic representation. To meet this challenge, we propose a solution that uses a DistilBERT-based classifier to classify a clue into one of the predefined predicate types for logic grid puzzles. Another novelty of the proposed solution is the recognition of comparison structures in clues. By collecting comparative adjectives from existing dictionaries and utilizing a semantic framework to catch comparative quantifiers, the semantics of clues concerning comparison structures are better understood, ensuring conversion to correct logic representation. 
Our approach solves logic grid puzzles in a fully automated manner with 100% accuracy on the given puzzle datasets and outperforms state-of-the-art solutions by a large margin. + 2020.findings-emnlp.100 + + + Using the Past Knowledge to Improve Sentiment Classification + QiQin + WenpengHu + BingLiu + 1124–1133 + This paper studies sentiment classification in the lifelong learning setting that incrementally learns a sequence of sentiment classification tasks. It proposes a new lifelong learning model (called L2PG) that can retain and selectively transfer the knowledge learned in the past to help learn the new task. A key innovation of this proposed model is a novel parameter-gate (p-gate) mechanism that regulates the flow or transfer of the previously learned knowledge to the new task. Specifically, it can selectively use the network parameters (which represent the retained knowledge gained from the previous tasks) to assist the learning of the new task t. Knowledge distillation is also employed in the process to preserve the past knowledge by approximating the network output at the state when task t-1 was learned. Experimental results show that L2PG outperforms strong baselines, including even multiple task learning. + 2020.findings-emnlp.101 + + + High-order Semantic Role Labeling + ZuchaoLi + HaiZhao + RuiWang + KevinParnow + 1134–1151 + Semantic role labeling is primarily used to identify predicates, arguments, and their semantic relationships. Due to the limitations of modeling methods and the conditions of pre-identified predicates, previous work has focused on the relationships between predicates and arguments and the correlations between arguments at most, while the correlations between predicates have been neglected for a long time. High-order features and structure learning were very common in modeling such correlations before the neural network era. 
In this paper, we introduce a high-order graph structure for the neural semantic role labeling model, which enables the model to explicitly consider not only the isolated predicate-argument pairs but also the interaction between the predicate-argument pairs. Experimental results on 7 languages of the CoNLL-2009 benchmark show that the high-order structural learning techniques are beneficial to the strong performing SRL models and further boost our baseline to achieve new state-of-the-art results. + 2020.findings-emnlp.102 + 2020.findings-emnlp.102.OptionalSupplementaryMaterial.zip + + + Undersensitivity in Neural Reading Comprehension + JohannesWelbl + PasqualeMinervini + MaxBartolo + PontusStenetorp + SebastianRiedel + 1152–1165 + Current reading comprehension methods generalise well to in-distribution test sets, yet perform poorly on adversarially selected data. Prior work on adversarial inputs typically studies model oversensitivity: semantically invariant text perturbations that cause a model’s prediction to change. Here we focus on the complementary problem: excessive prediction undersensitivity, where input text is meaningfully changed but the model’s prediction does not, even though it should. We formulate an adversarial attack which searches among semantic variations of the question for which a model erroneously predicts the same answer, and with even higher probability. We demonstrate that models trained on both SQuAD2.0 and NewsQA are vulnerable to this attack, and then investigate data augmentation and adversarial training as defences. Both substantially decrease adversarial vulnerability, which generalises to held-out data and held-out attack spaces. 
Addressing undersensitivity furthermore improves model robustness on the previously introduced ADDSENT and ADDONESENT datasets, and models generalise better when facing train / evaluation distribution mismatch: they are less prone to overly rely on shallow predictive cues present only in the training set, and outperform a conventional model by as much as 10.9% F1. + 2020.findings-emnlp.103 + + + <fixed-case>H</fixed-case>yper<fixed-case>T</fixed-case>ext: Endowing <fixed-case>F</fixed-case>ast<fixed-case>T</fixed-case>ext with Hyperbolic Geometry + YudongZhu + DiZhou + JinghuiXiao + XinJiang + XiaoChen + QunLiu + 1166–1171 + Natural language data exhibit tree-like hierarchical structures such as the hypernym-hyponym hierarchy in WordNet. FastText, as the state-of-the-art text classifier based on shallow neural network in Euclidean space, may not represent such hierarchies precisely with limited representation capacity. Considering that hyperbolic space is naturally suitable for modelling tree-like hierarchical data, we propose a new model named HyperText for efficient text classification by endowing FastText with hyperbolic geometry. Empirically, we show that HyperText outperforms FastText on a range of text classification tasks with much reduced parameters. + 2020.findings-emnlp.104 + + + <fixed-case>A</fixed-case>uto<fixed-case>ETER</fixed-case>: Automated Entity Type Representation with Relation-Aware Attention for Knowledge Graph Embedding + GuanglinNiu + BoLi + YongfeiZhang + ShiliangPu + JingyangLi + 1172–1181 + Recent advances in Knowledge Graph Embedding (KGE) allow for representing entities and relations in continuous vector spaces. Some traditional KGE models leveraging additional type information can improve the representation of entities which however totally rely on the explicit types or neglect the diverse type representations specific to various relations. 
Besides, none of the existing methods is capable of inferring all the relation patterns of symmetry, inversion and composition as well as the complex properties of 1-N, N-1 and N-N relations, simultaneously. To explore the type information for any KG, we develop a novel KGE framework with Automated Entity TypE Representation (AutoETER), which learns the latent type embedding of each entity by regarding each relation as a translation operation between the types of two entities with a relation-aware projection mechanism. Particularly, our designed automated type representation learning mechanism is a pluggable module which can be easily incorporated with any KGE model. Besides, our approach could model and infer all the relation patterns and complex relations. Experiments on four datasets demonstrate the superior performance of our model compared to state-of-the-art baselines on link prediction tasks, and the visualization of type clustering provides clearly the explanation of type embeddings and verifies the effectiveness of our model. + 2020.findings-emnlp.105 + + + Learning Robust and Multilingual Speech Representations + KazuyaKawakami + LuyuWang + ChrisDyer + PhilBlunsom + Aaronvan den Oord + 1182–1192 + Unsupervised speech representation learning has shown remarkable success at finding representations that correlate with phonetic structures and improve downstream speech recognition performance. However, most research has been focused on evaluating the representations in terms of their ability to improve the performance of speech recognition systems on read English (e.g. Wall Street Journal and LibriSpeech). This evaluation methodology overlooks two important desiderata that speech representations should have: robustness to domain shifts and transferability to other languages. 
In this paper we learn representations from up to 8000 hours of diverse and noisy speech data and evaluate the representations by looking at their robustness to domain shifts and their ability to improve recognition performance in many languages. We find that our representations confer significant robustness advantages to the resulting recognition systems: we see significant improvements in out-of-domain transfer relative to baseline feature sets and the features likewise provide improvements in 25 phonetically diverse languages. + 2020.findings-emnlp.106 + + + <fixed-case>FQ</fixed-case>u<fixed-case>AD</fixed-case>: <fixed-case>F</fixed-case>rench Question Answering Dataset + Martind’Hoffschmidt + WacimBelblidia + QuentinHeinrich + TomBrendlé + MaximeVidal + 1193–1208 + Recent advances in the field of language modeling have improved state-of-the-art results on many Natural Language Processing tasks. Among them, Reading Comprehension has made significant progress over the past few years. However, most results are reported in English since labeled resources available in other languages, such as French, remain scarce. In the present work, we introduce the French Question Answering Dataset (FQuAD). FQuAD is a French Native Reading Comprehension dataset of questions and answers on a set of Wikipedia articles that consists of 25,000+ samples for the 1.0 version and 60,000+ samples for the 1.1 version. We train a baseline model which achieves an F1 score of 92.2 and an exact match ratio of 82.1 on the test set. In an effort to track the progress of French Question Answering models we propose a leaderboard and we have made the 1.0 version of our dataset freely available at https://illuin-tech.github.io/FQuAD-explorer/. 
+ 2020.findings-emnlp.107 + + + Semantic Matching and Aggregation Network for Few-shot Intent Detection + HoangNguyen + ChenweiZhang + CongyingXia + PhilipYu + 1209–1218 + Few-shot Intent Detection is challenging due to the scarcity of available annotated utterances. Although recent works demonstrate that multi-level matching plays an important role in transferring learned knowledge from seen training classes to novel testing classes, they rely on a static similarity measure and overly fine-grained matching components. These limitations inhibit generalizing capability towards Generalized Few-shot Learning settings where both seen and novel classes are co-existent. In this paper, we propose a novel Semantic Matching and Aggregation Network where semantic components are distilled from utterances via multi-head self-attention with additional dynamic regularization constraints. These semantic components capture high-level information, resulting in more effective matching between instances. Our multi-perspective matching method provides a comprehensive matching measure to enhance representations of both labeled and unlabeled instances. We also propose a more challenging evaluation setting that considers classification on the joint all-class label space. Extensive experimental results demonstrate the effectiveness of our method. Our code and data are publicly available. + 2020.findings-emnlp.108 + 2020.findings-emnlp.108.OptionalSupplementaryMaterial.zip + + + Quantifying the Contextualization of Word Representations with Semantic Class Probing + MengjieZhao + PhilippDufter + YadollahYaghoobzadeh + HinrichSchütze + 1219–1234 + Pretrained language models achieve state-of-the-art results on many NLP tasks, but there are still many open questions about how and why they work so well. We investigate the contextualization of words in BERT. 
We quantify the amount of contextualization, i.e., how well words are interpreted in context, by studying the extent to which semantic classes of a word can be inferred from its contextualized embedding. Quantifying contextualization helps in understanding and utilizing pretrained language models. We show that the top layer representations support highly accurate inference of semantic classes; that the strongest contextualization effects occur in the lower layers; that local context is mostly sufficient for contextualizing words; and that top layer representations are more task-specific after finetuning while lower layer representations are more transferable. Finetuning uncovers task-related features, but pretrained knowledge about contextualization is still well preserved. + 2020.findings-emnlp.109 + + + Learning to Generate Clinically Coherent Chest <fixed-case>X</fixed-case>-Ray Reports + JustinLovelace + BobakMortazavi + 1235–1243 + Automated radiology report generation has the potential to reduce the time clinicians spend manually reviewing radiographs and streamline clinical care. However, past work has shown that typical abstractive methods tend to produce fluent, but clinically incorrect radiology reports. In this work, we develop a radiology report generation model utilizing the transformer architecture that produces superior reports as measured by both standard language generation and clinical coherence metrics compared to competitive baselines. We then develop a method to differentiably extract clinical information from generated reports and utilize this differentiability to fine-tune our model to produce more clinically coherent reports. 
+ 2020.findings-emnlp.110 + + + <fixed-case>FELIX</fixed-case>: Flexible Text Editing Through Tagging and Insertion + JonathanMallinson + AliakseiSeveryn + EricMalmi + GuillermoGarrido + 1244–1255 + We present FELIX – a flexible text-editing approach for generation, designed to derive maximum benefit from the ideas of decoding with bi-directional contexts and self-supervised pretraining. In contrast to conventional sequence-to-sequence (seq2seq) models, FELIX is efficient in low-resource settings and fast at inference time, while being capable of modeling flexible input-output transformations. We achieve this by decomposing the text-editing task into two sub-tasks: tagging to decide on the subset of input tokens and their order in the output text and insertion to in-fill the missing tokens in the output not present in the input. The tagging model employs a novel Pointer mechanism, while the insertion model is based on a Masked Language Model (MLM). Both of these models are chosen to be non-autoregressive to guarantee faster inference. FELIX performs favourably when compared to recent text-editing methods and strong seq2seq baselines when evaluated on four NLG tasks: Sentence Fusion, Machine Translation Automatic Post-Editing, Summarization, and Text Simplification. + 2020.findings-emnlp.111 + + + What Can We Do to Improve Peer Review in <fixed-case>NLP</fixed-case>? + AnnaRogers + IsabelleAugenstein + 1256–1262 + Peer review is our best tool for judging the quality of conference submissions, but it is becoming increasingly spurious. We argue that a part of the problem is that the reviewers and area chairs face a poorly defined task forcing apples-to-oranges comparisons. There are several potential ways forward, but the key difficulty is creating the incentives and mechanisms for their consistent implementation in the NLP community. 
+ 2020.findings-emnlp.112 + + + Unsupervised Relation Extraction from Language Models using Constrained Cloze Completion + AnkurGoswami + AkshataBhat + HadarOhana + TheodorosRekatsinas + 1263–1276 + We show that state-of-the-art self-supervised language models can be readily used to extract relations from a corpus without the need to train a fine-tuned extractive head. We introduce RE-Flex, a simple framework that performs constrained cloze completion over pretrained language models to perform unsupervised relation extraction. RE-Flex uses contextual matching to ensure that language model predictions matches supporting evidence from the input corpus that is relevant to a target relation. We perform an extensive experimental study over multiple relation extraction benchmarks and demonstrate that RE-Flex outperforms competing unsupervised relation extraction methods based on pretrained language models by up to 27.8 F1 points compared to the next-best method. Our results show that constrained inference queries against a language model can enable accurate unsupervised relation extraction. + 2020.findings-emnlp.113 + 2020.findings-emnlp.113.OptionalSupplementaryMaterial.zip + + + Biomedical Event Extraction on Graph Edge-conditioned Attention Networks with Hierarchical Knowledge Graphs + Kung-HsiangHuang + MuYang + NanyunPeng + 1277–1285 + Biomedical event extraction is critical in understanding biomolecular interactions described in scientific corpus. One of the main challenges is to identify nested structured events that are associated with non-indicative trigger words. We propose to incorporate domain knowledge from Unified Medical Language System (UMLS) to a pre-trained language model via Graph Edge-conditioned Attention Networks (GEANet) and hierarchical graph representation. To better recognize the trigger words, each sentence is first grounded to a sentence graph based on a jointly modeled hierarchical knowledge graph from UMLS. 
The grounded graphs are then propagated by GEANet, a novel graph neural network for enhanced capabilities in inferring complex events. On BioNLP 2011 GENIA Event Extraction task, our approach achieved 1.41% F1 and 3.19% F1 improvements on all events and complex events, respectively. Ablation studies confirm the importance of GEANet and hierarchical KG. + 2020.findings-emnlp.114 + 2020.findings-emnlp.114.OptionalSupplementaryMaterial.zip + + + Constraint Satisfaction Driven Natural Language Generation: A Tree Search Embedded <fixed-case>MCMC</fixed-case> Approach + MaosenZhang + NanJiang + LeiLi + YexiangXue + 1286–1298 + Generating natural language under complex constraints is a principled formulation towards controllable text generation. We present a framework to allow specification of combinatorial constraints for sentence generation. We propose TSMC, an efficient method to generate high likelihood sentences with respect to a pre-trained language model while satisfying the constraints. Our approach is highly flexible, requires no task-specific training, and leverages efficient constraint satisfaction solving techniques. To better handle the combinatorial constraints, a tree search algorithm is embedded into the proposal process of the Markov Chain Monte Carlo (MCMC) to explore candidates that satisfy more constraints. Compared to existing MCMC approaches, our sampling approach has a better mixing performance. Experiments show that TSMC achieves consistent and significant improvement on multiple language generation tasks. + 2020.findings-emnlp.115 + 2020.findings-emnlp.115.OptionalSupplementaryMaterial.pdf + + + Examining the Ordering of Rhetorical Strategies in Persuasive Requests + OmarShaikh + JiaaoChen + JonSaad-Falcon + PoloChau + DiyiYang + 1299–1306 + Interpreting how persuasive language influences audiences has implications across many domains like advertising, argumentation, and propaganda. Persuasion relies on more than a message’s content. 
Arranging the order of the message itself (i.e., ordering specific rhetorical strategies) also plays an important role. To examine how strategy orderings contribute to persuasiveness, we first utilize a Variational Autoencoder model to disentangle content and rhetorical strategies in textual requests from a large-scale loan request corpus. We then visualize interplay between content and strategy through an attentional LSTM that predicts the success of textual requests. We find that specific (orderings of) strategies interact uniquely with a request’s content to impact success rate, and thus the persuasiveness of a request. + 2020.findings-emnlp.116 + + + Evaluating Models’ Local Decision Boundaries via Contrast Sets + MattGardner + YoavArtzi + VictoriaBasmov + JonathanBerant + BenBogin + SihaoChen + PradeepDasigi + DheeruDua + YanaiElazar + AnanthGottumukkala + NitishGupta + HannanehHajishirzi + GabrielIlharco + DanielKhashabi + KevinLin + JiangmingLiu + Nelson F.Liu + PhoebeMulcaire + QiangNing + SameerSingh + Noah A.Smith + SanjaySubramanian + ReutTsarfaty + EricWallace + AllyZhang + BenZhou + 1307–1323 + Standard test sets for supervised learning evaluate in-distribution generalization. Unfortunately, when a dataset has systematic gaps (e.g., annotation artifacts), these evaluations are misleading: a model can learn simple decision rules that perform well on the test set but do not capture the abilities a dataset is intended to test. We propose a more rigorous annotation paradigm for NLP that helps to close systematic gaps in the test data. In particular, after a dataset is constructed, we recommend that the dataset authors manually perturb the test instances in small but meaningful ways that (typically) change the gold label, creating contrast sets. Contrast sets provide a local view of a model’s decision boundary, which can be used to more accurately evaluate a model’s true linguistic capabilities. 
We demonstrate the efficacy of contrast sets by creating them for 10 diverse NLP datasets (e.g., DROP reading comprehension, UD parsing, and IMDb sentiment analysis). Although our contrast sets are not explicitly adversarial, model performance is significantly lower on them than on the original test sets—up to 25% in some cases. We release our contrast sets as new evaluation benchmarks and encourage future dataset construction efforts to follow similar annotation processes. + 2020.findings-emnlp.117 + + + Parsing with Multilingual <fixed-case>BERT</fixed-case>, a Small Treebank, and a Small Corpus + Ethan C.Chau + Lucy H.Lin + Noah A.Smith + 1324–1334 + Pretrained multilingual contextual representations have shown great success, but due to the limits of their pretraining data, their benefits do not apply equally to all language varieties. This presents a challenge for language varieties unfamiliar to these models, whose labeled and unlabeled data is too limited to train a monolingual model effectively. We propose the use of additional language-specific pretraining and vocabulary augmentation to adapt multilingual models to low-resource settings. Using dependency parsing of four diverse low-resource language varieties as a case study, we show that these methods significantly improve performance over baselines, especially in the lowest-resource cases, and demonstrate the importance of the relationship between such models’ pretraining data and target language varieties. + 2020.findings-emnlp.118 + + + <fixed-case>O</fixed-case>pt<fixed-case>SLA</fixed-case>: an Optimization-Based Approach for Sequential Label Aggregation + NasimSabetpour + AdithyaKulkarni + QiLi + 1335–1340 + The need for the annotated training dataset on which data-hungry machine learning algorithms feed has increased dramatically with advanced acclaim of machine learning applications. To annotate the data, people with domain expertise are needed, but they are seldom available and expensive to hire. 
This has led to the thriving of crowdsourcing platforms such as Amazon Mechanical Turk (AMT). However, the annotations provided by one worker cannot be used directly to train the model due to the lack of expertise. Existing literature in annotation aggregation focuses on binary and multi-choice problems. In contrast, little work has been done on complex tasks such as sequence labeling with imbalanced classes, a ubiquitous task in Natural Language Processing (NLP), and Bio-Informatics. We propose OptSLA, an Optimization-based Sequential Label Aggregation method, that jointly considers the characteristics of sequential labeling tasks, workers reliabilities, and advanced deep learning techniques to conquer the challenge. We evaluate our model on crowdsourced data for named entity recognition task. Our results show that the proposed OptSLA outperforms the state-of-the-art aggregation methods, and the results are easier to interpret. + 2020.findings-emnlp.119 + + + Optimizing Word Segmentation for Downstream Task + TatsuyaHiraoka + ShoTakase + KeiUchiumi + AtsushiKeyaki + NaoakiOkazaki + 1341–1351 + In traditional NLP, we tokenize a given sentence as a preprocessing, and thus the tokenization is unrelated to a target downstream task. To address this issue, we propose a novel method to explore a tokenization which is appropriate for the downstream task. Our proposed method, optimizing tokenization (OpTok), is trained to assign a high probability to such appropriate tokenization based on the downstream task loss. OpTok can be used for any downstream task which uses a vector representation of a sentence such as text classification. Experimental results demonstrate that OpTok improves the performance of sentiment analysis and textual entailment. In addition, we introduce OpTok into BERT, the state-of-the-art contextualized embeddings and report a positive effect. 
+ 2020.findings-emnlp.120 + + + Dynamically Updating Event Representations for Temporal Relation Classification with Multi-category Learning + FeiCheng + MasayukiAsahara + IchiroKobayashi + SadaoKurohashi + 1352–1357 + Temporal relation classification is the pair-wise task for identifying the relation of a temporal link (TLINKs) between two mentions, i.e. event, time and document creation time (DCT). It leads to two crucial limits: 1) Two TLINKs involving a common mention do not share information. 2) Existing models with independent classifiers for each TLINK category (E2E, E2T and E2D) hinder from using the whole data. This paper presents an event centric model that allows to manage dynamic event representations across multiple TLINKs. Our model deals with three TLINK categories with multi-task learning to leverage the full size of data. The experimental results show that our proposal outperforms state-of-the-art models and two strong transfer learning baselines on both the English and Japanese data. + 2020.findings-emnlp.121 + + + A Compare Aggregate Transformer for Understanding Document-grounded Dialogue + LongxuanMa + Wei-NanZhang + RunxinSun + TingLiu + 1358–1367 + Unstructured documents serving as external knowledge of the dialogues help to generate more informative responses. Previous research focused on knowledge selection (KS) in the document with dialogue. However, dialogue history that is not related to the current dialogue may introduce noise in the KS processing. In this paper, we propose a Compare Aggregate Transformer (CAT) to jointly denoise the dialogue context and aggregate the document information for response generation. We designed two different comparison mechanisms to reduce noise (before and during decoding). In addition, we propose two metrics for evaluating document utilization efficiency based on word overlap. 
Experimental results on the CMU_DoG dataset show that the proposed CAT model outperforms the state-of-the-art approach and strong baselines. + 2020.findings-emnlp.122 + + + <fixed-case>T</fixed-case>ext<fixed-case>H</fixed-case>ide: Tackling Data Privacy for Language Understanding Tasks + YangsiboHuang + ZhaoSong + DanqiChen + KaiLi + SanjeevArora + 1368–1382 + An unsolved challenge in distributed or federated learning is to effectively mitigate privacy risks without slowing down training or reducing accuracy. In this paper, we propose TextHide aiming at addressing this challenge for natural language understanding tasks. It requires all participants to add a simple encryption step to prevent an eavesdropping attacker from recovering private text data. Such an encryption step is efficient and only affects the task performance slightly. In addition, TextHide fits well with the popular framework of fine-tuning pre-trained language models (e.g., BERT) for any sentence or sentence-pair task. We evaluate TextHide on the GLUE benchmark, and our experiments show that TextHide can effectively defend attacks on shared gradients or representations and the averaged accuracy reduction is only 1.9%. We also present an analysis of the security of TextHide using a conjecture about the computational intractability of a mathematical problem. + 2020.findings-emnlp.123 + + + Modeling Intra and Inter-modality Incongruity for Multi-Modal Sarcasm Detection + HongliangPan + ZhengLin + PengFu + YataoQi + WeipingWang + 1383–1392 + Sarcasm is a pervasive phenomenon in today’s social media platforms such as Twitter and Reddit. These platforms allow users to create multi-modal messages, including texts, images, and videos. Existing multi-modal sarcasm detection methods either simply concatenate the features from multi modalities or fuse the multi modalities information in a designed manner. 
However, they ignore the incongruity character in sarcastic utterance, which is often manifested between modalities or within modalities. Inspired by this, we propose a BERT architecture-based model, which concentrates on both intra and inter-modality incongruity for multi-modal sarcasm detection. To be specific, we are inspired by the idea of self-attention mechanism and design inter-modality attention to capturing inter-modality incongruity. In addition, the co-attention mechanism is applied to model the contradiction within the text. The incongruity information is then used for prediction. The experimental results demonstrate that our model achieves state-of-the-art performance on a public multi-modal sarcasm detection dataset. + 2020.findings-emnlp.124 + + + Investigating Transferability in Pretrained Language Models + AlexTamkin + TrishaSingh + DavideGiovanardi + NoahGoodman + 1393–1401 + How does language model pretraining help transfer learning? We consider a simple ablation technique for determining the impact of each pretrained layer on transfer task performance. This method, partial reinitialization, involves replacing different layers of a pretrained model with random weights, then finetuning the entire model on the transfer task and observing the change in performance. This technique reveals that in BERT, layers with high probing performance on downstream GLUE tasks are neither necessary nor sufficient for high accuracy on those tasks. Furthermore, the benefit of using pretrained parameters for a layer varies dramatically with finetuning dataset size: parameters that provide tremendous performance improvement when data is plentiful may provide negligible benefits in data-scarce settings. These results reveal the complexity of the transfer learning process, highlighting the limitations of methods that operate on frozen models or single data samples. 
+ 2020.findings-emnlp.125 + + + Improving Knowledge-Aware Dialogue Response Generation by Using Human-Written Prototype Dialogues + SixingWu + YingLi + DaweiZhang + ZhonghaiWu + 1402–1411 + Incorporating commonsense knowledge can alleviate the issue of generating generic responses in open-domain generative dialogue systems. However, selecting knowledge facts for the dialogue context is still a challenge. The widely used approach Entity Name Matching always retrieves irrelevant facts from the view of local entity words. This paper proposes a novel knowledge selection approach, Prototype-KR, and a knowledge-aware generative model, Prototype-KRG. Given a query, our approach first retrieves a set of prototype dialogues that are relevant to the query. We find knowledge facts used in prototype dialogues usually are highly relevant to the current query; thus, Prototype-KR ranks such knowledge facts based on the semantic similarity and then selects the most appropriate facts. Subsequently, Prototype-KRG can generate an informative response using the selected knowledge facts. Experiments demonstrate that our approach has achieved notable improvements on the most metrics, compared to generative baselines. Meanwhile, compared to IR(Retrieval)-based baselines, responses generated by our approach are more relevant to the context and have comparable informativeness. + 2020.findings-emnlp.126 + + + Filtering before Iteratively Referring for Knowledge-Grounded Response Selection in Retrieval-Based Chatbots + Jia-ChenGu + ZhenhuaLing + QuanLiu + ZhigangChen + XiaodanZhu + 1412–1422 + The challenges of building knowledge-grounded retrieval-based chatbots lie in how to ground a conversation on its background knowledge and how to match response candidates with both context and knowledge simultaneously. This paper proposes a method named Filtering before Iteratively REferring (FIRE) for this task. 
In this method, a context filter and a knowledge filter are first built, which derive knowledge-aware context representations and context-aware knowledge representations respectively by global and bidirectional attention. Besides, the entries irrelevant to the conversation are discarded by the knowledge filter. After that, iteratively referring is performed between context and response representations as well as between knowledge and response representations, in order to collect deep matching features for scoring response candidates. Experimental results show that FIRE outperforms previous methods by margins larger than 2.8% and 4.1% on the PERSONA-CHAT dataset with original and revised personas respectively, and margins larger than 3.1% on the CMU_DoG dataset in terms of top-1 accuracy. We also show that FIRE is more interpretable by visualizing the knowledge grounding process. + 2020.findings-emnlp.127 + + + Privacy-Preserving News Recommendation Model Learning + TaoQi + FangzhaoWu + ChuhanWu + YongfengHuang + XingXie + 1423–1432 + News recommendation aims to display news articles to users based on their personal interest. Existing news recommendation methods rely on centralized storage of user behavior data for model training, which may lead to privacy concerns and risks due to the privacy-sensitive nature of user behaviors. In this paper, we propose a privacy-preserving method for news recommendation model training based on federated learning, where the user behavior data is locally stored on user devices. Our method can leverage the useful information in the behaviors of massive number users to train accurate news recommendation models and meanwhile remove the need of centralized storage of them. More specifically, on each user device we keep a local copy of the news recommendation model, and compute gradients of the local model based on the user behaviors in this device. 
The local gradients from a group of randomly selected users are uploaded to server, which are further aggregated to update the global model in the server. Since the model gradients may contain some implicit private information, we apply local differential privacy (LDP) to them before uploading for better privacy protection. The updated global model is then distributed to each user device for local model update. We repeat this process for multiple rounds. Extensive experiments on a real-world dataset show the effectiveness of our method in news recommendation model training with privacy protection. + 2020.findings-emnlp.128 + + + ex<fixed-case>BERT</fixed-case>: Extending Pre-trained Models with Domain-specific Vocabulary Under Constrained Training Resources + WenTai + H. T.Kung + XinDong + MarcusComiter + Chang-FuKuo + 1433–1439 + We introduce exBERT, a training method to extend BERT pre-trained models from a general domain to a new pre-trained model for a specific domain with a new additive vocabulary under constrained training resources (i.e., constrained computation and data). exBERT uses a small extension module to learn to adapt an augmenting embedding for the new domain in the context of the original BERT’s embedding of a general vocabulary. The exBERT training method is novel in learning the new vocabulary and the extension module while keeping the weights of the original BERT model fixed, resulting in a substantial reduction in required training resources. We pre-train exBERT with biomedical articles from ClinicalKey and PubMed Central, and study its performance on biomedical downstream benchmark tasks using the MTL-Bioinformatics-2016 datasets. We demonstrate that exBERT consistently outperforms prior approaches when using limited corpus and pre-training computation resources. 
+ 2020.findings-emnlp.129 + + + Balancing via Generation for Multi-Class Text Classification Improvement + NaamaTepper + EstherGoldbraich + NaamaZwerdling + GeorgeKour + AteretAnaby Tavor + BoazCarmeli + 1440–1452 + Data balancing is a known technique for improving the performance of classification tasks. In this work we define a novel balancing-via-generation framework termed BalaGen. BalaGen consists of a flexible balancing policy coupled with a text generation mechanism. Combined, these two techniques can be used to augment a dataset for more balanced distribution. We evaluate BalaGen on three publicly available semantic utterance classification (SUC) datasets. One of these is a new COVID-19 Q&A dataset published here for the first time. Our work demonstrates that optimal balancing policies can significantly improve classifier performance, while augmenting just part of the classes and under-sampling others. Furthermore, capitalizing on the advantages of balancing, we show its usefulness in all relevant BalaGen framework components. We validate the superiority of BalaGen on ten semantic utterance datasets taken from real-life goal-oriented dialogue systems. Based on our results we encourage using data balancing prior to training for text classification tasks. + 2020.findings-emnlp.130 + + + Conditional Neural Generation using Sub-Aspect Functions for Extractive News Summarization + ZhengyuanLiu + KeShi + NancyChen + 1453–1463 + Much progress has been made in text summarization, fueled by neural architectures using large-scale training corpora. However, in the news domain, neural models easily overfit by leveraging position-related features due to the prevalence of the inverted pyramid writing style. In addition, there is an unmet need to generate a variety of summaries for different users. In this paper, we propose a neural framework that can flexibly control summary generation by introducing a set of sub-aspect functions (i.e. importance, diversity, position).
These sub-aspect functions are regulated by a set of control codes to decide which sub-aspect to focus on during summary generation. We demonstrate that extracted summaries with minimal position bias is comparable with those generated by standard models that take advantage of position preference. We also show that news summaries generated with a focus on diversity can be more preferred by human raters. These results suggest that a more flexible neural summarization framework providing more control options could be desirable in tailoring to different user preferences, which is useful since it is often impractical to articulate such preferences for different applications a priori. + 2020.findings-emnlp.131 + 2020.findings-emnlp.131.OptionalSupplementaryMaterial.zip + + + Research Replication Prediction Using Weakly Supervised Learning + TianyiLuo + XingyuLi + HainanWang + YangLiu + 1464–1474 + Knowing whether a published research result can be replicated is important. Carrying out direct replication of published research incurs a high cost. There are efforts tried to use machine learning aided methods to predict scientific claims’ replicability. However, existing machine learning aided approaches use only hand-extracted statistics features such as p-value, sample size, etc. without utilizing research papers’ text information and train only on a very small size of annotated data without making the most use of a large number of unlabeled articles. Therefore, it is desirable to develop effective machine learning aided automatic methods which can automatically extract text information as features so that we can benefit from Natural Language Processing techniques. Besides, we aim for an approach that benefits from both labeled and the large number of unlabeled data. 
In this paper, we propose two weakly supervised learning approaches that use automatically extracted text information of research papers to improve the prediction accuracy of research replication using both labeled and unlabeled datasets. Our experiments over real-world datasets show that our approaches obtain much better prediction performance compared to the supervised models utilizing only statistic features and a small size of labeled dataset. Further, we are able to achieve an accuracy of 75.76% for predicting the replicability of research. + 2020.findings-emnlp.132 + + + Open Domain Question Answering based on Text Enhanced Knowledge Graph with Hyperedge Infusion + JialeHan + BoCheng + XuWang + 1475–1481 + The incompleteness of knowledge base (KB) is a vital factor limiting the performance of question answering (QA). This paper proposes a novel QA method by leveraging text information to enhance the incomplete KB. The model enriches the entity representation through semantic information contained in the text, and employs graph convolutional networks to update the entity status. Furthermore, to exploit the latent structural information of text, we treat the text as hyperedges connecting entities among it to complement the deficient relations in KB, and hypergraph convolutional networks are further applied to reason on the hypergraph-formed text. Extensive experiments on the WebQuestionsSP benchmark with different KB settings prove the effectiveness of our model. + 2020.findings-emnlp.133 + + + Inexpensive Domain Adaptation of Pretrained Language Models: Case Studies on Biomedical <fixed-case>NER</fixed-case> and Covid-19 <fixed-case>QA</fixed-case> + NinaPoerner + UlliWaltinger + HinrichSchütze + 1482–1490 + Domain adaptation of Pretrained Language Models (PTLMs) is typically achieved by unsupervised pretraining on target-domain text. While successful, this approach is expensive in terms of hardware, runtime and CO 2 emissions. 
Here, we propose a cheaper alternative: We train Word2Vec on target-domain text and align the resulting word vectors with the wordpiece vectors of a general-domain PTLM. We evaluate on eight English biomedical Named Entity Recognition (NER) tasks and compare against the recently proposed BioBERT model. We cover over 60% of the BioBERT - BERT F1 delta, at 5% of BioBERT’s CO 2 footprint and 2% of its cloud compute cost. We also show how to quickly adapt an existing general-domain Question Answering (QA) model to an emerging domain: the Covid-19 pandemic. + 2020.findings-emnlp.134 + 2020.findings-emnlp.134.OptionalSupplementaryMaterial.pdf + + + Semantically Driven Sentence Fusion: Modeling and Evaluation + EyalBen-David + OrgadKeller + EricMalmi + IdanSzpektor + RoiReichart + 1491–1505 + Sentence fusion is the task of joining related sentences into coherent text. Current training and evaluation schemes for this task are based on single reference ground-truths and do not account for valid fusion variants. We show that this hinders models from robustly capturing the semantic relationship between input sentences. To alleviate this, we present an approach in which ground-truth solutions are automatically expanded into multiple references via curated equivalence classes of connective phrases. We apply this method to a large-scale dataset and use the augmented dataset for both model training and evaluation. To improve the learning of semantic representation using multiple references, we enrich the model with auxiliary discourse classification tasks under a multi-tasking framework. Our experiments highlight the improvements of our approach over state-of-the-art models. 
+ 2020.findings-emnlp.135 + + + Pseudo-Bidirectional Decoding for Local Sequence Transduction + WangchunshuZhou + TaoGe + KeXu + 1506–1511 + Local sequence transduction (LST) tasks are sequence transduction tasks where there exists massive overlapping between the source and target sequences, such as grammatical error correction and spell or OCR correction. Motivated by this characteristic of LST tasks, we propose Pseudo-Bidirectional Decoding (PBD), a simple but versatile approach for LST tasks. PBD copies the representation of source tokens to the decoder as pseudo future context that enables the decoder self-attention to attends to its bi-directional context. In addition, the bidirectional decoding scheme and the characteristic of LST tasks motivate us to share the encoder and the decoder of LST models. Our approach provides right-side context information for the decoder, reduces the number of parameters by half, and provides good regularization effects. Experimental results on several benchmark datasets show that our approach consistently improves the performance of standard seq2seq models on LST tasks. + 2020.findings-emnlp.136 + + + Predicting Responses to Psychological Questionnaires from Participants’ Social Media Posts and Question Text Embeddings + HuyVu + SuhaibAbdurahman + SudeepBhatia + LyleUngar + 1512–1524 + Psychologists routinely assess people’s emotions and traits, such as their personality, by collecting their responses to survey questionnaires. Such assessments can be costly in terms of both time and money, and often lack generalizability, as existing data cannot be used to predict responses for new survey questions or participants. In this study, we propose a method for predicting a participant’s questionnaire response using their social media texts and the text of the survey question they are asked. 
Specifically, we use Natural Language Processing (NLP) tools such as BERT embeddings to represent both participants (via the text they write) and survey questions as embeddings vectors, allowing us to predict responses for out-of-sample participants and questions. Our novel approach can be used by researchers to integrate new participants or new questions into psychological studies without the constraint of costly data collection, facilitating novel practical applications and furthering the development of psychological theory. Finally, as a side contribution, the success of our model also suggests a new approach to study survey questions using NLP tools such as text embeddings rather than response data used in traditional methods. + 2020.findings-emnlp.137 + 2020.findings-emnlp.137.OptionalSupplementaryMaterial.zip + + + Will it Unblend? + YuvalPinter + Cassandra L.Jacobs + JacobEisenstein + 1525–1535 + Natural language processing systems often struggle with out-of-vocabulary (OOV) terms, which do not appear in training data. Blends, such as “innoventor”, are one particularly challenging class of OOV, as they are formed by fusing together two or more bases that relate to the intended meaning in unpredictable manners and degrees. In this work, we run experiments on a novel dataset of English OOV blends to quantify the difficulty of interpreting the meanings of blends by large-scale contextual language models such as BERT. We first show that BERT’s processing of these blends does not fully access the component meanings, leaving their contextual representations semantically impoverished. We find this is mostly due to the loss of characters resulting from blend formation. Then, we assess how easily different models can recognize the structure and recover the origin of blends, and find that context-aware embedding systems outperform character-level and context-free embeddings, although their results are still far from satisfactory. 
+ 2020.findings-emnlp.138 + 2020.findings-emnlp.138.OptionalSupplementaryMaterial.zip + + + <fixed-case>C</fixed-case>ode<fixed-case>BERT</fixed-case>: A Pre-Trained Model for Programming and Natural Languages + ZhangyinFeng + DayaGuo + DuyuTang + NanDuan + XiaochengFeng + MingGong + LinjunShou + BingQin + TingLiu + DaxinJiang + MingZhou + 1536–1547 + We present CodeBERT, a bimodal pre-trained model for programming language (PL) and natural language (NL). CodeBERT learns general-purpose representations that support downstream NL-PL applications such as natural language code search, code documentation generation, etc. We develop CodeBERT with Transformer-based neural architecture, and train it with a hybrid objective function that incorporates the pre-training task of replaced token detection, which is to detect plausible alternatives sampled from generators. This enables us to utilize both “bimodal” data of NL-PL pairs and “unimodal” data, where the former provides input tokens for model training while the latter helps to learn better generators. We evaluate CodeBERT on two NL-PL applications by fine-tuning model parameters. Results show that CodeBERT achieves state-of-the-art performance on both natural language code search and code documentation generation. Furthermore, to investigate what type of knowledge is learned in CodeBERT, we construct a dataset for NL-PL probing, and evaluate in a zero-shot setting where parameters of pre-trained models are fixed. Results show that CodeBERT performs better than previous pre-trained models on NL-PL probing.
+ 2020.findings-emnlp.139 + + + <fixed-case>S</fixed-case>tyle<fixed-case>DGPT</fixed-case>: Stylized Response Generation with Pre-trained Language Models + ZeYang + WeiWu + CanXu + XinnianLiang + JiaqiBai + LiranWang + WeiWang + ZhoujunLi + 1548–1559 + Generating responses following a desired style has great potentials to extend applications of open-domain dialogue systems, yet is refrained by lacking of parallel data for training. In this work, we explore the challenging task with pre-trained language models that have brought breakthrough to various natural language tasks. To this end, we introduce a KL loss and a style classifier to the fine-tuning step in order to steer response generation towards the target style in both a word-level and a sentence-level. Comprehensive empirical studies with two public datasets indicate that our model can significantly outperform state-of-the-art methods in terms of both style consistency and contextual coherence. + 2020.findings-emnlp.140 + + + Enhancing Automated Essay Scoring Performance via Cohesion Measurement and Combination of Regression and Ranking + RuosongYang + JiannongCao + ZhiyuanWen + YouzhengWu + XiaodongHe + 1560–1569 + Automated Essay Scoring (AES) is a critical text regression task that automatically assigns scores to essays based on their writing quality. Recently, the performance of sentence prediction tasks has been largely improved by using Pre-trained Language Models via fusing representations from different layers, constructing an auxiliary sentence, using multi-task learning, etc. However, to solve the AES task, previous works utilize shallow neural networks to learn essay representations and constrain calculated scores with regression loss or ranking loss, respectively. Since shallow neural networks trained on limited samples show poor performance to capture deep semantic of texts. 
And without an accurate scoring function, ranking loss and regression loss measures two different aspects of the calculated scores. To improve AES’s performance, we find a new way to fine-tune pre-trained language models with multiple losses of the same task. In this paper, we propose to utilize a pre-trained language model to learn text representations first. With scores calculated from the representations, mean square error loss and the batch-wise ListNet loss with dynamic weights constrain the scores simultaneously. We utilize Quadratic Weighted Kappa to evaluate our model on the Automated Student Assessment Prize dataset. Our model outperforms not only state-of-the-art neural models near 3 percent but also the latest statistic model. Especially on the two narrative prompts, our model performs much better than all other state-of-the-art models. + 2020.findings-emnlp.141 + + + Neural Dialogue State Tracking with Temporally Expressive Networks + JunfanChen + RichongZhang + YongyiMao + JieXu + 1570–1579 + Dialogue state tracking (DST) is an important part of a spoken dialogue system. Existing DST models either ignore temporal feature dependencies across dialogue turns or fail to explicitly model temporal state dependencies in a dialogue. In this work, we propose Temporally Expressive Networks (TEN) to jointly model the two types of temporal dependencies in DST. The TEN model utilizes the power of recurrent networks and probabilistic graphical models. Evaluating on standard datasets, TEN is demonstrated to improve the accuracy of turn-level-state prediction and the state aggregation. + 2020.findings-emnlp.142 + + + Inferring about fraudulent collusion risk on <fixed-case>B</fixed-case>razilian public works contracts in official texts using a <fixed-case>B</fixed-case>i-<fixed-case>LSTM</fixed-case> approach + MarcosLima + RobertaSilva + FelipeLopes de Souza Mendes + LeonardoR. 
de Carvalho + AleteiaAraujo + Flaviode Barros Vidal + 1580–1588 + Public works procurements move US$ 10 billion yearly in Brazil and are a preferred field for collusion and fraud. Federal Police and audit agencies investigate collusion (bid-rigging), over-pricing, and delivery fraud in this field and efforts have been employed to early detect fraud and collusion on public works procurements. The current automatic methods of fraud detection use structured data to classification and usually do not involve annotated data. The use of NLP for this kind of application is rare. Our work introduces a new dataset formed by public procurement calls available on Brazilian official journal (Diário Oficial da União), using by 15,132,968 textual entries of which 1,907 are annotated risky entries. Both bottleneck deep neural network and BiLSTM shown competitive compared with classical classifiers and achieved better precision (93.0% and 92.4%, respectively), which signs improvements in a criminal fraud investigation. + 2020.findings-emnlp.143 + + + Record-to-Text Generation with Style Imitation + ShuaiLin + WentaoWang + ZichaoYang + XiaodanLiang + Frank F.Xu + EricXing + ZhitingHu + 1589–1598 + Recent neural approaches to data-to-text generation have mostly focused on improving content fidelity while lacking explicit control over writing styles (e.g., sentence structures, word choices). More traditional systems use templates to determine the realization of text. Yet manual or automatic construction of high-quality templates is difficult, and a template acting as hard constraints could harm content fidelity when it does not match the record perfectly. We study a new way of stylistic control by using existing sentences as “soft” templates. That is, a model learns to imitate the writing style of any given exemplar sentence, with automatic adaptions to faithfully describe the record. The problem is challenging due to the lack of parallel data. 
We develop a neural approach that includes a hybrid attention-copy mechanism, learns with weak supervisions, and is enhanced with a new content coverage constraint. We conduct experiments in restaurants and sports domains. Results show our approach achieves stronger performance than a range of comparison methods. Our approach balances well between content fidelity and style control given exemplars that match the records to varying degrees. + 2020.findings-emnlp.144 + + + Teaching Machine Comprehension with Compositional Explanations + QinyuanYe + XiaoHuang + ElizabethBoschee + XiangRen + 1599–1615 + Advances in machine reading comprehension (MRC) rely heavily on the collection of large scale human-annotated examples in the form of (question, paragraph, answer) triples. In contrast, humans are typically able to generalize with only a few examples, relying on deeper underlying world knowledge, linguistic sophistication, and/or simply superior deductive powers. In this paper, we focus on “teaching” machines reading comprehension, using a small number of semi-structured explanations that explicitly inform machines why answer spans are correct. We extract structured variables and rules from explanations and compose neural module teachers that annotate instances for training downstream MRC models. We use learnable neural modules and soft logic to handle linguistic variation and overcome sparse coverage; the modules are jointly optimized with the MRC model to improve final performance. On the SQuAD dataset, our proposed method achieves 70.14% F1 score with supervision from 26 explanations, comparable to plain supervised learning using 1,100 labeled instances, yielding a 12x speed up. 
+ 2020.findings-emnlp.145 + + + A Knowledge-driven Approach to Classifying Object and Attribute Coreferences in Opinion Mining + JiahuaChen + ShuaiWang + SahisnuMazumder + BingLiu + 1616–1626 + Classifying and resolving coreferences of objects (e.g., product names) and attributes (e.g., product aspects) in opinionated reviews is crucial for improving the opinion mining performance. However, the task is challenging as one often needs to consider domain-specific knowledge (e.g., iPad is a tablet and has aspect resolution) to identify coreferences in opinionated reviews. Also, compiling a handcrafted and curated domain-specific knowledge base for each domain is very time consuming and arduous. This paper proposes an approach to automatically mine and leverage domain-specific knowledge for classifying objects and attribute coreferences. The approach extracts domain-specific knowledge from unlabeled review data and trains a knowledge-aware neural coreference classification model to leverage (useful) domain knowledge together with general commonsense knowledge for the task. Experimental evaluation on real-world datasets involving five domains (product types) shows the effectiveness of the approach. + 2020.findings-emnlp.146 + + + <fixed-case>S</fixed-case>im<fixed-case>A</fixed-case>lign: High Quality Word Alignments without Parallel Training Data using Static and Contextualized Embeddings + MasoudJalili Sabet + PhilippDufter + FrançoisYvon + HinrichSchütze + 1627–1643 + Word alignments are useful for tasks like statistical and neural machine translation (NMT) and cross-lingual annotation projection. Statistical word aligners perform well, as do methods that extract alignments jointly with translations in NMT. However, most approaches require parallel training data and quality decreases as less training data is available. We propose word alignment methods that require no parallel data.
The key idea is to leverage multilingual word embeddings – both static and contextualized – for word alignment. Our multilingual embeddings are created from monolingual data only without relying on any parallel data or dictionaries. We find that alignments created from embeddings are superior for four and comparable for two language pairs compared to those produced by traditional statistical aligners – even with abundant parallel data; e.g., contextualized embeddings achieve a word alignment F1 for English-German that is 5 percentage points higher than eflomal, a high-quality statistical aligner, trained on 100k parallel sentences. + 2020.findings-emnlp.147 + + + <fixed-case>T</fixed-case>weet<fixed-case>E</fixed-case>val: Unified Benchmark and Comparative Evaluation for Tweet Classification + FrancescoBarbieri + JoseCamacho-Collados + LuisEspinosa Anke + LeonardoNeves + 1644–1650 + The experimental landscape in natural language processing for social media is too fragmented. Each year, new shared tasks and datasets are proposed, ranging from classics like sentiment analysis to irony detection or emoji prediction. Therefore, it is unclear what the current state of the art is, as there is no standardized evaluation protocol, neither a strong set of baselines trained on such domain-specific data. In this paper, we propose a new evaluation framework (TweetEval) consisting of seven heterogeneous Twitter-specific classification tasks. We also provide a strong set of baselines as starting point, and compare different language modeling pre-training strategies. Our initial experiments show the effectiveness of starting off with existing pre-trained generic language models, and continue training them on Twitter corpora. + 2020.findings-emnlp.148 + + + Octa: Omissions and Conflicts in Target-Aspect Sentiment Analysis + ZheZhang + Chung-WeiHang + MunindarSingh + 1651–1662 + Sentiments in opinionated text are often determined by both aspects and target words (or targets). 
We observe that targets and aspects interrelate in subtle ways, often yielding conflicting sentiments. Thus, a naive aggregation of sentiments from aspects and targets treated separately, as in existing sentiment analysis models, impairs performance. We propose Octa, an approach that jointly considers aspects and targets when inferring sentiments. To capture and quantify relationships between targets and context words, Octa uses a selective self-attention mechanism that handles implicit or missing targets. Specifically, Octa involves two layers of attention mechanisms for, respectively, selective attention between targets and context words and attention over words based on aspects. On benchmark datasets, Octa outperforms leading models by a large margin, yielding (absolute) gains in accuracy of 1.6% to 4.3%. + 2020.findings-emnlp.149 + + + On the Language Neutrality of Pre-trained Multilingual Representations + JindřichLibovický + RudolfRosa + AlexanderFraser + 1663–1674 + Multilingual contextual embeddings, such as multilingual BERT and XLM-RoBERTa, have proved useful for many multi-lingual tasks. Previous work probed the cross-linguality of the representations indirectly using zero-shot transfer learning on morphological and syntactic tasks. We instead investigate the language-neutrality of multilingual contextual embeddings directly and with respect to lexical semantics. Our results show that contextual embeddings are more language-neutral and, in general, more informative than aligned static word-type embeddings, which are explicitly trained for language neutrality. Contextual embeddings are still only moderately language-neutral by default, so we propose two simple methods for achieving stronger language neutrality: first, by unsupervised centering of the representation for each language and second, by fitting an explicit projection on small parallel data. 
Besides, we show how to reach state-of-the-art accuracy on language identification and match the performance of statistical methods for word alignment of parallel sentences without using parallel data. + 2020.findings-emnlp.150 + 2020.findings-emnlp.150.OptionalSupplementaryMaterial.tgz + + + Cost-effective Selection of Pretraining Data: A Case Study of Pretraining <fixed-case>BERT</fixed-case> on Social Media + XiangDai + SarvnazKarimi + BenHachey + CecileParis + 1675–1681 + Recent studies on domain-specific BERT models show that effectiveness on downstream tasks can be improved when models are pretrained on in-domain data. Often, the pretraining data used in these models are selected based on their subject matter, e.g., biology or computer science. Given the range of applications using social media text, and its unique language variety, we pretrain two models on tweets and forum text respectively, and empirically demonstrate the effectiveness of these two resources. In addition, we investigate how similarity measures can be used to nominate in-domain pretraining data. We publicly release our pretrained models at https://bit.ly/35RpTf0. + 2020.findings-emnlp.151 + + + <fixed-case>T</fixed-case>opic<fixed-case>BERT</fixed-case> for Energy Efficient Document Classification + YatinChaudhary + PankajGupta + KhushbuSaxena + VivekKulkarni + ThomasRunkler + HinrichSchütze + 1682–1690 + Prior research notes that BERT’s computational cost grows quadratically with sequence length thus leading to longer training times, higher GPU memory constraints and carbon emissions. While recent work seeks to address these scalability issues at pre-training, these issues are also prominent in fine-tuning especially for long sequence tasks like document classification. Our work thus focuses on optimizing the computational cost of fine-tuning for document classification. We achieve this by complementary learning of both topic and language models in a unified framework, named TopicBERT. 
This significantly reduces the number of self-attention operations – a main performance bottleneck. Consequently, our model achieves a 1.4x ( 40%) speedup with 40% reduction in CO2 emission while retaining 99.9% performance over 5 datasets. + 2020.findings-emnlp.152 + + + Improving Constituency Parsing with Span Attention + YuanheTian + YanSong + FeiXia + TongZhang + 1691–1703 + Constituency parsing is a fundamental and important task for natural language understanding, where a good representation of contextual information can help this task. N-grams, which is a conventional type of feature for contextual information, have been demonstrated to be useful in many tasks, and thus could also be beneficial for constituency parsing if they are appropriately modeled. In this paper, we propose span attention for neural chart-based constituency parsing to leverage n-gram information. Considering that current chart-based parsers with Transformer-based encoder represent spans by subtraction of the hidden states at the span boundaries, which may cause information loss especially for long spans, we incorporate n-grams into span representations by weighting them according to their contributions to the parsing process. Moreover, we propose categorical span attention to further enhance the model by weighting n-grams within different length categories, and thus benefit long-sentence parsing. Experimental results on three widely used benchmark datasets demonstrate the effectiveness of our approach in parsing Arabic, Chinese, and English, where state-of-the-art performance is obtained by our approach on all of them. 
+ 2020.findings-emnlp.153 + 2020.findings-emnlp.153.OptionalSupplementaryMaterial.zip + + + Optimizing <fixed-case>BERT</fixed-case> for Unlabeled Text-Based Items Similarity + ItzikMalkiel + OrenBarkan + AviCaciularu + NoamRazin + OriKatz + NoamKoenigstein + 1704–1714 + Language models that utilize extensive self-supervised pre-training from unlabeled text, have recently shown to significantly advance the state-of-the-art performance in a variety of language understanding tasks. However, it is yet unclear if and how these recent models can be harnessed for conducting text-based recommendations. In this work, we introduce RecoBERT, a BERT-based approach for learning catalog-specialized language models for text-based item recommendations. We suggest novel training and inference procedures for scoring similarities between pairs of items, that don’t require item similarity labels. Both the training and the inference techniques were designed to utilize the unlabeled structure of textual catalogs, and minimize the discrepancy between them. By incorporating four scores during inference, RecoBERT can infer text-based item-to-item similarities more accurately than other techniques. In addition, we introduce a new language understanding task for wine recommendations using similarities based on professional wine reviews. As an additional contribution, we publish annotated recommendations dataset crafted by human wine experts. Finally, we evaluate RecoBERT and compare it to various state-of-the-art NLP models on wine and fashion recommendations tasks. + 2020.findings-emnlp.154 + 2020.findings-emnlp.154.OptionalSupplementaryMaterial.zip + + + Multi-Agent Mutual Learning at Sentence-Level and Token-Level for Neural Machine Translation + BaohaoLiao + YingboGao + HermannNey + 1715–1724 + Mutual learning, where multiple agents learn collaboratively and teach one another, has been shown to be an effective way to distill knowledge for image classification tasks. 
In this paper, we extend mutual learning to the machine translation task and operate at both the sentence-level and the token-level. Firstly, we co-train multiple agents by using the same parallel corpora. After convergence, each agent selects and learns its poorly predicted tokens from other agents. The poorly predicted tokens are determined by the acceptance-rejection sampling algorithm. Our experiments show that sequential mutual learning at the sentence-level and the token-level improves the results cumulatively. Absolute improvements compared to strong baselines are obtained on various translation tasks. On the IWSLT’14 German-English task, we get a new state-of-the-art BLEU score of 37.0. We also report a competitive result, 29.9 BLEU score, on the WMT’14 English-German task. + 2020.findings-emnlp.155 + + + <fixed-case>D</fixed-case>om<fixed-case>BERT</fixed-case>: Domain-oriented Language Model for Aspect-based Sentiment Analysis + HuXu + BingLiu + LeiShu + PhilipYu + 1725–1731 + This paper focuses on learning domain-oriented language models driven by end tasks, which aims to combine the worlds of both general-purpose language models (such as ELMo and BERT) and domain-specific language understanding. We propose DomBERT, an extension of BERT to learn from both in-domain corpus and relevant domain corpora. This helps in learning domain language models with low-resources. Experiments are conducted on an assortment of tasks in aspect-based sentiment analysis (ABSA), demonstrating promising results. + 2020.findings-emnlp.156 + + + <fixed-case>RMM</fixed-case>: A Recursive Mental Model for Dialog Navigation + HomeroRoman Roman + YonatanBisk + JesseThomason + AsliCelikyilmaz + JianfengGao + 1732–1745 + Language-guided robots must be able to both ask humans questions and understand answers. Much existing work focuses only on the latter. 
In this paper, we go beyond instruction following and introduce a two-agent task where one agent navigates and asks questions that a second, guiding agent answers. Inspired by theory of mind, we propose the Recursive Mental Model (RMM). The navigating agent models the guiding agent to simulate answers given candidate generated questions. The guiding agent in turn models the navigating agent to simulate navigation steps it would take to generate answers. We use the progress agents make towards the goal as a reinforcement learning reward signal to directly inform not only navigation actions, but also both question and answer generation. We demonstrate that RMM enables better generalization to novel environments. Interlocutor modelling may be a way forward for human-agent RMM where robots need to both ask and answer questions. + 2020.findings-emnlp.157 + + + Will this Idea Spread Beyond Academia? Understanding Knowledge Transfer of Scientific Concepts across Text Corpora + HanchengCao + MengjieCheng + ZhepengCen + DanielMcFarland + XiangRen + 1746–1757 + What kind of basic research ideas are more likely to get applied in practice? There is a long line of research investigating patterns of knowledge transfer, but it generally focuses on documents as the unit of analysis and follow their transfer into practice for a specific scientific domain. Here we study translational research at the level of scientific concepts for all scientific fields. We do this through text mining and predictive modeling using three corpora: 38.6 million paper abstracts, 4 million patent documents, and 0.28 million clinical trials. We extract scientific concepts (i.e., phrases) from corpora as instantiations of “research ideas”, create concept-level features as motivated by literature, and then follow the trajectories of over 450,000 new concepts (emerged from 1995-2014) to identify factors that lead only a small proportion of these ideas to be used in inventions and drug trials. 
Results from our analysis suggest several mechanisms that distinguish which scientific concept will be adopted in practice, and which will not. We also demonstrate that our derived features can be used to explain and predict knowledge transfer with high accuracy. Our work provides greater understanding of knowledge transfer for researchers, practitioners, and government agencies interested in encouraging translational research. + 2020.findings-emnlp.158 + + + Recurrent Inference in Text Editing + NingShi + ZihengZeng + HaotianZhang + YichenGong + 1758–1769 + In neural text editing, prevalent sequence-to-sequence based approaches directly map the unedited text either to the edited text or the editing operations, in which the performance is degraded by the limited source text encoding and long, varying decoding steps. To address this problem, we propose a new inference method, Recurrence, that iteratively performs editing actions, significantly narrowing the problem space. In each iteration, encoding the partially edited text, Recurrence decodes the latent representation, generates an action of short, fixed-length, and applies the action to complete a single edit. For a comprehensive comparison, we introduce three types of text editing tasks: Arithmetic Operators Restoration (AOR), Arithmetic Equation Simplification (AES), Arithmetic Equation Correction (AEC). Extensive experiments on these tasks with varying difficulties demonstrate that Recurrence achieves improvements over conventional inference methods. + 2020.findings-emnlp.159 + 2020.findings-emnlp.159.OptionalSupplementaryMaterial.zip + + + An Empirical Exploration of Local Ordering Pre-training for Structured Learning + ZhisongZhang + XiangKong + LoriLevin + EduardHovy + 1770–1783 + Recently, pre-training contextualized encoders with language model (LM) objectives has been shown an effective semi-supervised method for structured prediction. 
In this work, we empirically explore an alternative pre-training method for contextualized encoders. Instead of predicting words in LMs, we “mask out” and predict word order information, with a local ordering strategy and word-selecting objectives. With evaluations on three typical structured prediction tasks (dependency parsing, POS tagging, and NER) over four languages (English, Finnish, Czech, and Italian), we show that our method is consistently beneficial. We further conduct detailed error analysis, including one that examines a specific type of parsing error where the head is misidentified. The results show that pre-trained contextual encoders can bring improvements in a structured way, suggesting that they may be able to capture higher-order patterns and feature combinations from unlabeled data. + 2020.findings-emnlp.160 + + + Unsupervised Extractive Summarization by Pre-training Hierarchical Transformers + ShushengXu + XingxingZhang + YiWu + FuruWei + MingZhou + 1784–1795 + Unsupervised extractive document summarization aims to select important sentences from a document without using labeled summaries during training. Existing methods are mostly graph-based with sentences as nodes and edge weights measured by sentence similarities. In this work, we find that transformer attentions can be used to rank sentences for unsupervised extractive summarization. Specifically, we first pre-train a hierarchical transformer model using unlabeled documents only. Then we propose a method to rank sentences using sentence-level self-attentions and pre-training objectives. Experiments on CNN/DailyMail and New York Times datasets show our model achieves state-of-the-art performance on unsupervised summarization. We also find in experiments that our model is less dependent on sentence positions. When using a linear combination of our model and a recent unsupervised model explicitly modeling sentence positions, we obtain even better results. 
+ 2020.findings-emnlp.161 + 2020.findings-emnlp.161.OptionalSupplementaryMaterial.pdf + + + Active Learning Approaches to Enhancing Neural Machine Translation: An Empirical Study + YuekaiZhao + HaoranZhang + ShuchangZhou + ZhihuaZhang + 1796–1806 + Active learning is an efficient approach for mitigating data dependency when training neural machine translation (NMT) models. In this paper, we explore new training frameworks by incorporating active learning into various techniques such as transfer learning and iterative back-translation (IBT) under a limited human translation budget. We design a word frequency based acquisition function and combine it with a strong uncertainty based method. The combined method steadily outperforms all other acquisition functions in various scenarios. As far as we know, we are the first to do a large-scale study on actively training Transformer for NMT. Specifically, with a human translation budget of only 20% of the original parallel corpus, we manage to surpass Transformer trained on the entire parallel corpus in three language pairs. + 2020.findings-emnlp.162 + 2020.findings-emnlp.162.OptionalSupplementaryMaterial.pdf + + + Towards Fine-Grained Transfer: An Adaptive Graph-Interactive Framework for Joint Multiple Intent Detection and Slot Filling + LiboQin + XiaoXu + WanxiangChe + TingLiu + 1807–1816 + In real-world scenarios, users usually have multiple intents in the same utterance. Unfortunately, most spoken language understanding (SLU) models either mainly focused on the single intent scenario, or simply incorporated an overall intent context vector for all tokens, ignoring the fine-grained multiple intents information integration for token-level slot prediction. In this paper, we propose an Adaptive Graph-Interactive Framework (AGIF) for joint multiple intent detection and slot filling, where we introduce an intent-slot graph interaction layer to model the strong correlation between the slot and intents. 
Such an interaction layer is applied to each token adaptively, which has the advantage to automatically extract the relevant intents information, making a fine-grained intent information integration for the token-level slot prediction. Experimental results on three multi-intent datasets show that our framework obtains substantial improvement and achieves the state-of-the-art performance. In addition, our framework achieves new state-of-the-art performance on two single-intent datasets. + 2020.findings-emnlp.163 + + + Continual Learning Long Short Term Memory + XinGuo + YuTian + QinghanXue + PanosLampropoulos + StevenEliuk + KennethBarner + XiaolongWang + 1817–1822 + Catastrophic forgetting in neural networks indicates the performance decreasing of deep learning models on previous tasks while learning new tasks. To address this problem, we propose a novel Continual Learning Long Short Term Memory (CL-LSTM) cell in Recurrent Neural Network (RNN) in this paper. CL-LSTM considers not only the state of each individual task’s output gates but also the correlation of the states between tasks, so that the deep learning models can incrementally learn new tasks without catastrophically forgetting previously tasks. Experimental results demonstrate significant improvements of CL-LSTM over state-of-the-art approaches on spoken language understanding (SLU) tasks. + 2020.findings-emnlp.164 + 2020.findings-emnlp.164.OptionalSupplementaryMaterial.pdf + + + <fixed-case>C</fixed-case>ommon<fixed-case>G</fixed-case>en: A Constrained Text Generation Challenge for Generative Commonsense Reasoning + Bill YuchenLin + WangchunshuZhou + MingShen + PeiZhou + ChandraBhagavatula + YejinChoi + XiangRen + 1823–1840 + Recently, large-scale pre-trained language models have demonstrated impressive performance on several commonsense-reasoning benchmark datasets. However, building machines with commonsense to compose realistically plausible sentences remains challenging. 
In this paper, we present a constrained text generation task, CommonGen, associated with a benchmark dataset, to explicitly test machines for the ability of generative commonsense reasoning. Given a set of common concepts (e.g., dog, frisbee, catch, throw), the task is to generate a coherent sentence describing an everyday scenario using these concepts (e.g., “a man throws a frisbee and his dog catches it”). The CommonGen task is challenging because it inherently requires 1) relational reasoning with background commonsense knowledge and 2) compositional generalization ability to work on unseen concept combinations. Our dataset, constructed through a combination of crowdsourced and existing caption corpora, consists of 77k commonsense descriptions over 35k unique concept-sets. Experiments show that there is a large gap between state-of-the-art text generation models (e.g., T5) and human performance (31.6% vs. 63.5% in SPICE metric). Furthermore, we demonstrate that the learned generative commonsense reasoning capability can be transferred to improve downstream tasks such as CommonsenseQA (76.9% to 78.4% in dev accuracy) by generating additional context. + 2020.findings-emnlp.165 + + + Constrained Decoding for Computationally Efficient Named Entity Recognition Taggers + BrianLester + DanielPressel + AmyHemmeter + SagnikRay Choudhury + SrinivasBangalore + 1841–1848 + Current state-of-the-art models for named entity recognition (NER) are neural models with a conditional random field (CRF) as the final layer. Entities are represented as per-token labels with a special structure in order to decode them into spans. Current work eschews prior knowledge of how the span encoding scheme works and relies on the CRF learning which transitions are illegal and which are not to facilitate global coherence. 
We find that by constraining the output to suppress illegal transitions we can train a tagger with a cross-entropy loss twice as fast as a CRF with differences in F1 that are statistically insignificant, effectively eliminating the need for a CRF. We analyze the dynamics of tag co-occurrence to explain when these constraints are most effective and provide open source implementations of our tagger in both PyTorch and TensorFlow. + 2020.findings-emnlp.166 + 2020.findings-emnlp.166.OptionalSupplementaryMaterial.zip + + + On the Potential of Lexico-logical Alignments for Semantic Parsing to <fixed-case>SQL</fixed-case> Queries + TianzeShi + ChenZhao + JordanBoyd-Graber + HalDaumé III + LillianLee + 1849–1864 + Large-scale semantic parsing datasets annotated with logical forms have enabled major advances in supervised approaches. But can richer supervision help even more? To explore the utility of fine-grained, lexical-level supervision, we introduce SQUALL, a dataset that enriches 11,276 WIKITABLEQUESTIONS English-language questions with manually created SQL equivalents plus alignments between SQL and question fragments. Our annotation enables new training possibilities for encoder-decoder models, including approaches from machine translation previously precluded by the absence of alignments. We propose and test two methods: (1) supervised attention; (2) adopting an auxiliary objective of disambiguating references in the input queries to table columns. In 5-fold cross validation, these strategies improve over strong baselines by 4.4% execution accuracy. Oracle experiments suggest that annotated alignments can support further accuracy gains of up to 23.9%. 
+ 2020.findings-emnlp.167 + + + <fixed-case>TED</fixed-case>: A Pretrained Unsupervised Summarization Model with Theme Modeling and Denoising + ZiyiYang + ChenguangZhu + RobertGmyr + MichaelZeng + XuedongHuang + EricDarve + 1865–1874 + Text summarization aims to extract essential information from a piece of text and transform the text into a concise version. Existing unsupervised abstractive summarization models leverage recurrent neural networks framework while the recently proposed transformer exhibits much more capability. Moreover, most of previous summarization models ignore abundant unlabeled corpora resources available for pretraining. In order to address these issues, we propose TED, a transformer-based unsupervised abstractive summarization system with pretraining on large-scale data. We first leverage the lead bias in news articles to pretrain the model on millions of unlabeled corpora. Next, we finetune TED on target domains through theme modeling and a denoising autoencoder to enhance the quality of generated summaries. Notably, TED outperforms all unsupervised abstractive baselines on NYT, CNN/DM and English Gigaword datasets with various document styles. Further analysis shows that the summaries generated by TED are highly abstractive, and each component in the objective function of TED is highly effective. + 2020.findings-emnlp.168 + 2020.findings-emnlp.168.OptionalSupplementaryMaterial.pdf + + + Improving End-to-End <fixed-case>B</fixed-case>angla Speech Recognition with Semi-supervised Training + NafisSadeq + Nafis TahmidChowdhury + Farhan TanvirUtshaw + ShafayatAhmed + Muhammad AbdullahAdnan + 1875–1883 + Automatic speech recognition systems usually require large annotated speech corpus for training. The manual annotation of a large corpus is very difficult. It can be very helpful to use unsupervised and semi-supervised learning methods in addition to supervised learning. 
In this work, we focus on using a semi-supervised training approach for Bangla Speech Recognition that can exploit large unpaired audio and text data. We encode speech and text data in an intermediate domain and propose a novel loss function based on the global encoding distance between encoded data to guide the semi-supervised training. Our proposed method reduces the Word Error Rate (WER) of the system from 37% to 31.9%. + 2020.findings-emnlp.169 + + + No Gestures Left Behind: Learning Relationships between Spoken Language and Freeform Gestures + ChaitanyaAhuja + Dong WonLee + RyoIshii + Louis-PhilippeMorency + 1884–1895 + We study relationships between spoken language and co-speech gestures in context of two key challenges. First, distributions of text and gestures are inherently skewed making it important to model the long tail. Second, gesture predictions are made at a subword level, making it important to learn relationships between language and acoustic cues. We introduce AISLe, which combines adversarial learning with importance sampling to strike a balance between precision and coverage. We propose the use of a multimodal multiscale attention block to perform subword alignment without the need of explicit alignment between language and acoustic cues. Finally, to empirically study the importance of language in this task, we extend the dataset proposed in Ahuja et al. (2020) with automatically extracted transcripts for audio signals. We substantiate the effectiveness of our approach through large-scale quantitative and user studies, which show that our proposed methodology significantly outperforms previous state-of-the-art approaches for gesture generation. 
Link to code, data and videos: https://github.com/chahuja/aisle + 2020.findings-emnlp.170 + 2020.findings-emnlp.170.OptionalSupplementaryMaterial.pdf + + + <fixed-case>U</fixed-case>nified<fixed-case>QA</fixed-case>: Crossing Format Boundaries With a Single <fixed-case>QA</fixed-case> System + DanielKhashabi + SewonMin + TusharKhot + AshishSabharwal + OyvindTafjord + PeterClark + HannanehHajishirzi + 1896–1907 + Question answering (QA) tasks have been posed using a variety of formats, such as extractive span selection, multiple choice, etc. This has led to format-specialized models, and even to an implicit division in the QA community. We argue that such boundaries are artificial and perhaps unnecessary, given the reasoning abilities we seek to teach are not governed by the format. As evidence, we use the latest advances in language modeling to build a single pre-trained QA model, UNIFIEDQA, that performs well across 19 QA datasets spanning 4 diverse formats. UNIFIEDQA performs on par with 8 different models that were trained on individual datasets themselves. Even when faced with 12 unseen datasets of observed formats, UNIFIEDQA performs surprisingly well, showing strong generalization from its out-of-format training data. Finally, simply finetuning this pre-trained QA model into specialized models results in a new state of the art on 10 factoid and commonsense question answering datasets, establishing UNIFIEDQA as a strong starting point for building QA systems. + 2020.findings-emnlp.171 + 2020.findings-emnlp.171.OptionalSupplementaryMaterial.pdf + + + Robust and Interpretable Grounding of Spatial References with Relation Networks + Tsung-YenYang + AndrewLan + KarthikNarasimhan + 1908–1923 + Learning representations of spatial references in natural language is a key challenge in tasks like autonomous navigation and robotic manipulation. Recent work has investigated various neural architectures for learning multi-modal representations for spatial concepts. 
However, the lack of explicit reasoning over entities makes such approaches vulnerable to noise in input text or state observations. In this paper, we develop effective models for understanding spatial references in text that are robust and interpretable, without sacrificing performance. We design a text-conditioned relation network whose parameters are dynamically computed with a cross-modal attention module to capture fine-grained spatial relations between entities. This design choice provides interpretability of learned intermediate outputs. Experiments across three tasks demonstrate that our model achieves superior performance, with a 17% improvement in predicting goal locations and a 15% improvement in robustness compared to state-of-the-art systems. + 2020.findings-emnlp.172 + + + Pragmatic Issue-Sensitive Image Captioning + AllenNie + ReubenCohn-Gordon + ChristopherPotts + 1924–1938 + Image captioning systems need to produce texts that are not only true but also relevant in that they are properly aligned with the current issues. For instance, in a newspaper article about a sports event, a caption that not only identifies the player in a picture but also comments on their ethnicity could create unwanted reader reactions. To address this, we propose Issue-Sensitive Image Captioning (ISIC). In ISIC, the captioner is given a target image and an issue, which is a set of images partitioned in a way that specifies what information is relevant. For the sports article, we could construct a partition that places images into equivalence classes based on player position. To model this task, we use an extension of the Rational Speech Acts model. Our extension is built on top of state-of-the-art pretrained neural image captioners and explicitly uses image partitions to control caption generation. In both automatic and human evaluations, we show that these models generate captions that are descriptive and issue-sensitive. 
Finally, we show how ISIC can complement and enrich the related task of Visual Question Answering. + 2020.findings-emnlp.173 + + + <fixed-case>PTUM</fixed-case>: Pre-training User Model from Unlabeled User Behaviors via Self-supervision + ChuhanWu + FangzhaoWu + TaoQi + JianxunLian + YongfengHuang + XingXie + 1939–1944 + User modeling is critical for many personalized web services. Many existing methods model users based on their behaviors and the labeled data of target tasks. However, these methods cannot exploit useful information in unlabeled user behavior data, and their performance may be not optimal when labeled data is scarce. Motivated by pre-trained language models which are pre-trained on large-scale unlabeled corpus to empower many downstream tasks, in this paper we propose to pre-train user models from large-scale unlabeled user behaviors data. We propose two self-supervision tasks for user model pre-training. The first one is masked behavior prediction, which can model the relatedness between historical behaviors. The second one is next K behavior prediction, which can model the relatedness between past and future behaviors. The pre-trained user models are finetuned in downstream tasks to learn task-specific user representations. Experimental results on two real-world datasets validate the effectiveness of our proposed user model pre-training method. + 2020.findings-emnlp.174 + + + Adversarial Subword Regularization for Robust Neural Machine Translation + JungsooPark + MujeenSung + JinhyukLee + JaewooKang + 1945–1953 + Exposing diverse subword segmentations to neural machine translation (NMT) models often improves the robustness of machine translation as NMT models can experience various subword candidates. However, the diversification of subword segmentations mostly relies on the pre-trained subword language models from which erroneous segmentations of unseen words are less likely to be sampled. 
In this paper, we present adversarial subword regularization (ADVSR) to study whether gradient signals during training can be a substitute criterion for exposing diverse subword segmentations. We experimentally show that our model-based adversarial samples effectively encourage NMT models to be less sensitive to segmentation errors and improve the performance of NMT models in low-resource and out-domain datasets. + 2020.findings-emnlp.175 + + + Learning Visual-Semantic Embeddings for Reporting Abnormal Findings on Chest <fixed-case>X</fixed-case>-rays + JianmoNi + Chun-NanHsu + AmilcareGentili + JulianMcAuley + 1954–1960 + Automatic medical image report generation has drawn growing attention due to its potential to alleviate radiologists’ workload. Existing work on report generation often trains encoder-decoder networks to generate complete reports. However, such models are affected by data bias (e.g. label imbalance) and face common issues inherent in text generation models (e.g. repetition). In this work, we focus on reporting abnormal findings on radiology images; instead of training on complete radiology reports, we propose a method to identify abnormal findings from the reports in addition to grouping them with unsupervised clustering and minimal rules. We formulate the task as cross-modal retrieval and propose Conditional Visual-Semantic Embeddings to align images and fine-grained abnormal findings in a joint embedding space. We demonstrate that our method is able to retrieve abnormal findings and outperforms existing generation models on both clinical correctness and text generation metrics. 
+ 2020.findings-emnlp.176 + + + <fixed-case>S</fixed-case>yn<fixed-case>ET</fixed-case>: Synonym Expansion using Transitivity + JialeYu + YongliangShen + XinyinMa + ChenghaoJia + ChenChen + WeimingLu + 1961–1970 + In this paper, we study a new task of synonym expansion using transitivity, and propose a novel approach named SynET, which considers both the contexts of two given synonym pairs. It introduces an auxiliary task to reduce the impact of noisy sentences, and proposes a Multi-Perspective Entity Matching Network to match entities from multiple perspectives. Extensive experiments on a real-world dataset show the effectiveness of our approach. + 2020.findings-emnlp.177 + + + Scheduled <fixed-case>D</fixed-case>rop<fixed-case>H</fixed-case>ead: A Regularization Method for Transformer Models + WangchunshuZhou + TaoGe + FuruWei + MingZhou + KeXu + 1971–1980 + We introduce DropHead, a structured dropout method specifically designed for regularizing the multi-head attention mechanism which is a key component of transformer. In contrast to the conventional dropout mechanism which randomly drops units or connections, DropHead drops entire attention heads during training to prevent the multi-head attention model from being dominated by a small portion of attention heads. It can help reduce the risk of overfitting and allow the models to better benefit from the multi-head attention. Given the interaction between multi-headedness and training dynamics, we further propose a novel dropout rate scheduler to adjust the dropout rate of DropHead throughout training, which results in a better regularization effect. Experimental results demonstrate that our proposed approach can improve transformer models by 0.9 BLEU score on WMT14 En-De translation task and around 1.0 accuracy for various text classification tasks. 
+ 2020.findings-emnlp.178 + + + Multi-Turn Dialogue Generation in <fixed-case>E</fixed-case>-Commerce Platform with the Context of Historical Dialogue + WeiShengZhang + KaisongSong + YangyangKang + ZhongqingWang + ChanglongSun + XiaozhongLiu + ShoushanLi + MinZhang + LuoSi + 1981–1990 + As an important research topic, customer service dialogue generation tends to generate generic seller responses by leveraging current dialogue information. In this study, we propose a novel and extensible dialogue generation method by leveraging sellers’ historical dialogue information, which can be both accessible and informative. By utilizing innovative historical dialogue representation learning and historical dialogue selection mechanism, the proposed model is capable of detecting most related responses from sellers’ historical dialogues, which can further enhance the current dialogue generation quality. Unlike prior dialogue generation efforts, we treat each seller’s historical dialogues as a list of Customer-Seller utterance pairs and allow the model to measure their different importance, and copy words directly from most relevant pairs. Extensive experimental results show that the proposed approach can generate high-quality responses that cater to specific sellers’ characteristics and exhibit consistent superiority over baselines on a real-world multi-turn customer service dialogue dataset. + 2020.findings-emnlp.179 + + + Automatically Identifying Gender Issues in Machine Translation using Perturbations + HilaGonen + KellieWebster + 1991–1995 + The successful application of neural methods to machine translation has realized huge quality advances for the community. With these improvements, many have noted outstanding challenges, including the modeling and treatment of gendered language. While previous studies have identified issues using synthetic examples, we develop a novel technique to mine examples from real world data to explore challenges for deployed systems. 
We use our method to compile an evaluation benchmark spanning examples for four languages from three language families, which we publicly release to facilitate research. The examples in our benchmark expose where model representations are gendered, and the unintended consequences these gendered representations can have in downstream application. + 2020.findings-emnlp.180 + + + Ruler: Data Programming by Demonstration for Document Labeling + SaraEvensen + ChangGe + CagatayDemiralp + 1996–2005 + Data programming aims to reduce the cost of curating training data by encoding domain knowledge as labeling functions over source data. As such it not only requires domain expertise but also programming experience, a skill that many subject matter experts lack. Additionally, generating functions by enumerating rules is not only time consuming but also inherently difficult, even for people with programming experience. In this paper we introduce Ruler, an interactive system that synthesizes labeling rules using span-level interactive demonstrations over document examples. Ruler is a first-of-a-kind implementation of data programming by demonstration (DPBD). This new framework aims to relieve users from the burden of writing labeling functions, enabling them to focus on higher-level semantic analysis, such as identifying relevant signals for the labeling task. We compare Ruler with conventional data programming through a user study conducted with 10 data scientists who were asked to create labeling functions for sentiment and spam classification tasks. Results show Ruler is easier to learn and to use, and that it offers higher overall user-satisfaction while providing model performances comparable to those achieved by conventional data programming. 
+ 2020.findings-emnlp.181 + + + Dual Reconstruction: a Unifying Objective for Semi-Supervised Neural Machine Translation + WeijiaXu + XingNiu + MarineCarpuat + 2006–2020 + While Iterative Back-Translation and Dual Learning effectively incorporate monolingual training data in neural machine translation, they use different objectives and heuristic gradient approximation strategies, and have not been extensively compared. We introduce a novel dual reconstruction objective that provides a unified view of Iterative Back-Translation and Dual Learning. It motivates a theoretical analysis and controlled empirical study on German-English and Turkish-English tasks, which both suggest that Iterative Back-Translation is more effective than Dual Learning despite its relative simplicity. + 2020.findings-emnlp.182 + + + Focus-Constrained Attention Mechanism for <fixed-case>CVAE</fixed-case>-based Response Generation + ZhiCui + YanranLi + JiayiZhang + JianweiCui + ChenWei + BinWang + 2021–2030 + To model diverse responses for a given post, one promising way is to introduce a latent variable into Seq2Seq models. The latent variable is supposed to capture the discourse-level information and encourage the informativeness of target responses. However, such discourse-level information is often too coarse for the decoder to be utilized. To tackle it, our idea is to transform the coarse-grained discourse-level information into fine-grained word-level information. Specifically, we firstly measure the semantic concentration of corresponding target response on the post words by introducing a fine-grained focus signal. Then, we propose a focus-constrained attention mechanism to take full advantage of focus in well aligning the input to the target response. The experimental results demonstrate that by exploiting the fine-grained signal, our model can generate more diverse and informative responses compared with several state-of-the-art models. 
+ 2020.findings-emnlp.183 + + + Chunk-based <fixed-case>C</fixed-case>hinese Spelling Check with Global Optimization + ZuyiBao + ChenLi + RuiWang + 2031–2040 + Chinese spelling check is a challenging task due to the characteristics of the Chinese language, such as the large character set, no word boundary, and short word length. On the one hand, most of the previous works only consider corrections with similar character pronunciation or shape, failing to correct visually and phonologically irrelevant typos. On the other hand, pipeline-style architectures are widely adopted to deal with different types of spelling errors in individual modules, which is difficult to optimize. In order to handle these issues, in this work, 1) we extend the traditional confusion sets with semantical candidates to cover different types of errors; 2) we propose a chunk-based framework to correct single-character and multi-character word errors uniformly; and 3) we adopt a global optimization strategy to enable a sentence-level correction selection. The experimental results show that the proposed approach achieves a new state-of-the-art performance on three benchmark datasets, as well as an optical character recognition dataset. + 2020.findings-emnlp.184 + + + Multi-pretraining for Large-scale Text Classification + Kang-MinKim + BumsuHyeon + YeachanKim + Jun-HyungPark + SangKeunLee + 2041–2050 + Deep neural network-based pretraining methods have achieved impressive results in many natural language processing tasks including text classification. However, their applicability to large-scale text classification with numerous categories (e.g., several thousands) is yet to be well-studied, where the training data is insufficient and skewed in terms of categories. In addition, existing pretraining methods usually involve excessive computation and memory overheads. In this paper, we develop a novel multi-pretraining framework for large-scale text classification. 
This multi-pretraining framework includes both a self-supervised pretraining and a weakly supervised pretraining. We newly introduce an out-of-context words detection task on the unlabeled data as the self-supervised pretraining. It captures the topic-consistency of words used in sentences, which is proven to be useful for text classification. In addition, we propose a weakly supervised pretraining, where labels for text classification are obtained automatically from an existing approach. Experimental results clearly show that both pretraining approaches are effective for large-scale text classification task. The proposed scheme exhibits significant improvements as much as 3.8% in terms of macro-averaging F1-score over strong pretraining methods, while being computationally efficient. + 2020.findings-emnlp.185 + + + End-to-End Speech Recognition and Disfluency Removal + PariaJamshid Lou + MarkJohnson + 2051–2061 + Disfluency detection is usually an intermediate step between an automatic speech recognition (ASR) system and a downstream task. By contrast, this paper aims to investigate the task of end-to-end speech recognition and disfluency removal. We specifically explore whether it is possible to train an ASR model to directly map disfluent speech into fluent transcripts, without relying on a separate disfluency detection model. We show that end-to-end models do learn to directly generate fluent transcripts; however, their performance is slightly worse than a baseline pipeline approach consisting of an ASR system and a specialized disfluency detection model. We also propose two new metrics for evaluating integrated ASR and disfluency removal models. The findings of this paper can serve as a benchmark for further research on the task of end-to-end speech recognition and disfluency removal in the future. 
+ 2020.findings-emnlp.186 + + + Characterizing the Value of Information in Medical Notes + Chao-ChunHsu + ShantanuKarnwal + SendhilMullainathan + ZiadObermeyer + ChenhaoTan + 2062–2072 + Machine learning models depend on the quality of input data. As electronic health records are widely adopted, the amount of data in health care is growing, along with complaints about the quality of medical notes. We use two prediction tasks, readmission prediction and in-hospital mortality prediction, to characterize the value of information in medical notes. We show that as a whole, medical notes only provide additional predictive power over structured information in readmission prediction. We further propose a probing framework to select parts of notes that enable more accurate predictions than using all notes, despite that the selected information leads to a distribution shift from the training data (“all notes”). Finally, we demonstrate that models trained on the selected valuable information achieve even better predictive performance, with only 6.8% of all the tokens for readmission prediction. + 2020.findings-emnlp.187 + 2020.findings-emnlp.187.OptionalSupplementaryMaterial.zip + + + <fixed-case>KL</fixed-case>earn: Background Knowledge Inference from Summarization Data + MaximePeyrard + RobertWest + 2073–2085 + The goal of text summarization is to compress documents to the relevant information while excluding background information already known to the receiver. So far, summarization researchers have given considerably more attention to relevance than to background knowledge. In contrast, this work puts background knowledge in the foreground. Building on the realization that the choices made by human summarizers and annotators contain implicit information about their background knowledge, we develop and compare techniques for inferring background knowledge from summarization data. 
Based on this framework, we define summary scoring functions that explicitly model background knowledge, and show that these scoring functions fit human judgments significantly better than baselines. We illustrate some of the many potential applications of our framework. First, we provide insights into human information importance priors. Second, we demonstrate that averaging the background knowledge of multiple, potentially biased annotators or corpora greatly improves summary-scoring performance. Finally, we discuss potential applications of our framework beyond summarization. + 2020.findings-emnlp.188 + + + Extracting Chemical–Protein Interactions via Calibrated Deep Neural Network and Self-training + DonghaChoi + HyunjuLee + 2086–2095 + The extraction of interactions between chemicals and proteins from several biomedical articles is important in many fields of biomedical research such as drug development and prediction of drug side effects. Several natural language processing methods, including deep neural network (DNN) models, have been applied to address this problem. However, these methods were trained with hard-labeled data, which tend to become over-confident, leading to degradation of the model reliability. To estimate the data uncertainty and improve the reliability, “calibration” techniques have been applied to deep learning models. In this study, to extract chemical–protein interactions, we propose a DNN-based approach incorporating uncertainty information and calibration techniques. Our model first encodes the input sequence using a pre-trained language-understanding model, following which it is trained using two calibration methods: mixup training and addition of a confidence penalty loss. Finally, the model is re-trained with augmented data that are extracted using the estimated uncertainties. 
Our approach has achieved state-of-the-art performance with regard to the Biocreative VI ChemProt task, while preserving higher calibration abilities than those of previous approaches. Furthermore, our approach also presents the possibilities of using uncertainty estimation for performance improvement. + 2020.findings-emnlp.189 + 2020.findings-emnlp.189.OptionalSupplementaryMaterial.zip + + + <fixed-case>L</fixed-case>ogic2<fixed-case>T</fixed-case>ext: High-Fidelity Natural Language Generation from Logical Forms + ZhiyuChen + WenhuChen + HanwenZha + XiyouZhou + YunkaiZhang + SairamSundaresan + William YangWang + 2096–2111 + Previous studies on Natural Language Generation (NLG) from structured data have primarily focused on surface-level descriptions of record sequences. However, for complex structured data, e.g., multi-row tables, it is often desirable for an NLG system to describe interesting facts from logical inferences across records. If only provided with the table, it is hard for existing models to produce controllable and high-fidelity logical generations. In this work, we formulate high-fidelity NLG as generation from logical forms in order to obtain controllable and faithful generations. We present a new large-scale dataset, Logic2Text, with 10,753 descriptions involving common logic types paired with the underlying logical forms. The logical forms show diversified graph structure of free schema, which pose great challenges on the model’s ability to understand the semantics. We experiment on (1) Fully-supervised training with the full datasets, and (2) Few-shot setting, provided with hundreds of paired examples; We compare several popular generation models and analyze their performances. We hope our dataset can encourage research towards building an advanced NLG system capable of natural, faithful, and human-like generation. The dataset and code is available at https://github.com/czyssrs/Logic2Text. 
+ 2020.findings-emnlp.190 + + + <fixed-case>M</fixed-case>ed<fixed-case>IC</fixed-case>a<fixed-case>T</fixed-case>: A Dataset of Medical Images, Captions, and Textual References + SanjaySubramanian + Lucy LuWang + BenBogin + SachinMehta + Madeleinevan Zuylen + SravanthiParasa + SameerSingh + MattGardner + HannanehHajishirzi + 2112–2120 + Understanding the relationship between figures and text is key to scientific document understanding. Medical figures in particular are quite complex, often consisting of several subfigures (75% of figures in our dataset), with detailed text describing their content. Previous work studying figures in scientific papers focused on classifying figure content rather than understanding how images relate to the text. To address challenges in figure retrieval and figure-to-text alignment, we introduce MedICaT, a dataset of medical images in context. MedICaT consists of 217K images from 131K open access biomedical papers, and includes captions, inline references for 74% of figures, and manually annotated subfigures and subcaptions for a subset of figures. Using MedICaT, we introduce the task of subfigure to subcaption alignment in compound figures and demonstrate the utility of inline references in image-text matching. Our data and code can be accessed at https://github.com/allenai/medicat. + 2020.findings-emnlp.191 + 2020.findings-emnlp.191.OptionalSupplementaryMaterial.zip + + + <fixed-case>TSDG</fixed-case>: Content-aware Neural Response Generation with Two-stage Decoding Process + JunshengKong + ZhichengZhong + YiCai + XinWu + DaRen + 2121–2126 + Neural response generative models have achieved remarkable progress in recent years but tend to yield irrelevant and uninformative responses. One of the reasons is that encoder-decoder based models always use a single decoder to generate a complete response at a stroke. 
This tends to generate high-frequency function words with less semantic information rather than low-frequency content words with more semantic information. To address this issue, we propose a content-aware model with two-stage decoding process named Two-stage Dialogue Generation (TSDG). We separate the decoding process of content words and function words so that content words can be generated independently without the interference of function words. Experimental results on two datasets indicate that our model significantly outperforms several competitive generative models in terms of automatic and human evaluation. + 2020.findings-emnlp.192 + + + Unsupervised Cross-Lingual Adaptation of Dependency Parsers Using <fixed-case>CRF</fixed-case> Autoencoders + ZhaoLi + KeweiTu + 2127–2133 + We consider the task of cross-lingual adaptation of dependency parsers without annotated target corpora and parallel corpora. Previous work either directly applies a discriminative source parser to the target language, ignoring unannotated target corpora, or employs an unsupervised generative parser that can leverage unannotated target data but has weaker representational power than discriminative parsers. In this paper, we propose to utilize unsupervised discriminative parsers based on the CRF autoencoder framework for this task. We train a source parser and use it to initialize and regularize a target parser that is trained on unannotated target data. We conduct experiments that transfer an English parser to 20 target languages. The results show that our method significantly outperforms previous methods. + 2020.findings-emnlp.193 + + + Diversify Question Generation with Continuous Content Selectors and Question Type Modeling + ZhenWang + SiweiRao + JieZhang + ZhenQin + GuangjianTian + JunWang + 2134–2143 + Generating questions based on answers and relevant contexts is a challenging task. Recent work mainly pays attention to the quality of a single generated question. 
However, question generation is actually a one-to-many problem, as it is possible to raise questions with different focuses on contexts and various means of expression. In this paper, we explore the diversity of question generation and come up with methods from these two aspects. Specifically, we relate contextual focuses with content selectors, which are modeled by a continuous latent variable with the technique of conditional variational auto-encoder (CVAE). In the realization of CVAE, a multimodal prior distribution is adopted to allow for more diverse content selectors. To take into account various means of expression, question types are explicitly modeled and a diversity-promoting algorithm is proposed further. Experimental results on public datasets show that our proposed method can significantly improve the diversity of generated questions, especially from the perspective of using different question types. Overall, our proposed method achieves a better trade-off between generation quality and diversity compared with existing approaches. + 2020.findings-emnlp.194 + + + Participatory Research for Low-resourced Machine Translation: A Case Study in <fixed-case>A</fixed-case>frican Languages + WilhelminaNekoto + VukosiMarivate + TshinondiwaMatsila + TimiFasubaa + TaiwoFagbohungbe + Solomon OluwoleAkinola + ShamsuddeenMuhammad + SalomonKabongo Kabenamualu + SalomeyOsei + FreshiaSackey + Rubungo AndreNiyongabo + RickyMacharm + PerezOgayo + OrevaogheneAhia + Musie MeressaBerhe + MofetoluwaAdeyemi + MasabataMokgesi-Selinga + LawrenceOkegbemi + LauraMartinus + KolawoleTajudeen + KevinDegila + KelechiOgueji + KathleenSiminyu + JuliaKreutzer + JasonWebster + Jamiil ToureAli + JadeAbbott + IroroOrife + IgnatiusEzeani + Idris AbdulkadirDangana + HermanKamper + HadyElsahar + GoodnessDuru + GhollahKioko + MurhabaziEspoir + Elanvan Biljon + DanielWhitenack + ChristopherOnyefuluchi + Chris ChinenyeEmezue + Bonaventure F. 
P.Dossou + BlessingSibanda + BlessingBassey + AyodeleOlabiyi + ArshathRamkilowan + AlpÖktem + AdewaleAkinfaderin + AbdallahBashir + 2144–2160 + Research in NLP lacks geographic diversity, and the question of how NLP can be scaled to low-resourced languages has not yet been adequately solved. ‘Low-resourced’-ness is a complex problem going beyond data availability and reflects systemic problems in society. In this paper, we focus on the task of Machine Translation (MT), that plays a crucial role for information accessibility and communication worldwide. Despite immense improvements in MT over the past decade, MT is centered around a few high-resourced languages. As MT researchers cannot solve the problem of low-resourcedness alone, we propose participatory research as a means to involve all necessary agents required in the MT development process. We demonstrate the feasibility and scalability of participatory research with a case study on MT for African languages. Its implementation leads to a collection of novel translation datasets, MT benchmarks for over 30 languages, with human evaluations for a third of them, and enables participants without formal training to make a unique scientific contribution. Benchmarks, models, data, code, and evaluation results are released at https://github.com/masakhane-io/masakhane-mt. + 2020.findings-emnlp.195 + + + <fixed-case>C</fixed-case>onve<fixed-case>RT</fixed-case>: Efficient and Accurate Conversational Representations from Transformers + MatthewHenderson + IñigoCasanueva + NikolaMrkšić + Pei-HaoSu + Tsung-HsienWen + IvanVulić + 2161–2174 + General-purpose pretrained sentence encoders such as BERT are not ideal for real-world conversational AI applications; they are computationally heavy, slow, and expensive to train. 
We propose ConveRT (Conversational Representations from Transformers), a pretraining framework for conversational tasks satisfying all the following requirements: it is effective, affordable, and quick to train. We pretrain using a retrieval-based response selection task, effectively leveraging quantization and subword-level parameterization in the dual encoder to build a lightweight memory- and energy-efficient model. We show that ConveRT achieves state-of-the-art performance across widely established response selection tasks. We also demonstrate that the use of extended dialog history as context yields further performance gains. Finally, we show that pretrained representations from the proposed encoder can be transferred to the intent classification task, yielding strong results across three diverse data sets. ConveRT trains substantially faster than standard sentence encoders or previous state-of-the-art dual encoders. With its reduced size and superior performance, we believe this model promises wider portability and scalability for Conversational AI applications. + 2020.findings-emnlp.196 + + + Computer Assisted Translation with Neural Quality Estimation and Automatic Post-Editing + KeWang + JiayiWang + NiyuGe + YangbinShi + YuZhao + KaiFan + 2175–2186 + With the advent of neural machine translation, there has been a marked shift towards leveraging and consuming the machine translation results. However, the gap between machine translation systems and human translators needs to be manually closed by post-editing. In this paper, we propose an end-to-end deep learning framework of the quality estimation and automatic post-editing of the machine translation output. Our goal is to provide error correction suggestions and to further relieve the burden of human translators through an interpretable model. 
To imitate the behavior of human translators, we design three efficient delegation modules – quality estimation, generative post-editing, and atomic operation post-editing and construct a hierarchical model based on them. We examine this approach with the English–German dataset from WMT 2017 APE shared task and our experimental results can achieve the state-of-the-art performance. We also verify that the certified translators can significantly expedite their post-editing processing with our model in human evaluation. + 2020.findings-emnlp.197 + 2020.findings-emnlp.197.OptionalSupplementaryMaterial.pdf + + + Zero-Shot Rationalization by Multi-Task Transfer Learning from Question Answering + Po-NienKung + Tse-HsuanYang + Yi-ChengChen + Sheng-SiangYin + Yun-NungChen + 2187–2197 + Extracting rationales can help human understand which information the model utilizes and how it makes the prediction towards better interpretability. However, annotating rationales requires much effort and only few datasets contain such labeled rationales, making supervised learning for rationalization difficult. In this paper, we propose a novel approach that leverages the benefits of both multi-task learning and transfer learning for generating rationales through question answering in a zero-shot fashion. For two benchmark rationalization datasets, the proposed method achieves comparable or even better performance of rationalization without any supervised signal, demonstrating the great potential of zero-shot rationalization for better interpretability. + 2020.findings-emnlp.198 + + + The Role of Reentrancies in <fixed-case>A</fixed-case>bstract <fixed-case>M</fixed-case>eaning <fixed-case>R</fixed-case>epresentation Parsing + MarcoDamonte + IdaSzubert + Shay B.Cohen + MarkSteedman + 2198–2207 + Abstract Meaning Representation (AMR) parsing aims at converting sentences into AMR representations. 
These are graphs and not trees because AMR supports reentrancies (nodes with more than one parent). Following previous findings on the importance of reentrancies for AMR, we empirically find and discuss several linguistic phenomena responsible for reentrancies in AMR, some of which have not received attention before. We categorize the types of errors AMR parsers make with respect to reentrancies. Furthermore, we find that correcting these errors provides an increase of up to 5% Smatch in parsing performance and 20% in reentrancy prediction. + 2020.findings-emnlp.199 + 2020.findings-emnlp.199.OptionalSupplementaryMaterial.zip + + + Cross-Lingual Suicidal-Oriented Word Embedding toward Suicide Prevention + DaeunLee + SoyoungPark + JiwonKang + DaejinChoi + JinyoungHan + 2208–2217 + Early intervention for suicide risks with social media data has increasingly received great attention. Using a suicide dictionary created by mental health experts is one of the effective ways to detect suicidal ideation. However, little attention has been paid to validate whether and how the existing dictionaries for other languages (i.e., English and Chinese) can be used for predicting suicidal ideation for a low-resource language (i.e., Korean) where a knowledge-based suicide dictionary has not yet been developed. To this end, we propose a cross-lingual suicidal ideation detection model that can identify whether a given social media post includes suicidal ideation or not. To utilize the existing suicide dictionaries developed for other languages (i.e., English and Chinese) in word embedding, our model translates a post written in the target language (i.e., Korean) into English and Chinese, and then uses the separate suicidal-oriented word embeddings developed for English and Chinese, respectively. By applying an ensemble approach for different languages, the model achieves high accuracy, over 87%. 
We believe our model is useful in accessing suicidal ideation using social media data for preventing potential suicide risk in an early stage. + 2020.findings-emnlp.200 + + + Service-oriented Text-to-<fixed-case>SQL</fixed-case> Parsing + WangsuHu + JileiTian + 2218–2222 + The information retrieval from relational database requires professionals who has an understanding of structural query language such as SQL. TEXT2SQL models apply natural language inference to enable user interacting the database via natural language utterance. Current TEXT2SQL models normally focus on generating complex SQL query in a precise and complete fashion while certain features of real-world application in the production environment is not fully addressed. This paper is aimed to develop a service-oriented Text-to-SQL parser that translates natural language utterance to structural and executable SQL query. We introduce a algorithmic framework named Semantic-Enriched SQL generator (SE-SQL) that enables flexibly access database than rigid API in the application while keeping the performance quality for the most commonly used cases. The qualitative result shows that the proposed model achieves 88.3% execution accuracy on WikiSQL task, outperforming baseline by 13% error reduction. Moreover, the framework considers several service-oriented needs including low-complexity inference, out-of-table rejection, and text normalization. + 2020.findings-emnlp.201 + 2020.findings-emnlp.201.OptionalSupplementaryMaterial.pdf + + + Reinforcement Learning with Imbalanced Dataset for Data-to-Text Medical Report Generation + ToruNishino + RyotaOzaki + YoheiMomoki + TomokiTaniguchi + RyujiKano + NorihisaNakano + YukiTagawa + MotokiTaniguchi + TomokoOhkuma + KeigoNakamura + 2223–2236 + Automated generation of medical reports that describe the findings in the medical images helps radiologists by alleviating their workload. Medical report generation system should generate correct and concise reports. 
However, data imbalance makes it difficult to train models accurately. Medical datasets are commonly imbalanced in their finding labels because incidence rates differ among diseases; moreover, the ratios of abnormalities to normalities are significantly imbalanced. We propose a novel reinforcement learning method with a reconstructor to improve the clinical correctness of generated reports to train the data-to-text module with a highly imbalanced dataset. Moreover, we introduce a novel data augmentation strategy for reinforcement learning to additionally train the model on infrequent findings. From the perspective of a practical use, we employ a Two-Stage Medical Report Generator (TS-MRGen) for controllable report generation from input images. TS-MRGen consists of two separated stages: an image diagnosis module and a data-to-text module. Radiologists can modify the image diagnosis module results to control the reports that the data-to-text module generates. We conduct an experiment with two medical datasets to assess the data-to-text module and the entire two-stage model. Results demonstrate that the reports generated by our model describe the findings in the input image more correctly. + 2020.findings-emnlp.202 + 2020.findings-emnlp.202.OptionalSupplementaryMaterial.pdf + + + Reducing the Frequency of Hallucinated Quantities in Abstractive Summaries + ZhengZhao + Shay B.Cohen + BonnieWebber + 2237–2249 + It is well-known that abstractive summaries are subject to hallucination—including material that is not supported by the original text. While summaries can be made hallucination-free by limiting them to general phrases, such summaries would fail to be very informative. Alternatively, one can try to avoid hallucinations by verifying that any specific entities in the summary appear in the original text in a similar context. This is the approach taken by our system, Herman. 
The system learns to recognize and verify quantity entities (dates, numbers, sums of money, etc.) in a beam-worth of abstractive summaries produced by state-of-the-art models, in order to up-rank those summaries whose quantity terms are supported by the original text. Experimental results demonstrate that the ROUGE scores of such up-ranked summaries have a higher Precision than summaries that have not been up-ranked, without a comparable loss in Recall, resulting in higher F1. Preliminary human evaluation of up-ranked vs. original summaries shows people’s preference for the former. + 2020.findings-emnlp.203 + + + Rethinking Topic Modelling: From Document-Space to Term-Space + MagnusSahlgren + 2250–2259 + This paper problematizes the reliance on documents as the basic notion for defining term interactions in standard topic models. As an alternative to this practice, we reformulate topic distributions as latent factors in term similarity space. We exemplify the idea using a number of standard word embeddings built with very wide context windows. The embedding spaces are transformed to sparse similarity spaces, and topics are extracted in standard fashion by factorizing to a lower-dimensional space. We use a number of different factorization techniques, and evaluate the various models using a large set of evaluation metrics, including previously published coherence measures, as well as a number of novel measures that we suggest better correspond to real-world applications of topic models. Our results clearly demonstrate that term-based models outperform standard document-based models by a large margin. + 2020.findings-emnlp.204 + + + Sparse and Decorrelated Representations for Stable Zero-shot <fixed-case>NMT</fixed-case> + BokyungSon + SungwonLyu + 2260–2266 + Using a single encoder and decoder for all directions and training with English-centric data is a popular scheme for multilingual NMT. 
However, zero-shot translation under this scheme is vulnerable to changes in training conditions, as the model degenerates by decoding non-English texts into English regardless of the target specifier token. We present that enforcing both sparsity and decorrelation on encoder intermediate representations with the SLNI regularizer (Aljundi et al., 2019) efficiently mitigates this problem, without performance loss in supervised directions. Notably, effects of SLNI turns out to be irrelevant to promoting language-invariance in encoder representations. + 2020.findings-emnlp.205 + + + A Semi-supervised Approach to Generate the Code-Mixed Text using Pre-trained Encoder and Transfer Learning + DeepakGupta + AsifEkbal + PushpakBhattacharyya + 2267–2280 + Code-mixing, the interleaving of two or more languages within a sentence or discourse is ubiquitous in multilingual societies. The lack of code-mixed training data is one of the major concerns for the development of end-to-end neural network-based models to be deployed for a variety of natural language processing (NLP) applications. A potential solution is to either manually create or crowd-source the code-mixed labelled data for the task at hand, but that requires much human efforts and often not feasible because of the language specific diversity in the code-mixed text. To circumvent the data scarcity issue, we propose an effective deep learning approach for automatically generating the code-mixed text from English to multiple languages without any parallel data. In order to train the neural network, we create synthetic code-mixed texts from the available parallel corpus by modelling various linguistic properties of code-mixing. Our codemixed text generator is built upon the encoder-decoder framework, where the encoder is augmented with the linguistic and task-agnostic features obtained from the transformer based language model. 
We also transfer the knowledge from a neural machine translation (NMT) to warm-start the training of code-mixed generator. Experimental results and in-depth analysis show the effectiveness of our proposed code-mixed text generation on eight diverse language pairs. + 2020.findings-emnlp.206 + 2020.findings-emnlp.206.OptionalSupplementaryMaterial.pdf + + + Integrating Graph Contextualized Knowledge into Pre-trained Language Models + BinHe + DiZhou + JinghuiXiao + XinJiang + QunLiu + Nicholas JingYuan + TongXu + 2281–2290 + Complex node interactions are common in knowledge graphs (KGs), and these interactions can be considered as contextualized knowledge exists in the topological structure of KGs. Traditional knowledge representation learning (KRL) methods usually treat a single triple as a training unit, neglecting the usage of graph contextualized knowledge. To utilize these unexploited graph-level knowledge, we propose an approach to model subgraphs in a medical KG. Then, the learned knowledge is integrated with a pre-trained language model to do the knowledge generalization. Experimental results demonstrate that our model achieves the state-of-the-art performance on several medical NLP tasks, and the improvement above MedERNIE indicates that graph contextualized knowledge is beneficial. + 2020.findings-emnlp.207 + + + Recursive Top-Down Production for Sentence Generation with Latent Trees + ShawnTan + YikangShen + AlessandroSordoni + AaronCourville + Timothy J.O’Donnell + 2291–2307 + We model the recursive production property of context-free grammars for natural and synthetic languages. To this end, we present a dynamic programming algorithm that marginalises over latent binary tree structures with N leaves, allowing us to compute the likelihood of a sequence of N tokens under a latent tree model, which we maximise to train a recursive neural function. 
We demonstrate performance on two synthetic tasks: SCAN, where it outperforms previous models on the LENGTH split, and English question formation, where it performs comparably to decoders with the ground-truth tree structure. We also present experimental results on German-English translation on the Multi30k dataset, and qualitatively analyse the induced tree structures our model learns for the SCAN tasks and the German-English translation task. + 2020.findings-emnlp.208 + 2020.findings-emnlp.208.OptionalSupplementaryMaterial.zip + + + Guided Dialogue Policy Learning without Adversarial Learning in the Loop + ZimingLi + SungjinLee + BaolinPeng + JinchaoLi + JuliaKiseleva + Maartende Rijke + ShahinShayandeh + JianfengGao + 2308–2317 + Reinforcement learning methods have emerged as a popular choice for training an efficient and effective dialogue policy. However, these methods suffer from sparse and unstable reward signals returned by a user simulator only when a dialogue finishes. Besides, the reward signal is manually designed by human experts, which requires domain knowledge. Recently, a number of adversarial learning methods have been proposed to learn the reward function together with the dialogue policy. However, to alternatively update the dialogue policy and the reward model on the fly, we are limited to policy-gradient-based algorithms, such as REINFORCE and PPO. Moreover, the alternating training of a dialogue agent and the reward model can easily get stuck in local optima or result in mode collapse. To overcome the listed issues, we propose to decompose the adversarial training into two steps. First, we train the discriminator with an auxiliary dialogue generator and then incorporate a derived reward model into a common reinforcement learning method to guide the dialogue policy learning. This approach is applicable to both on-policy and off-policy reinforcement learning methods. 
Based on our extensive experimentation, we can conclude the proposed method: (1) achieves a remarkable task success rate using both on-policy and off-policy reinforcement learning methods; and (2) has potential to transfer knowledge from existing domains to a new domain. + 2020.findings-emnlp.209 + + + <fixed-case>M</fixed-case>ulti<fixed-case>DM</fixed-case>-<fixed-case>GCN</fixed-case>: Aspect-Guided Response Generation in Multi-Domain Multi-Modal Dialogue System using Graph Convolution Network + MauajamaFirdaus + NidhiThakur + AsifEkbal + 2318–2328 + In the recent past, dialogue systems have gained immense popularity and have become ubiquitous. During conversations, humans not only rely on languages but seek contextual information through visual contents as well. In every task-oriented dialogue system, the user is guided by the different aspects of a product or service that regulates the conversation towards selecting the product or service. In this work, we present a multi-modal conversational framework for a task-oriented dialogue setup that generates the responses following the different aspects of a product or service to cater to the user’s needs. We show that the responses guided by the aspect information provide more interactive and informative responses for better communication between the agent and the user. We first create a Multi-domain Multi-modal Dialogue (MDMMD) dataset having conversations involving both text and images belonging to the three different domains, such as restaurants, electronics, and furniture. We implement a Graph Convolutional Network (GCN) based framework that generates appropriate textual responses from the multi-modal inputs. The multi-modal information having both textual and image representation is fed to the decoder and the aspect information for generating aspect guided responses. 
Quantitative and qualitative analyses show that the proposed methodology outperforms several baselines for the proposed task of aspect-guided response generation. + 2020.findings-emnlp.210 + + + Edge-Enhanced Graph Convolution Networks for Event Detection with Syntactic Relation + ShiyaoCui + BowenYu + TingwenLiu + ZhenyuZhang + XuebinWang + JinqiaoShi + 2329–2339 + Event detection (ED), a key subtask of information extraction, aims to recognize instances of specific event types in text. Previous studies on the task have verified the effectiveness of integrating syntactic dependency into graph convolutional networks. However, these methods usually ignore dependency label information, which conveys rich and useful linguistic knowledge for ED. In this paper, we propose a novel architecture named Edge-Enhanced Graph Convolution Networks (EE-GCN), which simultaneously exploits syntactic structure and typed dependency label information to perform ED. Specifically, an edge-aware node update module is designed to generate expressive word representations by aggregating syntactically-connected words through specific dependency types. Furthermore, to fully explore clues hidden from dependency edges, a node-aware edge update module is introduced, which refines the relation representations with contextual information. These two modules are complementary to each other and work in a mutual promotion way. We conduct experiments on the widely used ACE2005 dataset and the results show significant improvement over competitive baseline methods. + 2020.findings-emnlp.211 + + + Semi-supervised Formality Style Transfer using Language Model Discriminator and Mutual Information Maximization + KunalChawla + DiyiYang + 2340–2354 + Formality style transfer is the task of converting informal sentences to grammatically-correct formal sentences, which can be used to improve performance of many downstream NLP tasks. 
In this work, we propose a semi-supervised formality style transfer model that utilizes a language model-based discriminator to maximize the likelihood of the output sentence being formal, which allows us to use maximization of token-level conditional probabilities for training. We further propose to maximize mutual information between source and target styles as our training objective instead of maximizing the regular likelihood that often leads to repetitive and trivial generated responses. Experiments showed that our model outperformed previous state-of-the-art baselines significantly in terms of both automated metrics and human judgement. We further generalized our model to unsupervised text style transfer task, and achieved significant improvements on two benchmark sentiment style transfer datasets. + 2020.findings-emnlp.212 + 2020.findings-emnlp.212.OptionalSupplementaryMaterial.zip + + + Differentially Private Representation for <fixed-case>NLP</fixed-case>: Formal Guarantee and An Empirical Study on Privacy and Fairness + LingjuanLyu + XuanliHe + YitongLi + 2355–2365 + It has been demonstrated that hidden representation learned by deep model can encode private information of the input, hence can be exploited to recover such information with reasonable accuracy. To address this issue, we propose a novel approach called Differentially Private Neural Representation (DPNR) to preserve privacy of the extracted representation from text. DPNR utilises Differential Privacy (DP) to provide formal privacy guarantee. Further, we show that masking words via dropout can further enhance privacy. To maintain utility of the learned representation, we integrate DP-noisy representation into a robust training process to derive a robust target model, which also helps for model fairness over various demographic variables. 
Experimental results on benchmark datasets under various parameter settings demonstrate that DPNR largely reduces privacy leakage without significantly sacrificing the main task performance. + 2020.findings-emnlp.213 + + + Helpful or Hierarchical? Predicting the Communicative Strategies of Chat Participants, and their Impact on Success + FarzanaRashid + TommasoFornaciari + DirkHovy + EduardoBlanco + FernandoVega-Redondo + 2366–2371 + When interacting with each other, we motivate, advise, inform, show love or power towards our peers. However, the way we interact may also hold some indication on how successful we are, as people often try to help each other to achieve their goals. We study the chat interactions of thousands of aspiring entrepreneurs who discuss and develop business models. We manually annotate a set of about 5,500 chat interactions with four dimensions of interaction styles (motivation, cooperation, equality, advice). We find that these styles can be reliably predicted, and that the communication styles can be used to predict a number of indices of business success. Our findings indicate that successful communicators are also successful in other domains. + 2020.findings-emnlp.214 + + + Learning Knowledge Bases with Parameters for Task-Oriented Dialogue Systems + AndreaMadotto + SamuelCahyawijaya + Genta IndraWinata + YanXu + ZihanLiu + ZhaojiangLin + PascaleFung + 2372–2394 + Task-oriented dialogue systems are either modularized with separate dialogue state tracking (DST) and management steps or end-to-end trainable. In either case, the knowledge base (KB) plays an essential role in fulfilling user requests. Modularized systems rely on DST to interact with the KB, which is expensive in terms of annotation and inference time. End-to-end systems, instead, use the KB directly as input, but they cannot scale when the KB is larger than a few hundred entries. 
In this paper, we propose a method to embed the KB, of any size, directly into the model parameters. The resulting model does not require any DST or template responses, nor the KB as input, and it can dynamically update its KB via fine-tuning. We evaluate our solution in five task-oriented dialogue datasets with small, medium, and large KB size. Our experiments show that end-to-end models can effectively embed knowledge bases in their parameters and achieve competitive performance in all evaluated datasets. + 2020.findings-emnlp.215 + 2020.findings-emnlp.215.OptionalSupplementaryMaterial.pdf + + + Generalizing Open Domain Fact Extraction and Verification to <fixed-case>COVID</fixed-case>-<fixed-case>FACT</fixed-case> through In-Domain Language Modeling + ZhenghaoLiu + ChenyanXiong + ZhuyunDai + SiSun + MaosongSun + ZhiyuanLiu + 2395–2400 + With the epidemic of COVID-19, verifying the scientifically false online information, such as fake news and maliciously fabricated statements, has become crucial. However, the lack of training data in the scientific domain limits the performance of fact verification models. This paper proposes an in-domain language modeling method for fact extraction and verification systems. We come up with SciKGAT to combine the advantages of open-domain literature search, state-of-the-art fact verification systems and in-domain medical knowledge through language modeling. Our experiments on SCIFACT, a dataset of expert-written scientific fact verification, show that SciKGAT achieves 30% absolute improvement on precision. Our analyses show that such improvement thrives from our in-domain language model by picking up more related evidence pieces and accurate fact verification. Our codes and data are released via Github. 
+ 2020.findings-emnlp.216 + + + <fixed-case>P</fixed-case>rophet<fixed-case>N</fixed-case>et: Predicting Future N-gram for Sequence-to-Sequence Pre-training + WeizhenQi + YuYan + YeyunGong + DayihengLiu + NanDuan + JiushengChen + RuofeiZhang + MingZhou + 2401–2410 + This paper presents a new sequence-to-sequence pre-training model called ProphetNet, which introduces a novel self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of optimizing one-step-ahead prediction in the traditional sequence-to-sequence model, the ProphetNet is optimized by n-step ahead prediction that predicts the next n tokens simultaneously based on previous context tokens at each time step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large-scale dataset (160GB), respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new state-of-the-art results on all these datasets compared to the models using the same scale pre-training corpus. + 2020.findings-emnlp.217 + + + <fixed-case>D</fixed-case>iv<fixed-case>GAN</fixed-case>: Towards Diverse Paraphrase Generation via Diversified Generative Adversarial Network + YueCao + XiaojunWan + 2411–2421 + Paraphrases refer to texts that convey the same meaning with different expression forms. Traditional seq2seq-based models on paraphrase generation mainly focus on the fidelity while ignoring the diversity of outputs. In this paper, we propose a deep generative model to generate diverse paraphrases. 
We build our model based on the conditional generative adversarial network, and propose to incorporate a simple yet effective diversity loss term into the model in order to improve the diversity of outputs. The proposed diversity loss maximizes the ratio of pairwise distance between the generated texts and their corresponding latent codes, forcing the generator to focus more on the latent codes and produce diverse samples. Experimental results on benchmarks of paraphrase generation show that our proposed model can generate more diverse paraphrases compared with baselines. + 2020.findings-emnlp.218 + + + Plug-and-Play Conversational Models + AndreaMadotto + EtsukoIshii + ZhaojiangLin + SumanthDathathri + PascaleFung + 2422–2433 + There has been considerable progress made towards conversational models that generate coherent and fluent responses; however, this often involves training large language models on large dialogue datasets, such as Reddit. These large conversational models provide little control over the generated responses, and this control is further limited in the absence of annotated conversational datasets for attribute specific generation that can be used for fine-tuning the model. In this paper, we first propose and evaluate plug-and-play methods for controllable response generation, which does not require dialogue specific datasets and does not rely on fine-tuning a large model. While effective, the decoding procedure induces considerable computational overhead, rendering the conversational model unsuitable for interactive usage. To overcome this, we introduce an approach that does not require further computation at decoding time, while also does not require any fine-tuning of a large language model. We demonstrate, through extensive automatic and human evaluation, a high degree of control over the generated conversational responses with regard to multiple desired attributes, while being fluent. 
+ 2020.findings-emnlp.219 + 2020.findings-emnlp.219.OptionalSupplementaryMaterial.pdf + + + Event-Driven Learning of Systematic Behaviours in Stock Markets + XianchaoWu + 2434–2444 + It is reported that financial news, especially financial events expressed in news, provide information to investors’ long/short decisions and influence the movements of stock markets. Motivated by this, we leverage financial event streams to train a classification neural network that detects latent event-stock linkages and stock markets’ systematic behaviours in the U.S. stock market. Our proposed pipeline includes (1) a combined event extraction method that utilizes Open Information Extraction and neural co-reference resolution, (2) a BERT/ALBERT enhanced representation of events, and (3) an extended hierarchical attention network that includes attentions on event, news and temporal levels. Our pipeline achieves significantly better accuracies and higher simulated annualized returns than state-of-the-art models when being applied to predicting Standard&Poor 500, Dow Jones, Nasdaq indices and 10 individual stocks. + 2020.findings-emnlp.220 + + + You could have said that instead: Improving Chatbots with Natural Language Feedback + Makesh NarsimhanSreedhar + KunNi + SivaReddy + 2445–2453 + The ubiquitous nature of dialogue systems and their interaction with users generate an enormous amount of data. Can we improve chatbots using this data? A self-feeding chatbot improves itself by asking natural language feedback when a user is dissatisfied with its response and uses this feedback as an additional training sample. However, user feedback in most cases contains extraneous sequences hindering their usefulness as a training sample. In this work, we propose a generative adversarial model that converts noisy feedback into a plausible natural response in a conversation. 
The generator’s goal is to convert the feedback into a response that answers the user’s previous utterance and to fool the discriminator which distinguishes feedback from natural responses. We show that augmenting original training data with these modified feedback responses improves the original chatbot performance from 69.94% to 75.96% in ranking correct responses on the PERSONACHAT dataset, a large improvement given that the original model is already trained on 131k samples. + 2020.findings-emnlp.221 + + + Adapting Coreference Resolution to <fixed-case>T</fixed-case>witter Conversations + BerfinAktaş + VeronikaSolopova + AnnalenaKohnert + ManfredStede + 2454–2460 + The performance of standard coreference resolution is known to drop significantly on Twitter texts. We improve the performance of the (Lee et al., 2018) system, which is originally trained on OntoNotes, by retraining on manually-annotated Twitter conversation data. Further experiments by combining different portions of OntoNotes with Twitter data show that selecting text genres for the training data can beat the mere maximization of training data amount. In addition, we inspect several phenomena such as the role of deictic pronouns in conversational data, and present additional results for variant settings. Our best configuration improves the performance of the “out of the box” system by 21.6%. + 2020.findings-emnlp.222 + + + On <fixed-case>R</fixed-case>omanization for Model Transfer Between Scripts in Neural Machine Translation + ChantalAmrhein + RicoSennrich + 2461–2469 + Transfer learning is a popular strategy to improve the quality of low-resource machine translation. For an optimal transfer of the embedding layer, the child and parent model should share a substantial part of the vocabulary. This is not the case when transferring to languages with a different script. We explore the benefit of romanization in this scenario. 
Our results show that romanization entails information loss and is thus not always superior to simpler vocabulary transfer methods, but can improve the transfer between related languages with different scripts. We compare two romanization tools and find that they exhibit different degrees of information loss, which affects translation quality. Finally, we extend romanization to the target side, showing that this can be a successful strategy when coupled with a simple deromanization model. + 2020.findings-emnlp.223 + + + <fixed-case>COSMIC</fixed-case>: <fixed-case>CO</fixed-case>mmon<fixed-case>S</fixed-case>ense knowledge for e<fixed-case>M</fixed-case>otion Identification in Conversations + DeepanwayGhosal + NavonilMajumder + AlexanderGelbukh + RadaMihalcea + SoujanyaPoria + 2470–2481 + In this paper, we address the task of utterance level emotion recognition in conversations using commonsense knowledge. We propose COSMIC, a new framework that incorporates different elements of commonsense such as mental states, events, and causal relations, and build upon them to learn interactions between interlocutors participating in a conversation. Current state-of-the-art methods often encounter difficulties in context propagation, emotion shift detection, and differentiating between related emotion classes. By learning distinct commonsense representations, COSMIC addresses these challenges and achieves new state-of-the-art results for emotion recognition on four different benchmark conversational datasets. Our code is available at https://github.com/declare-lab/conv-emotion. + 2020.findings-emnlp.224 + + + Improving Compositional Generalization in Semantic Parsing + InbarOren + JonathanHerzig + NitishGupta + MattGardner + JonathanBerant + 2482–2495 + Generalization of models to out-of-distribution (OOD) data has captured tremendous attention recently. 
Specifically, compositional generalization, i.e., whether a model generalizes to new structures built of components observed during training, has sparked substantial interest. In this work, we investigate compositional generalization in semantic parsing, a natural test-bed for compositional generalization, as output programs are constructed from sub-components. We analyze a wide variety of models and propose multiple extensions to the attention module of the semantic parser, aiming to improve compositional generalization. We find that the following factors improve compositional generalization: (a) using contextual representations, such as ELMo and BERT, (b) informing the decoder what input tokens have previously been attended to, (c) training the decoder attention to agree with pre-computed token alignments, and (d) downsampling examples corresponding to frequent program templates. While we substantially reduce the gap between in-distribution and OOD generalization, performance on OOD compositions is still substantially lower. + 2020.findings-emnlp.225 + + + Answer Span Correction in Machine Reading Comprehension + RevanthGangi Reddy + Md ArafatSultan + EfsunSarioglu Kayi + RongZhang + VittorioCastelli + AviSil + 2496–2501 + Answer validation in machine reading comprehension (MRC) consists of verifying an extracted answer against an input context and question pair. Previous work has looked at re-assessing the “answerability” of the question given the extracted answer. Here we address a different problem: the tendency of existing MRC systems to produce partially correct answers when presented with answerable questions. We explore the nature of such errors and propose a post-processing correction method that yields statistically significant performance improvements over state-of-the-art MRC systems in both monolingual and multilingual evaluation. 
+ 2020.findings-emnlp.226 + + + On the Interplay Between Fine-tuning and Sentence-Level Probing for Linguistic Knowledge in Pre-Trained Transformers + MariusMosbach + AnnaKhokhlova + Michael A.Hedderich + DietrichKlakow + 2502–2516 + Fine-tuning pre-trained contextualized embedding models has become an integral part of the NLP pipeline. At the same time, probing has emerged as a way to investigate the linguistic knowledge captured by pre-trained models. Very little is, however, understood about how fine-tuning affects the representations of pre-trained models and thereby the linguistic knowledge they encode. This paper contributes towards closing this gap. We study three different pre-trained models: BERT, RoBERTa, and ALBERT, and investigate through sentence-level probing how fine-tuning affects their representations. We find that for some probing tasks fine-tuning leads to substantial changes in accuracy, possibly suggesting that fine-tuning introduces or even removes linguistic knowledge from a pre-trained model. These changes, however, vary greatly across different models, fine-tuning and probing tasks. Our analysis reveals that while fine-tuning indeed changes the representations of a pre-trained model and these changes are typically larger for higher layers, only in very few cases, fine-tuning has a positive effect on probing accuracy that is larger than just using the pre-trained model with a strong pooling method. Based on our findings, we argue that both positive and negative effects of fine-tuning on probing require a careful interpretation. + 2020.findings-emnlp.227 + + + Zero-shot Entity Linking with Efficient Long Range Sequence Modeling + ZonghaiYao + LiangliangCao + HuapuPan + 2517–2522 + This paper considers the problem of zero-shot entity linking, in which a link in the test time may not present in training. Following the prevailing BERT-based research efforts, we find a simple yet effective way is to expand the long-range sequence modeling. 
Unlike many previous methods, our method does not require expensive pre-training of BERT with long position embeddings. Instead, we propose an efficient position embeddings initialization method called Embedding-repeat, which initializes larger position embeddings based on BERT-Base. On the zero-shot entity linking dataset, our method improves the SOTA from 76.06% to 79.08%, and for its long data, the corresponding improvement is from 74.57% to 82.14%. Our experiments suggest the effectiveness of long-range sequence modeling without retraining the BERT model. + 2020.findings-emnlp.228 + + + How Does Context Matter? On the Robustness of Event Detection with Context-Selective Mask Generalization + JianLiu + YuboChen + KangLiu + YantaoJia + ZhichengSheng + 2523–2532 + Event detection (ED) aims to identify and classify event triggers in texts, which is a crucial subtask of event extraction (EE). Despite many advances in ED, the existing studies are typically centered on improving the overall performance of an ED model, which rarely consider the robustness of an ED model. This paper aims to fill this research gap by stressing the importance of robustness modeling in ED models. We first pinpoint three stark cases demonstrating the brittleness of the existing ED models. After analyzing the underlying reason, we propose a new training mechanism, called context-selective mask generalization for ED, which can effectively mine context-specific patterns for learning and robustify an ED model. The experimental results have confirmed the effectiveness of our model regarding defending against adversarial attacks, exploring unseen predicates, and tackling ambiguity cases. Moreover, a deeper analysis suggests that our approach can learn a complementary predictive bias with most ED models that use full context for feature learning. 
+ 2020.findings-emnlp.229 + + + Adaptive Feature Selection for End-to-End Speech Translation + BiaoZhang + IvanTitov + BarryHaddow + RicoSennrich + 2533–2544 + Information in speech signals is not evenly distributed, making it an additional challenge for end-to-end (E2E) speech translation (ST) to learn to focus on informative features. In this paper, we propose adaptive feature selection (AFS) for encoder-decoder based E2E ST. We first pre-train an ASR encoder and apply AFS to dynamically estimate the importance of each encoded speech feature to ASR. A ST encoder, stacked on top of the ASR encoder, then receives the filtered features from the (frozen) ASR encoder. We take L0DROP (Zhang et al., 2020) as the backbone for AFS, and adapt it to sparsify speech features with respect to both temporal and feature dimensions. Results on LibriSpeech EnFr and MuST-C benchmarks show that AFS facilitates learning of ST by pruning out ~84% temporal features, yielding an average translation gain of ~1.3-1.6 BLEU and a decoding speedup of ~1.4x. In particular, AFS reduces the performance gap compared to the cascade baseline, and outperforms it on LibriSpeech En-Fr with a BLEU score of 18.56 (without data augmentation). + 2020.findings-emnlp.230 + + + Abstractive Multi-Document Summarization via Joint Learning with Single-Document Summarization + HanqiJin + XiaojunWan + 2545–2554 + Single-document and multi-document summarizations are very closely related in both task definition and solution method. In this work, we propose to improve neural abstractive multi-document summarization by jointly learning an abstractive single-document summarizer. We build a unified model for single-document and multi-document summarizations by fully sharing the encoder and decoder and utilizing a decoding controller to aggregate the decoder’s outputs for multiple input documents. We evaluate our model on two multi-document summarization datasets: Multi-News and DUC-04. 
Experimental results show the efficacy of our approach, and it can substantially outperform several strong baselines. We also verify the helpfulness of single-document summarization to abstractive multi-document summarization task. + 2020.findings-emnlp.231 + + + Blockwise Self-Attention for Long Document Understanding + JiezhongQiu + HaoMa + OmerLevy + Wen-tauYih + SinongWang + JieTang + 2555–2565 + We present BlockBERT, a lightweight and efficient BERT model for better modeling long-distance dependencies. Our model extends BERT by introducing sparse block structures into the attention matrix to reduce both memory consumption and training/inference time, which also enables attention heads to capture either short- or long-range contextual information. We conduct experiments on language model pre-training and several benchmark question answering datasets with various paragraph lengths. BlockBERT uses 18.7-36.1% less memory and 12.0-25.1% less time to learn the model. During testing, BlockBERT saves 27.8% inference time, while having comparable and sometimes better prediction accuracy, compared to an advanced BERT-based model, RoBERTa. + 2020.findings-emnlp.232 + + + Unsupervised Few-Bits Semantic Hashing with Implicit Topics Modeling + FanghuaYe + JaranaManotumruksa + EmineYilmaz + 2566–2575 + Semantic hashing is a powerful paradigm for representing texts as compact binary hash codes. The explosion of short text data has spurred the demand of few-bits hashing. However, the performance of existing semantic hashing methods cannot be guaranteed when applied to few-bits hashing because of severe information loss. In this paper, we present a simple but effective unsupervised neural generative semantic hashing method with a focus on few-bits hashing. Our model is built upon variational autoencoder and represents each hash bit as a Bernoulli variable, which allows the model to be end-to-end trainable. 
To address the issue of information loss, we introduce a set of auxiliary implicit topic vectors. With the aid of these topic vectors, the generated hash codes are not only low-dimensional representations of the original texts but also capture their implicit topics. We conduct comprehensive experiments on four datasets. The results demonstrate that our approach achieves significant improvements over state-of-the-art semantic hashing methods in few-bits hashing. + 2020.findings-emnlp.233 + + + Grid Tagging Scheme for End-to-End Fine-grained Opinion Extraction + ZhenWu + ChengcanYing + FeiZhao + ZhifangFan + XinyuDai + RuiXia + 2576–2585 + Aspect-oriented Fine-grained Opinion Extraction (AFOE) aims at extracting aspect terms and opinion terms from review in the form of opinion pairs or additionally extracting sentiment polarity of aspect term to form opinion triplet. Because of containing several opinion factors, the complete AFOE task is usually divided into multiple subtasks and achieved in the pipeline. However, pipeline approaches easily suffer from error propagation and inconvenience in real-world scenarios. To this end, we propose a novel tagging scheme, Grid Tagging Scheme (GTS), to address the AFOE task in an end-to-end fashion only with one unified grid tagging task. Additionally, we design an effective inference strategy on GTS to exploit mutual indication between different opinion factors for more accurate extractions. To validate the feasibility and compatibility of GTS, we implement three different GTS models respectively based on CNN, BiLSTM, and BERT, and conduct experiments on the aspect-oriented opinion pair extraction and opinion triplet extraction datasets. Extensive experimental results indicate that GTS models outperform strong baselines significantly and achieve state-of-the-art performance. 
+ 2020.findings-emnlp.234 + + + Learning Numeral Embedding + ChengyueJiang + ZhonglinNian + KaihaoGuo + ShanboChu + YinggongZhao + LibinShen + KeweiTu + 2586–2599 + Word embedding is an essential building block for deep learning methods for natural language processing. Although word embedding has been extensively studied over the years, the problem of how to effectively embed numerals, a special subset of words, is still underexplored. Existing word embedding methods do not learn numeral embeddings well because there are an infinite number of numerals and their individual appearances in training corpora are highly scarce. In this paper, we propose two novel numeral embedding methods that can handle the out-of-vocabulary (OOV) problem for numerals. We first induce a finite set of prototype numerals using either a self-organizing map or a Gaussian mixture model. We then represent the embedding of a numeral as a weighted average of the prototype number embeddings. Numeral embeddings represented in this manner can be plugged into existing word embedding learning approaches such as skip-gram for training. We evaluated our methods and showed their effectiveness on four intrinsic and extrinsic tasks: word similarity, embedding numeracy, numeral prediction, and sequence labeling. + 2020.findings-emnlp.235 + + + An Investigation of Potential Function Designs for Neural <fixed-case>CRF</fixed-case> + ZechuanHu + YongJiang + NguyenBach + TaoWang + ZhongqiangHuang + FeiHuang + KeweiTu + 2600–2609 + The neural linear-chain CRF model is one of the most widely used approaches to sequence labeling. In this paper, we investigate a series of increasingly expressive potential functions for neural CRF models, which not only integrate the emission and transition functions, but also explicitly take the representations of the contextual words as input. 
Our extensive experiments show that the decomposed quadrilinear potential function based on the vector representations of two neighboring labels and two neighboring words consistently achieves the best performance. + 2020.findings-emnlp.236 + + + Fast End-to-end Coreference Resolution for <fixed-case>K</fixed-case>orean + CheoneumPark + JaminShin + SungjoonPark + JoonhoLim + ChangkiLee + 2610–2624 + Recently, end-to-end neural network-based approaches have shown significant improvements over traditional pipeline-based models in English coreference resolution. However, such advancements came at a cost of computational complexity and recent works have not focused on tackling this problem. Hence, in this paper, to cope with this issue, we propose BERT-SRU-based Pointer Networks that leverages the linguistic property of head-final languages. Applying this model to the Korean coreference resolution, we significantly reduce the coreference linking search space. Combining this with Ensemble Knowledge Distillation, we maintain state-of-the-art performance 66.9% of CoNLL F1 on ETRI test set while achieving 2x speedup (30 doc/sec) in document processing time. + 2020.findings-emnlp.237 + 2020.findings-emnlp.237.OptionalSupplementaryMaterial.pdf + + + Toward Stance-based Personas for Opinionated Dialogues + ThomasScialom + Serra SinemTekiroğlu + JacopoStaiano + MarcoGuerini + 2625–2635 + In the context of chit-chat dialogues it has been shown that endowing systems with a persona profile is important to produce more coherent and meaningful conversations. Still, the representation of such personas has thus far been limited to a fact-based representation (e.g. “I have two cats.”). We argue that these representations remain superficial w.r.t. the complexity of human personality. In this work, we propose to make a step forward and investigate stance-based persona, trying to grasp more profound characteristics, such as opinions, values, and beliefs to drive language generation. 
To this end, we introduce a novel dataset allowing to explore different stance-based persona representations and their impact on claim generation, showing that they are able to grasp abstract and profound aspects of the author persona. + 2020.findings-emnlp.238 + + + Hierarchical Pre-training for Sequence Labelling in Spoken Dialog + EmileChapuis + PierreColombo + MatteoManica + MatthieuLabeau + ChloéClavel + 2636–2648 + Sequence labelling tasks like Dialog Act and Emotion/Sentiment identification are a key component of spoken dialog systems. In this work, we propose a new approach to learn generic representations adapted to spoken dialog, which we evaluate on a new benchmark we call Sequence labellIng evaLuatIon benChmark fOr spoken laNguagE (SILICONE). SILICONE is model-agnostic and contains 10 different datasets of various sizes. We obtain our representations with a hierarchical encoder based on transformer architectures, for which we extend two well-known pre-training objectives. Pre-training is performed on OpenSubtitles: a large corpus of spoken dialog containing over 2.3 billion tokens. We demonstrate how hierarchical encoders achieve competitive results with consistently fewer parameters compared to state-of-the-art models and we show their importance for both pre-training and fine-tuning. + 2020.findings-emnlp.239 + 2020.findings-emnlp.239.OptionalSupplementaryMaterial.pdf + + + Extending Multilingual <fixed-case>BERT</fixed-case> to Low-Resource Languages + ZihanWang + KarthikeyanK + StephenMayhew + DanRoth + 2649–2656 + Multilingual BERT (M-BERT) has been a huge success in both supervised and zero-shot cross-lingual transfer learning. However, this success is focused only on the top 104 languages in Wikipedia it was trained on. In this paper, we propose a simple but effective approach to extend M-BERT (E-MBERT) so it can benefit any new language, and show that our approach aids languages that are already in M-BERT as well. 
We perform an extensive set of experiments with Named Entity Recognition (NER) on 27 languages, only 16 of which are in M-BERT, and show an average increase of about 6% F1 on M-BERT languages and 23% F1 increase on new languages. We release models and code at http://cogcomp.org/page/publication_view/912. + 2020.findings-emnlp.240 + + + Out-of-Sample Representation Learning for Knowledge Graphs + MarjanAlbooyeh + RishabGoel + Seyed MehranKazemi + 2657–2666 + Many important problems can be formulated as reasoning in knowledge graphs. Representation learning has proved extremely effective for transductive reasoning, in which one needs to make new predictions for already observed entities. This is true for both attributed graphs (where each entity has an initial feature vector) and non-attributed graphs (where the only initial information derives from known relations with other entities). For out-of-sample reasoning, where one needs to make predictions for entities that were unseen at training time, much prior work considers attributed graphs. However, this problem is surprisingly under-explored for non-attributed graphs. In this paper, we study the out-of-sample representation learning problem for non-attributed knowledge graphs, create benchmark datasets for this task, develop several models and baselines, and provide empirical analyses and comparisons of the proposed models and baselines. + 2020.findings-emnlp.241 + + + Fine-Grained Grounding for Multimodal Speech Recognition + TejasSrinivasan + RamonSanabria + FlorianMetze + DesmondElliott + 2667–2677 + Multimodal automatic speech recognition systems integrate information from images to improve speech recognition quality, by grounding the speech in the visual context. While visual signals have been shown to be useful for recovering entities that have been masked in the audio, these models should be capable of recovering a broader range of word types. 
Existing systems rely on global visual features that represent the entire image, but localizing the relevant regions of the image will make it possible to recover a larger set of words, such as adjectives and verbs. In this paper, we propose a model that uses finer-grained visual information from different parts of the image, using automatic object proposals. In experiments on the Flickr8K Audio Captions Corpus, we find that our model improves over approaches that use global visual features, that the proposals enable the model to recover entities and other related words, such as adjectives, and that improvements are due to the model’s ability to localize the correct proposals. + 2020.findings-emnlp.242 + + + Unsupervised Expressive Rules Provide Explainability and Assist Human Experts Grasping New Domains + EyalShnarch + LeshemChoshen + GuyMoshkowich + RanitAharonov + NoamSlonim + 2678–2697 + Approaching new data can be quite deterrent; you do not know how your categories of interest are realized in it, commonly, there is no labeled data at hand, and the performance of domain adaptation methods is unsatisfactory. Aiming to assist domain experts in their first steps into a new task over a new corpus, we present an unsupervised approach to reveal complex rules which cluster the unexplored corpus by its prominent categories (or facets). These rules are human-readable, thus providing an important ingredient which has become in short supply lately - explainability. Each rule provides an explanation for the commonality of all the texts it clusters together. The experts can then identify which rules best capture texts of their categories of interest, and utilize them to deepen their understanding of these categories. These rules can also bootstrap the process of data labeling by pointing at a subset of the corpus which is enriched with texts demonstrating the target categories. 
We present an extensive evaluation of the usefulness of these rules in identifying target categories, as well as a user study which assesses their interpretability. + 2020.findings-emnlp.243 + 2020.findings-emnlp.243.OptionalSupplementaryMaterial.txt + + + Textual supervision for visually grounded spoken language understanding + BertrandHigy + DesmondElliott + GrzegorzChrupała + 2698–2709 + Visually-grounded models of spoken language understanding extract semantic information directly from speech, without relying on transcriptions. This is useful for low-resource languages, where transcriptions can be expensive or impossible to obtain. Recent work showed that these models can be improved if transcriptions are available at training time. However, it is not clear how an end-to-end approach compares to a traditional pipeline-based approach when one has access to transcriptions. Comparing different strategies, we find that the pipeline approach works better when enough text is available. With low-resource languages in mind, we also show that translations can be effectively used in place of transcriptions but more data is needed to obtain similar results. + 2020.findings-emnlp.244 + + + <fixed-case>U</fixed-case>niversal <fixed-case>D</fixed-case>ependencies according to <fixed-case>BERT</fixed-case>: both more specific and more general + TomaszLimisiewicz + DavidMareček + RudolfRosa + 2710–2722 + This work focuses on analyzing the form and extent of syntactic abstraction captured by BERT by extracting labeled dependency trees from self-attentions. Previous work showed that individual BERT heads tend to encode particular dependency relation types. We extend these findings by explicitly comparing BERT relations to Universal Dependencies (UD) annotations, showing that they often do not match one-to-one. We suggest a method for relation identification and syntactic tree construction. 
Our approach produces significantly more consistent dependency trees than previous work, showing that it better explains the syntactic abstractions in BERT. At the same time, it can be successfully applied with only a minimal amount of supervision and generalizes well across languages. + 2020.findings-emnlp.245 + + + Visual Objects As Context: Exploiting Visual Objects for Lexical Entailment + MasayasuMuraoka + TetsuyaNasukawa + BishwaranjanBhattacharjee + 2723–2735 + We propose a new word representation method derived from visual objects in associated images to tackle the lexical entailment task. Although it has been shown that the Distributional Informativeness Hypothesis (DIH) holds on text, in which the DIH assumes that a context surrounding a hyponym is more informative than that of a hypernym, it has never been tested on visual objects. Since our perception is tightly associated with language, it is meaningful to explore whether the DIH holds on visual objects. To this end, we consider visual objects as the context of a word and represent a word as a bag of visual objects found in images associated with the word. This allows us to test the feasibility of the visual DIH. To better distinguish word pairs in a hypernym relation from other relations such as co-hypernyms, we also propose a new measurable function that takes into account both the difference in the generality of meaning and similarity of meaning between words. Our experimental results show that the DIH holds on visual objects and that the proposed method combined with the proposed function outperforms existing unsupervised representation methods. 
+ 2020.findings-emnlp.246 + + + Learning to Plan and Realize Separately for Open-Ended Dialogue Systems + SashankSanthanam + ZhuoCheng + BrodieMather + BonnieDorr + ArchnaBhatia + BryannaHebenstreit + AlanZemel + AdamDalton + TomekStrzalkowski + SamiraShaikh + 2736–2750 + Achieving true human-like ability to conduct a conversation remains an elusive goal for open-ended dialogue systems. We posit this is because extant approaches towards natural language generation (NLG) are typically construed as end-to-end architectures that do not adequately model human generation processes. To investigate, we decouple generation into two separate phases: planning and realization. In the planning phase, we train two planners to generate plans for response utterances. The realization phase uses response plans to produce an appropriate response. Through rigorous evaluations, both automated and human, we demonstrate that decoupling the process into planning and realization performs better than an end-to-end approach. + 2020.findings-emnlp.247 + 2020.findings-emnlp.247.OptionalSupplementaryMaterial.txt + + + Be Different to Be Better! A Benchmark to Leverage the Complementarity of Language and Vision + SandroPezzelle + ClaudioGreco + GretaGandolfi + EleonoraGualdoni + RaffaellaBernardi + 2751–2767 + This paper introduces BD2BB, a novel language and vision benchmark that requires multimodal models to combine complementary information from the two modalities. Recently, impressive progress has been made to develop universal multimodal encoders suitable for virtually any language and vision tasks. However, current approaches often require them to combine redundant information provided by language and vision. Inspired by real-life communicative contexts, we propose a novel task where either modality is necessary but not sufficient to make a correct prediction. To do so, we first build a dataset of images and corresponding sentences provided by human participants. 
Second, we evaluate state-of-the-art models and compare their performance against human speakers. We show that, while the task is relatively easy for humans, best-performing models struggle to achieve similar results. + 2020.findings-emnlp.248 + + + Cross-Lingual Training of Neural Models for Document Ranking + PengShi + HeBai + JimmyLin + 2768–2773 + We tackle the challenge of cross-lingual training of neural document ranking models for mono-lingual retrieval, specifically leveraging relevance judgments in English to improve search in non-English languages. Our work successfully applies multi-lingual BERT (mBERT) to document ranking and additionally compares against a number of alternatives: translating the training data, translating documents, multi-stage hybrids, and ensembles. Experiments on test collections in six different languages from diverse language families reveal many interesting findings: model-based relevance transfer using mBERT can significantly improve search quality in (non-English) mono-lingual retrieval, but other “low resource” approaches are competitive as well. + 2020.findings-emnlp.249 + + + Improving Word Embedding Factorization for Compression using Distilled Nonlinear Neural Decomposition + VasileiosLioutas + AhmadRashid + KrtinKumar + Md. AkmalHaidar + MehdiRezagholizadeh + 2774–2784 + Word-embeddings are vital components of Natural Language Processing (NLP) models and have been extensively explored. However, they consume a lot of memory which poses a challenge for edge deployment. Embedding matrices, typically, contain most of the parameters for language models and about a third for machine translation systems. In this paper, we propose Distilled Embedding, an (input/output) embedding compression method based on low-rank matrix decomposition and knowledge distillation. 
First, we initialize the weights of our decomposed matrices by learning to reconstruct the full pre-trained word-embedding and then fine-tune end-to-end, employing knowledge distillation on the factorized embedding. We conduct extensive experiments with various compression rates on machine translation and language modeling, using different data-sets with a shared word-embedding matrix for both embedding and vocabulary projection matrices. We show that the proposed technique is simple to replicate, with one fixed parameter controlling compression size, has higher BLEU score on translation and lower perplexity on language modeling compared to complex, difficult to tune state-of-the-art methods. + 2020.findings-emnlp.250 + 2020.findings-emnlp.250.OptionalSupplementaryMaterial.zip + + + <fixed-case>P</fixed-case>harm<fixed-case>MT</fixed-case>: A Neural Machine Translation Approach to Simplify Prescription Directions + JiazhaoLi + CoreyLester + XinyanZhao + YutingDing + YunJiang + V.G.VinodVydiswaran + 2785–2796 + The language used by physicians and health professionals in prescription directions includes medical jargon and implicit directives and causes much confusion among patients. Human intervention to simplify the language at the pharmacies may introduce additional errors that can lead to potentially severe health outcomes. We propose a novel machine translation-based approach, PharmMT, to automatically and reliably simplify prescription directions into patient-friendly language, thereby significantly reducing pharmacist workload. We evaluate the proposed approach over a dataset consisting of over 530K prescriptions obtained from a large mail-order pharmacy. The end-to-end system achieves a BLEU score of 60.27 against the reference directions generated by pharmacists, a 39.6% relative improvement over the rule-based normalization. Pharmacists judged 94.3% of the simplified directions as usable as-is or with minimal changes. 
This work demonstrates the feasibility of a machine translation-based tool for simplifying prescription directions in real-life. + 2020.findings-emnlp.251 + + + <fixed-case>LSTMS</fixed-case> Compose — and Learn — Bottom-Up + NaomiSaphra + AdamLopez + 2797–2809 + Recent work in NLP shows that LSTM language models capture compositional structure in language data. In contrast to existing work, we consider the learning process that leads to compositional behavior. For a closer look at how an LSTM’s sequential representations are composed hierarchically, we present a related measure of Decompositional Interdependence (DI) between word meanings in an LSTM, based on their gate interactions. We support this measure with experiments on English language data, where DI is higher on pairs of words with lower syntactic distance. To explore the inductive biases that cause these compositional representations to arise during training, we conduct simple experiments on synthetic data. These synthetic experiments support a specific hypothesis about how hierarchical structures are discovered over the course of training: that LSTM constituent representations are learned bottom-up, relying on effective representations of their shorter children, rather than on learning the longer-range relations independently. + 2020.findings-emnlp.252 + + + Natural Language Rationales with Full-Stack Visual Reasoning: From Pixels to Semantic Frames to Commonsense Graphs + AnaMarasović + ChandraBhagavatula + Jae sungPark + RonanLe Bras + Noah A.Smith + YejinChoi + 2810–2829 + Natural language rationales could provide intuitive, higher-level explanations that are easily understandable by humans, complementing the more broadly studied lower-level explanations based on gradients or attention weights. 
We present the first study focused on generating natural language rationales across several complex visual reasoning tasks: visual commonsense reasoning, visual-textual entailment, and visual question answering. The key challenge of accurate rationalization is comprehensive image understanding at all levels: not just their explicit content at the pixel level, but their contextual contents at the semantic and pragmatic levels. We present Rationale^VT Transformer, an integrated model that learns to generate free-text rationales by combining pretrained language models with object recognition, grounded visual semantic frames, and visual commonsense graphs. Our experiments show that free-text rationalization is a promising research direction to complement model interpretability for complex visual-textual reasoning tasks. In addition, we find that integration of richer semantic and pragmatic visual features improves visual fidelity of rationales. + 2020.findings-emnlp.253 + 2020.findings-emnlp.253.OptionalSupplementaryMaterial.pdf + + + Corpora Evaluation and System Bias detection in Multi Document Summarization + AlvinDey + TanyaChowdhury + YashKumar + TanmoyChakraborty + 2830–2840 + Multi-document summarization (MDS) is the task of reflecting key points from any set of documents into a concise text paragraph. In the past, it has been used to aggregate news, tweets, product reviews, etc. from various sources. Owing to no standard definition of the task, we encounter a plethora of datasets with varying levels of overlap and conflict between participating documents. There is also no standard regarding what constitutes summary information in MDS. Adding to the challenge is the fact that new systems report results on a set of chosen datasets, which might not correlate with their performance on the other datasets. In this paper, we study this heterogeneous task with the help of a few widely used MDS corpora and a suite of state-of-the-art models. 
We make an attempt to quantify the quality of summarization corpus and prescribe a list of points to consider while proposing a new MDS corpus. Next, we analyze the reason behind the absence of an MDS system which achieves superior performance across all corpora. We then observe the extent to which system metrics are influenced, and bias is propagated due to corpus properties. The scripts to reproduce the experiments in this work are available at https://github.com/LCS2-IIITD/summarization_bias.git + 2020.findings-emnlp.254 + 2020.findings-emnlp.254.OptionalSupplementaryMaterial.txt + + + Graph-to-Tree Neural Networks for Learning Structured Input-Output Translation with Applications to Semantic Parsing and Math Word Problem + ShuchengLi + LingfeiWu + ShiweiFeng + FangliXu + FengyuanXu + ShengZhong + 2841–2852 + The celebrated Seq2Seq technique and its numerous variants achieve excellent performance on many tasks such as neural machine translation, semantic parsing, and math word problem solving. However, these models either only consider input objects as sequences while ignoring the important structural information for encoding, or they simply treat output objects as sequence outputs instead of structural objects for decoding. In this paper, we present a novel Graph-to-Tree Neural Networks, namely Graph2Tree consisting of a graph encoder and a hierarchical tree decoder, that encodes an augmented graph-structured input and decodes a tree-structured output. In particular, we investigated our model for solving two problems, neural semantic parsing and math word problem. Our extensive experiments demonstrate that our Graph2Tree model outperforms or matches the performance of other state-of-the-art models on these tasks. 
+ 2020.findings-emnlp.255 + 2020.findings-emnlp.255.OptionalSupplementaryMaterial.pdf + + + Target Conditioning for One-to-Many Generation + Marie-AnneLachaux + ArmandJoulin + GuillaumeLample + 2853–2862 + Neural Machine Translation (NMT) models often lack diversity in their generated translations, even when paired with a search algorithm, like beam search. A challenge is that the diversity in translations is caused by the variability in the target language, and cannot be inferred from the source sentence alone. In this paper, we propose to explicitly model this one-to-many mapping by conditioning the decoder of an NMT model on a latent variable that represents the domain of target sentences. The domain is a discrete variable generated by a target encoder that is jointly trained with the NMT model. The predicted domain of target sentences is given as input to the decoder during training. At inference, we can generate diverse translations by decoding with different domains. Unlike our strongest baseline (Shen et al., 2019), our method can scale to any number of domains without affecting the performance or the training time. We assess the quality and diversity of translations generated by our model with several metrics, on three different datasets. + 2020.findings-emnlp.256 + + + Can Pre-training help <fixed-case>VQA</fixed-case> with Lexical Variations? + ShailzaJolly + ShubhamKapoor + 2863–2868 + Rephrasings or paraphrases are sentences with similar meanings expressed in different ways. Visual Question Answering (VQA) models are closing the gap with the oracle performance for datasets like VQA2.0. However, these models fail to perform well on rephrasings of a question, which raises some important questions like Are these models robust towards linguistic variations? Is it the architecture or the dataset that we need to optimize? In this paper, we analyzed VQA models in the space of paraphrasing. 
We explored the role of language & cross-modal pre-training to investigate the robustness of VQA models towards lexical variations. Our experiments find that pre-trained language encoders generate efficient representations of question rephrasings, which help VQA models correctly infer these samples. We empirically determine why pre-training language encoders improve lexical robustness. Finally, we observe that although pre-training all VQA components obtain state-of-the-art results on the VQA-Rephrasings dataset, it still fails to completely close the performance gap between original and rephrasing validation splits. + 2020.findings-emnlp.257 + 2020.findings-emnlp.257.OptionalSupplementaryMaterial.pdf + + + <fixed-case>FENAS</fixed-case>: Flexible and Expressive Neural Architecture Search + RamakanthPasunuru + MohitBansal + 2869–2876 + Architecture search is the automatic process of designing the model or cell structure that is optimal for the given dataset or task. Recently, this approach has shown good improvements in terms of performance (tested on language modeling and image classification) with reasonable training speed using a weight sharing-based approach called Efficient Neural Architecture Search (ENAS). In this work, we propose a novel architecture search algorithm called Flexible and Expressible Neural Architecture Search (FENAS), with more flexible and expressible search space than ENAS, in terms of more activation functions, input edges, and atomic operations. Also, our FENAS approach is able to reproduce the well-known LSTM and GRU architectures (unlike ENAS), and is also able to initialize with them for finding architectures more efficiently. We explore this extended search space via evolutionary search and show that FENAS performs significantly better on several popular text classification tasks and performs similar to ENAS on standard language model benchmark. Further, we present ablations and analyses on our FENAS approach. 
+ 2020.findings-emnlp.258 + + + Inferring symmetry in natural language + ChelseaTanchip + LeiYu + AotaoXu + YangXu + 2877–2886 + We present a methodological framework for inferring symmetry of verb predicates in natural language. Empirical work on predicate symmetry has taken two main approaches. The feature-based approach focuses on linguistic features pertaining to symmetry. The context-based approach denies the existence of absolute symmetry but instead argues that such inference is context dependent. We develop methods that formalize these approaches and evaluate them against a novel symmetry inference sentence (SIS) dataset comprised of 400 naturalistic usages of literature-informed verbs spanning the spectrum of symmetry-asymmetry. Our results show that a hybrid transfer learning model that integrates linguistic features with contextualized language models most faithfully predicts the empirical data. Our work integrates existing approaches to symmetry in natural language and suggests how symmetry inference can improve systematicity in state-of-the-art language models. + 2020.findings-emnlp.259 + + + A Concise Model for Multi-Criteria <fixed-case>C</fixed-case>hinese Word Segmentation with Transformer Encoder + XipengQiu + HengzhiPei + HangYan + XuanjingHuang + 2887–2897 + Multi-criteria Chinese word segmentation (MCCWS) aims to exploit the relations among the multiple heterogeneous segmentation criteria and further improve the performance of each single criterion. Previous work usually regards MCCWS as different tasks, which are learned together under the multi-task learning framework. In this paper, we propose a concise but effective unified model for MCCWS, which is fully-shared for all the criteria. By leveraging the powerful ability of the Transformer encoder, the proposed unified model can segment Chinese text according to a unique criterion-token indicating the output criterion. 
Besides, the proposed unified model can segment both simplified and traditional Chinese and has an excellent transfer capability. Experiments on eight datasets with different criteria show that our model outperforms our single-criterion baseline model and other multi-criteria models. Source codes of this paper are available on Github. + 2020.findings-emnlp.260 + + + <fixed-case>LEGAL</fixed-case>-<fixed-case>BERT</fixed-case>: “Preparing the Muppets for Court” + IliasChalkidis + ManosFergadiotis + ProdromosMalakasiotis + NikolaosAletras + IonAndroutsopoulos + 2898–2904 + BERT has achieved impressive performance in several NLP tasks. However, there has been limited investigation on its adaptation guidelines in specialised domains. Here we focus on the legal domain, where we explore several approaches for applying BERT models to downstream legal tasks, evaluating on multiple datasets. Our findings indicate that the previous guidelines for pre-training and fine-tuning, often blindly followed, do not always generalize well in the legal domain. Thus we propose a systematic investigation of the available strategies when applying BERT in specialised domains. These are: (a) use the original BERT out of the box, (b) adapt BERT by additional pre-training on domain-specific corpora, and (c) pre-train BERT from scratch on domain-specific corpora. We also propose a broader hyper-parameter search space when fine-tuning for downstream tasks and we release LEGAL-BERT, a family of BERT models intended to assist legal NLP research, computational law, and legal technology applications. 
+ 2020.findings-emnlp.261 + 2020.findings-emnlp.261.OptionalSupplementaryMaterial.pdf + + + Enhancing Content Planning for Table-to-Text Generation with Data Understanding and Verification + HengGong + WeiBi + XiaochengFeng + BingQin + XiaojiangLiu + TingLiu + 2905–2914 + Neural table-to-text models, which select and order salient data, as well as verbalizing them fluently via surface realization, have achieved promising progress. Based on results from previous work, the performance bottleneck of current models lies in the stage of content planning (selecting and ordering salient content from the input). That is, performance drops drastically when an oracle content plan is replaced by a model-inferred one during surface realization. In this paper, we propose to enhance neural content planning by (1) understanding data values with contextual numerical value representations that bring the sense of value comparison into content planning; (2) verifying the importance and ordering of the selected sequence of records with policy gradient. We evaluated our model on ROTOWIRE and MLB, two datasets on this task, and results show that our model outperforms existing systems with respect to content planning metrics. + 2020.findings-emnlp.262 + 2020.findings-emnlp.262.OptionalSupplementaryMaterial.zip + + + Contextual Text Style Transfer + YuCheng + ZheGan + YizheZhang + OussamaElachqar + DianqiLi + JingjingLiu + 2915–2924 + We introduce a new task, Contextual Text Style Transfer - translating a sentence into a desired style with its surrounding context taken into account. This brings two key challenges to existing style transfer approaches: (i) how to preserve the semantic meaning of target sentence and its consistency with surrounding context during transfer; (ii) how to train a robust model with limited labeled data accompanied by context. 
To realize high-quality style transfer with natural context preservation, we propose a Context-Aware Style Transfer (CAST) model, which uses two separate encoders for each input sentence and its surrounding context. A classifier is further trained to ensure contextual consistency of the generated sentence. To compensate for the lack of parallel data, additional self-reconstruction and back-translation losses are introduced to leverage non-parallel data in a semi-supervised fashion. Two new benchmarks, Enron-Context and Reddit-Context, are introduced for formality and offensiveness style transfer. Experimental results on these datasets demonstrate the effectiveness of the proposed CAST model over state-of-the-art methods across style accuracy, content preservation and contextual consistency metrics. + 2020.findings-emnlp.263 + + + <fixed-case>D</fixed-case>i<fixed-case>P</fixed-case>air: Fast and Accurate Distillation for Trillion-<fixed-case>S</fixed-case>cale <fixed-case>T</fixed-case>ext Matching and Pair Modeling + JiecaoChen + LiuYang + KarthikRaman + MichaelBendersky + Jung-JungYeh + YunZhou + MarcNajork + DanyangCai + EhsanEmadzadeh + 2925–2937 + Pre-trained models like BERT (Devlin et al., 2018) have dominated NLP / IR applications such as single sentence classification, text pair classification, and question answering. However, deploying these models in real systems is highly non-trivial due to their exorbitant computational costs. A common remedy to this is knowledge distillation (Hinton et al., 2015), leading to faster inference. However – as we show here – existing works are not optimized for dealing with pairs (or tuples) of texts. Consequently, they are either not scalable or demonstrate subpar performance. In this work, we propose DiPair — a novel framework for distilling fast and accurate models on text pair tasks. Coupled with an end-to-end training strategy, DiPair is both highly scalable and offers improved quality-speed tradeoffs. 
Empirical studies conducted on both academic and real-world e-commerce benchmarks demonstrate the efficacy of the proposed approach with speedups of over 350x and minimal quality drop relative to the cross-attention teacher BERT model. + 2020.findings-emnlp.264 + + + Cross-Lingual Dependency Parsing by <fixed-case>POS</fixed-case>-Guided Word Reordering + LuLiu + YiZhou + JianhanXu + XiaoqingZheng + Kai-WeiChang + XuanjingHuang + 2938–2948 + We propose a novel approach to cross-lingual dependency parsing based on word reordering. The words in each sentence of a source language corpus are rearranged to meet the word order in a target language under the guidance of a part-of-speech based language model (LM). To obtain the highest reordering score under the LM, a population-based optimization algorithm and its genetic operators are designed to deal with the combinatorial nature of such word reordering. A parser trained on the reordered corpus then can be used to parse sentences in the target language. We demonstrate through extensive experimentation that our approach achieves better or comparable results across 25 target languages (1.73% increase in average), and outperforms a baseline by a significant margin on the languages that are greatly different from the source one. For example, when transferring the English parser to Hindi and Latin, our approach outperforms the baseline by 15.3% and 6.7% respectively. + 2020.findings-emnlp.265 + + + Assessing Robustness of Text Classification through Maximal Safe Radius Computation + EmanueleLa Malfa + MinWu + LucaLaurenti + BenjieWang + AnthonyHartshorn + MartaKwiatkowska + 2949–2968 + Neural network NLP models are vulnerable to small modifications of the input that maintain the original meaning but result in a different prediction. 
In this paper, we focus on robustness of text classification against word substitutions, aiming to provide guarantees that the model prediction does not change if a word is replaced with a plausible alternative, such as a synonym. As a measure of robustness, we adopt the notion of the maximal safe radius for a given input text, which is the minimum distance in the embedding space to the decision boundary. Since computing the exact maximal safe radius is not feasible in practice, we instead approximate it by computing a lower and upper bound. For the upper bound computation, we employ Monte Carlo Tree Search in conjunction with syntactic filtering to analyse the effect of single and multiple word substitutions. The lower bound computation is achieved through an adaptation of the linear bounding techniques implemented in tools CNN-Cert and POPQORN, respectively for convolutional and recurrent network models. We evaluate the methods on sentiment analysis and news classification models for four datasets (IMDB, SST, AG News and NEWS) and a range of embeddings, and provide an analysis of robustness trends. We also apply our framework to interpretability analysis and compare it with LIME. + 2020.findings-emnlp.266 + + + Social Commonsense Reasoning with Multi-Head Knowledge Attention + DebjitPaul + AnetteFrank + 2969–2980 + Social Commonsense Reasoning requires understanding of text, knowledge about social events and their pragmatic implications, as well as commonsense reasoning skills. In this work we propose a novel multi-head knowledge attention model that encodes semi-structured commonsense inference rules and learns to incorporate them in a transformer-based reasoning cell. We assess the model’s performance on two tasks that require different reasoning skills: Abductive Natural Language Inference and Counterfactual Invariance Prediction as a new task. 
We show that our proposed model improves performance over strong state-of-the-art models (i.e., RoBERTa) across both reasoning tasks. Notably we are, to the best of our knowledge, the first to demonstrate that a model that learns to perform counterfactual reasoning helps predicting the best explanation in an abductive reasoning task. We validate the robustness of the model’s reasoning capabilities by perturbing the knowledge and provide qualitative analysis on the model’s knowledge incorporation capabilities. + 2020.findings-emnlp.267 + 2020.findings-emnlp.267.OptionalSupplementaryMaterial.zip + + + <fixed-case>T</fixed-case>urn<fixed-case>GPT</fixed-case>: a Transformer-based Language Model for Predicting Turn-taking in Spoken Dialog + ErikEkstedt + GabrielSkantze + 2981–2990 + Syntactic and pragmatic completeness is known to be important for turn-taking prediction, but so far machine learning models of turn-taking have used such linguistic information in a limited way. In this paper, we introduce TurnGPT, a transformer-based language model for predicting turn-shifts in spoken dialog. The model has been trained and evaluated on a variety of written and spoken dialog datasets. We show that the model outperforms two baselines used in prior work. We also report on an ablation study, as well as attention and gradient analyses, which show that the model is able to utilize the dialog context and pragmatic completeness for turn-taking prediction. Finally, we explore the model’s potential in not only detecting, but also projecting, turn-completions. + 2020.findings-emnlp.268 + + + A little goes a long way: Improving toxic language classification despite data scarcity + MikaJuuti + TommiGröndahl + AdrianFlanagan + N.Asokan + 2991–3009 + Detection of some types of toxic language is hampered by extreme scarcity of labeled training data. Data augmentation – generating new synthetic data from a labeled seed dataset – can help. 
The efficacy of data augmentation on toxic language classification has not been fully explored. We present the first systematic study on how data augmentation techniques impact performance across toxic language classifiers, ranging from shallow logistic regression architectures to BERT – a state-of-the-art pretrained Transformer network. We compare the performance of eight techniques on very scarce seed datasets. We show that while BERT performed the best, shallow classifiers performed comparably when trained on data augmented with a combination of three techniques, including GPT-2-generated sentences. We discuss the interplay of performance and computational overhead, which can inform the choice of techniques under different constraints. + 2020.findings-emnlp.269 + + + An Instance Level Approach for Shallow Semantic Parsing in Scientific Procedural Text + DaivikSwarup + AhsaasBajaj + ShesheraMysore + TimO’Gorman + RajarshiDas + AndrewMcCallum + 3010–3017 + In specific domains, such as procedural scientific text, human labeled data for shallow semantic parsing is especially limited and expensive to create. Fortunately, such specific domains often use rather formulaic writing, such that the different ways of expressing relations in a small number of grammatically similar labeled sentences may provide high coverage of semantic structures in the corpus, through an appropriately rich similarity metric. In light of this opportunity, this paper explores an instance-based approach to the relation prediction sub-task within shallow semantic parsing, in which semantic labels from structurally similar sentences in the training set are copied to test sentences. Candidate similar sentences are retrieved using SciBERT embeddings. For labels where it is possible to copy from a similar sentence we employ an instance level copy network, when this is not possible, a globally shared parametric model is employed. 
Experiments show our approach outperforms both baseline and prior methods by 0.75 to 3 F1 absolute in the Wet Lab Protocol Corpus and 1 F1 absolute in the Materials Science Procedural Text Corpus. + 2020.findings-emnlp.270 + + + General Purpose Text Embeddings from Pre-trained Language Models for Scalable Inference + JingfeiDu + MyleOtt + HaoranLi + XingZhou + VeselinStoyanov + 3018–3030 + The state of the art on many NLP tasks is currently achieved by large pre-trained language models, which require a considerable amount of computation. We aim to reduce the inference cost in a setting where many different predictions are made on a single piece of text. In that case, computational cost during inference can be amortized over the different predictions (tasks) using a shared text encoder. We compare approaches for training such an encoder and show that encoders pre-trained over multiple tasks generalize well to unseen tasks. We also compare ways of extracting fixed- and limited-size representations from this encoder, including pooling features extracted from multiple layers or positions. Our best approach compares favorably to knowledge distillation, achieving higher accuracy and lower computational cost once the system is handling around 7 tasks. Further, we show that through binary quantization, we can reduce the size of the extracted representations by a factor of 16 to store them for later use. The resulting method offers a compelling solution for using large-scale pre-trained models at a fraction of the computational cost when multiple tasks are performed on the same text. + 2020.findings-emnlp.271 + + + Learning to Model and Ignore Dataset Bias with Mixed Capacity Ensembles + ChristopherClark + MarkYatskar + LukeZettlemoyer + 3031–3045 + Many datasets have been shown to contain incidental correlations created by idiosyncrasies in the data collection process. 
For example, sentence entailment datasets can have spurious word-class correlations if nearly all contradiction sentences contain the word “not”, and image recognition datasets can have tell-tale object-background correlations if dogs are always indoors. In this paper, we propose a method that can automatically detect and ignore these kinds of dataset-specific patterns, which we call dataset biases. Our method trains a lower capacity model in an ensemble with a higher capacity model. During training, the lower capacity model learns to capture relatively shallow correlations, which we hypothesize are likely to reflect dataset bias. This frees the higher capacity model to focus on patterns that should generalize better. We ensure the models learn non-overlapping approaches by introducing a novel method to make them conditionally independent. Importantly, our approach does not require the bias to be known in advance. We evaluate performance on synthetic datasets, and four datasets built to penalize models that exploit known biases on textual entailment, visual question answering, and image recognition tasks. We show improvement in all settings, including a 10 point gain on the visual question answering dataset. + 2020.findings-emnlp.272 + + + Learning to Generalize for Sequential Decision Making + XusenYin + RalphWeischedel + JonathanMay + 3046–3063 + We consider problems of making sequences of decisions to accomplish tasks, interacting via the medium of language. These problems are often tackled with reinforcement learning approaches. We find that these models do not generalize well when applied to novel task domains. However, the large amount of computation necessary to adequately train and explore the search space of sequential decision making, under a reinforcement learning paradigm, precludes the inclusion of large contextualized language models, which might otherwise enable the desired generalization ability. 
We introduce a teacher-student imitation learning methodology and a means of converting a reinforcement learning model into a natural language understanding model. Together, these methodologies enable the introduction of contextualized language models into the sequential decision making problem space. We show that models can learn faster and generalize more, leveraging both the imitation learning and the reformulation. Our models exceed teacher performance on various held-out decision problems, by up to 7% on in-domain problems and 24% on out-of-domain problems. + 2020.findings-emnlp.273 + + + Effective Crowd-Annotation of Participants, Interventions, and Outcomes in the Text of Clinical Trial Reports + MarkusZlabinger + MartaSabou + SebastianHofstätter + AllanHanbury + 3064–3074 + The search for Participants, Interventions, and Outcomes (PIO) in clinical trial reports is a critical task in Evidence Based Medicine. For an automatic PIO extraction, high-quality corpora are needed. Obtaining such a corpus from crowdworkers, however, has been shown to be ineffective since (i) workers usually lack domain-specific expertise to conduct the task with sufficient quality, and (ii) the standard approach of annotating entire abstracts of trial reports as one task-instance (i.e. HIT) leads to an uneven distribution in task effort. In this paper, we switch from entire abstract to sentence annotation, referred to as the SenBase approach. We build upon SenBase in SenSupport, where we compensate the lack of domain-specific expertise of crowdworkers by showing for each task-instance similar sentences that are already annotated by experts. Such tailored task-instance examples are retrieved via unsupervised semantic short-text similarity (SSTS) method – and we evaluate nine methods to find an effective solution for SenSupport. 
We compute the Cohen’s Kappa agreement between crowd-annotations and gold standard annotations and show that (i) both sentence-based approaches outperform a Baseline approach where entire abstracts are annotated; (ii) supporting annotators with tailored task-instance examples is the best performing approach with Kappa agreements of 0.78/0.75/0.69 for P, I, and O respectively. + 2020.findings-emnlp.274 + 2020.findings-emnlp.274.OptionalSupplementaryMaterial.zip + + + Adversarial Grammatical Error Correction + VipulRaheja + DimitrisAlikaniotis + 3075–3087 + Recent works in Grammatical Error Correction (GEC) have leveraged the progress in Neural Machine Translation (NMT), to learn rewrites from parallel corpora of grammatically incorrect and corrected sentences, achieving state-of-the-art results. At the same time, Generative Adversarial Networks (GANs) have been successful in generating realistic texts across many different tasks by learning to directly minimize the difference between human-generated and synthetic text. In this work, we present an adversarial learning approach to GEC, using the generator-discriminator framework. The generator is a Transformer model, trained to produce grammatically correct sentences given grammatically incorrect ones. The discriminator is a sentence-pair classification model, trained to judge a given pair of grammatically incorrect-correct sentences on the quality of grammatical correction. We pre-train both the discriminator and the generator on parallel texts and then fine-tune them further using a policy gradient method that assigns high rewards to sentences which could be true corrections of the grammatically incorrect text. Experimental results on FCE, CoNLL-14, and BEA-19 datasets show that Adversarial-GEC can achieve competitive GEC quality compared to NMT-based baselines. 
+ 2020.findings-emnlp.275 + 2020.findings-emnlp.275.OptionalSupplementaryMaterial.pdf + + + On Long-Tailed Phenomena in Neural Machine Translation + VikasRaunak + SiddharthDalmia + VivekGupta + FlorianMetze + 3088–3095 + State-of-the-art Neural Machine Translation (NMT) models struggle with generating low-frequency tokens, tackling which remains a major challenge. The analysis of long-tailed phenomena in the context of structured prediction tasks is further hindered by the added complexities of search during inference. In this work, we quantitatively characterize such long-tailed phenomena at two levels of abstraction, namely, token classification and sequence generation. We propose a new loss function, the Anti-Focal loss, to better adapt model training to the structural dependencies of conditional text generation by incorporating the inductive biases of beam search in the training process. We show the efficacy of the proposed technique on a number of Machine Translation (MT) datasets, demonstrating that it leads to significant gains over cross-entropy across different language pairs, especially on the generation of low-frequency words. We have released the code to reproduce our results. + 2020.findings-emnlp.276 + + + Knowing What You Know: Calibrating Dialogue Belief State Distributions via Ensembles + Carelvan Niekerk + MichaelHeck + ChristianGeishauser + Hsien-chinLin + NurulLubis + MarcoMoresi + MilicaGasic + 3096–3102 + The ability to accurately track what happens during a conversation is essential for the performance of a dialogue system. Current state-of-the-art multi-domain dialogue state trackers achieve just over 55% accuracy on the current go-to benchmark, which means that in almost every second dialogue turn they place full confidence in an incorrect dialogue state. Belief trackers, on the other hand, maintain a distribution over possible dialogue states. 
However, they lack in performance compared to dialogue state trackers, and do not produce well calibrated distributions. In this work we present state-of-the-art performance in calibration for multi-domain dialogue belief trackers using a calibrated ensemble of models. Our resulting dialogue belief tracker also outperforms previous dialogue belief tracking models in terms of accuracy. + 2020.findings-emnlp.277 + + + Domain Adversarial Fine-Tuning as an Effective Regularizer + GiorgosVernikos + KaterinaMargatina + AlexandraChronopoulou + IonAndroutsopoulos + 3103–3112 + In Natural Language Processing (NLP), pretrained language models (LMs) that are transferred to downstream tasks have been recently shown to achieve state-of-the-art results. However, standard fine-tuning can degrade the general-domain representations captured during pretraining. To address this issue, we introduce a new regularization technique, AFTER; domain Adversarial Fine-Tuning as an Effective Regularizer. Specifically, we complement the task-specific loss used during fine-tuning with an adversarial objective. This additional loss term is related to an adversarial classifier, that aims to discriminate between in-domain and out-of-domain text representations. In-domain refers to the labeled dataset of the task at hand while out-of-domain refers to unlabeled data from a different domain. Intuitively, the adversarial classifier acts as a regularizer which prevents the model from overfitting to the task-specific domain. Empirical results on various natural language understanding tasks show that AFTER leads to improved performance compared to standard fine-tuning. + 2020.findings-emnlp.278 + + + <fixed-case>CLAR</fixed-case>: A Cross-Lingual Argument Regularizer for Semantic Role Labeling + IshanJindal + YunyaoLi + SiddharthaBrahma + HuaiyuZhu + 3113–3125 + Semantic role labeling (SRL) identifies predicate-argument structure(s) in a given sentence. 
Although different languages have different argument annotations, polyglot training, the idea of training one model on multiple languages, has previously been shown to outperform monolingual baselines, especially for low resource languages. In fact, even a simple combination of data has been shown to be effective with polyglot training by representing the distant vocabularies in a shared representation space. Meanwhile, despite the dissimilarity in argument annotations between languages, certain argument labels do share common semantic meaning across languages (e.g. adjuncts have more or less similar semantic meaning across languages). To leverage such similarity in annotation space across languages, we propose a method called Cross-Lingual Argument Regularizer (CLAR). CLAR identifies such linguistic annotation similarity across languages and exploits this information to map the target language arguments using a transformation of the space on which source language arguments lie. By doing so, our experimental results show that CLAR consistently improves SRL performance on multiple languages over monolingual and polyglot baselines for low resource languages. + 2020.findings-emnlp.279 + + + Neutralizing Gender Bias in Word Embedding with Latent Disentanglement and Counterfactual Generation + SeungjaeShin + KyungwooSong + JoonHoJang + HyemiKim + WeonyoungJoo + Il-ChulMoon + 3126–3140 + Recent research demonstrates that word embeddings, trained on the human-generated corpus, have strong gender biases in embedding spaces, and these biases can result in the discriminative results from the various downstream tasks. Whereas the previous methods project word embeddings into a linear subspace for debiasing, we introduce a Latent Disentanglement method with a siamese auto-encoder structure with an adapted gradient reversal layer. 
Our structure enables the separation of the semantic latent information and gender latent information of a given word into the disjoint latent dimensions. Afterwards, we introduce a Counterfactual Generation to convert the gender information of words, so the original and the modified embeddings can produce a gender-neutralized word embedding after geometric alignment regularization, without loss of semantic information. From the various quantitative and qualitative debiasing experiments, our method shows to be better than existing debiasing methods in debiasing word embeddings. In addition, our method shows the ability to preserve semantic information during debiasing by minimizing the semantic information losses for extrinsic NLP downstream tasks. + 2020.findings-emnlp.280 + + + Towards Domain-Independent Text Structuring Trainable on Large Discourse Treebanks + GrigoriiGuz + GiuseppeCarenini + 3141–3152 + Text structuring is a fundamental step in NLG, especially when generating multi-sentential text. With the goal of fostering more general and data-driven approaches to text structuring, we propose the new and domain-independent NLG task of structuring and ordering a (possibly large) set of EDUs. We then present a solution for this task that combines neural dependency tree induction with pointer networks, and can be trained on large discourse treebanks that have only recently become available. Further, we propose a new evaluation metric that is arguably more suitable for our new task compared to existing content ordering metrics. Finally, we empirically show that our approach outperforms competitive alternatives on the proposed measure and is equivalent in performance with respect to previously established measures. + 2020.findings-emnlp.281 + + + Data Annealing for Informal Language Understanding Tasks + JingGu + ZhouYu + 3153–3159 + There is a huge performance gap between formal and informal language understanding tasks. 
The recent pre-trained models that improved formal language understanding tasks did not achieve a comparable result on informal language. We propose data annealing transfer learning procedure to bridge the performance gap on informal natural language understanding tasks. It successfully utilizes a pre-trained model such as BERT in informal language. In the data annealing procedure, the training set contains mainly formal text data at first; then, the proportion of the informal text data is gradually increased during the training process. Our data annealing procedure is model-independent and can be applied to various tasks. We validate its effectiveness in exhaustive experiments. When BERT is implemented with our learning procedure, it outperforms all the state-of-the-art models on the three common informal language tasks. + 2020.findings-emnlp.282 + + + A Multilingual View of Unsupervised Machine Translation + XavierGarcia + PierreForet + ThibaultSellam + AnkurParikh + 3160–3170 + We present a probabilistic framework for multilingual neural machine translation that encompasses supervised and unsupervised setups, focusing on unsupervised translation. In addition to studying the vanilla case where there is only monolingual data available, we propose a novel setup where one language in the (source, target) pair is not associated with any parallel data, but there may exist auxiliary parallel data that contains the other. This auxiliary data can naturally be utilized in our probabilistic framework via a novel cross-translation loss term. Empirically, we show that our approach results in higher BLEU scores over state-of-the-art unsupervised models on the WMT’14 English-French, WMT’16 English-German, and WMT’16 English-Romanian datasets in most directions. 
+ 2020.findings-emnlp.283 + + + An Evaluation Method for Diachronic Word Sense Induction + AshjanAlsulaimani + ErwanMoreau + CarlVogel + 3171–3180 + The task of Diachronic Word Sense Induction (DWSI) aims to identify the meaning of words from their context, taking the temporal dimension into account. In this paper we propose an evaluation method based on large-scale time-stamped annotated biomedical data, and a range of evaluation measures suited to the task. The approach is applied to two recent DWSI systems, thus demonstrating its relevance and providing an in-depth analysis of the models. + 2020.findings-emnlp.284 + 2020.findings-emnlp.284.OptionalSupplementaryMaterial.zip + + + Integrating Task Specific Information into Pretrained Language Models for Low Resource Fine Tuning + RuiWang + ShijingSi + GuoyinWang + LeiZhang + LawrenceCarin + RicardoHenao + 3181–3186 + Pretrained Language Models (PLMs) have improved the performance of natural language understanding in recent years. Such models are pretrained on large corpora, which encode the general prior knowledge of natural languages but are agnostic to information characteristic of downstream tasks. This often results in overfitting when fine-tuned with low resource datasets where task-specific information is limited. In this paper, we integrate label information as a task-specific prior into the self-attention component of pretrained BERT models. Experiments on several benchmarks and real-world datasets suggest that the proposed approach can largely improve the performance of pretrained models when fine-tuning with small datasets. + 2020.findings-emnlp.285 + + + Efficient Transformer-based Large Scale Language Representations using Hardware-friendly Block Structured Pruning + BingbingLi + ZhenglunKong + TianyunZhang + JiLi + ZhengangLi + HangLiu + CaiwenDing + 3187–3199 + Pretrained large-scale language models have increasingly demonstrated high accuracy on many natural language processing (NLP) tasks. 
However, the limited weight storage and computational speed on hardware platforms have impeded the popularity of pretrained models, especially in the era of edge computing. In this work, we propose an efficient transformer-based large-scale language representation using hardware-friendly block structure pruning. We incorporate the reweighted group Lasso into block-structured pruning for optimization. Besides the significantly reduced weight storage and computation, the proposed approach achieves high compression rates. Experimental results on different models (BERT, RoBERTa, and DistilBERT) on the General Language Understanding Evaluation (GLUE) benchmark tasks show that we achieve up to 5.0x with zero or minor accuracy degradation on certain task(s). Our proposed method is also orthogonal to existing compact pretrained language models such as DistilBERT using knowledge distillation, since a further 1.79x average compression rate can be achieved on top of DistilBERT with zero or minor accuracy degradation. It is suitable to deploy the final compressed model on resource-constrained edge devices. + 2020.findings-emnlp.286 + + + <fixed-case>K</fixed-case>o<fixed-case>BE</fixed-case>: Knowledge-Based Machine Translation Evaluation + ZorikGekhman + RoeeAharoni + GenadyBeryozkin + MarkusFreitag + WolfgangMacherey + 3200–3207 + We propose a simple and effective method for machine translation evaluation which does not require reference translations. Our approach is based on (1) grounding the entity mentions found in each source sentence and candidate translation against a large-scale multilingual knowledge base, and (2) measuring the recall of the grounded entities found in the candidate vs. those found in the source. Our approach achieves the highest correlation with human judgements on 9 out of the 18 language pairs from the WMT19 benchmark for evaluation without references, which is the largest number of wins for a single evaluation method on this task. 
On 4 language pairs, we also achieve higher correlation with human judgements than BLEU. To foster further research, we release a dataset containing 1.8 million grounded entity mentions across 18 language pairs from the WMT19 metrics track data. + 2020.findings-emnlp.287 + + + Pushing the Limits of <fixed-case>AMR</fixed-case> Parsing with Self-Learning + Young-SukLee + RamónFernandez Astudillo + TahiraNaseem + RevanthGangi Reddy + RaduFlorian + SalimRoukos + 3208–3214 + Abstract Meaning Representation (AMR) parsing has experienced a notable growth in performance in the last two years, due both to the impact of transfer learning and the development of novel architectures specific to AMR. At the same time, self-learning techniques have helped push the performance boundaries of other natural language processing applications, such as machine translation or question answering. In this paper, we explore different ways in which trained models can be applied to improve AMR parsing performance, including generation of synthetic text and AMR annotations as well as refinement of actions oracle. We show that, without any additional human annotations, these techniques improve an already performant parser and achieve state-of-the-art results on AMR 1.0 and AMR 2.0. + 2020.findings-emnlp.288 + + + Towards Zero Shot Conditional Summarization with Adaptive Multi-task Fine-Tuning + TravisGoodwin + MaxSavery + DinaDemner-Fushman + 3215–3226 + Automatic summarization research has traditionally focused on providing high quality general-purpose summaries of documents. However, there are many applications which require more specific summaries, such as supporting question answering or topic-based literature discovery. In this paper we study the problem of conditional summarization in which content selection and surface realization are explicitly conditioned on an ad-hoc natural language question or topic description. 
Because of the difficulty in obtaining sufficient reference summaries to support arbitrary conditional summarization, we explore the use of multi-task fine-tuning (MTFT) on twenty-one natural language tasks to enable zero-shot conditional summarization on five tasks. We present four new summarization datasets, two novel “online” or adaptive task-mixing strategies, and report zero-shot performance using T5 and BART, demonstrating that MTFT can improve zero-shot summarization quality. + 2020.findings-emnlp.289 + + + Multilingual Knowledge Graph Completion via Ensemble Knowledge Transfer + XueluChen + MuhaoChen + ChangjunFan + AnkithUppunda + YizhouSun + CarloZaniolo + 3227–3238 + Predicting missing facts in a knowledge graph (KG) is a crucial task in knowledge base construction and reasoning, and it has been the subject of much research in recent works using KG embeddings. While existing KG embedding approaches mainly learn and predict facts within a single KG, a more plausible solution would benefit from the knowledge in multiple language-specific KGs, considering that different KGs have their own strengths and limitations on data quality and coverage. This is quite challenging since the transfer of knowledge among multiple independently maintained KGs is often hindered by the insufficiency of alignment information and inconsistency of described facts. In this paper, we propose KEnS, a novel framework for embedding learning and ensemble knowledge transfer across a number of language-specific KGs. KEnS embeds all KGs in a shared embedding space, where the association of entities is captured based on self-learning. Then, KEnS performs ensemble inference to combine prediction results from multiple language-specific embeddings, for which multiple ensemble techniques are investigated. 
Experiments on the basis of five real-world language-specific KGs show that, by effectively identifying and leveraging complementary knowledge, KEnS consistently improves state-of-the-art methods on KG completion. + 2020.findings-emnlp.290 + + + Towards Controllable Biases in Language Generation + EmilySheng + Kai-WeiChang + PremNatarajan + NanyunPeng + 3239–3254 + We present a general approach towards controllable societal biases in natural language generation (NLG). Building upon the idea of adversarial triggers, we develop a method to induce societal biases in generated text when input prompts contain mentions of specific demographic groups. We then analyze two scenarios: 1) inducing negative biases for one demographic and positive biases for another demographic, and 2) equalizing biases between demographics. The former scenario enables us to detect the types of biases present in the model. Specifically, we show the effectiveness of our approach at facilitating bias analysis by finding topics that correspond to demographic inequalities in generated text and comparing the relative effectiveness of inducing biases for different demographics. The second scenario is useful for mitigating biases in downstream applications such as dialogue generation. In our experiments, the mitigation technique proves to be effective at equalizing the amount of biases across demographics while simultaneously generating less negatively biased text overall. + 2020.findings-emnlp.291 + + + <fixed-case>R</fixed-case>ob<fixed-case>BERT</fixed-case>: a <fixed-case>D</fixed-case>utch <fixed-case>R</fixed-case>o<fixed-case>BERT</fixed-case>a-based Language Model + PieterDelobelle + ThomasWinters + BettinaBerendt + 3255–3265 + Pre-trained language models have been dominating the field of natural language processing in recent years, and have led to significant performance gains for various complex natural language tasks. 
One of the most prominent pre-trained language models is BERT, which was released as an English as well as a multilingual version. Although multilingual BERT performs well on many tasks, recent studies show that BERT models trained on a single language significantly outperform the multilingual version. Training a Dutch BERT model thus has a lot of potential for a wide range of Dutch NLP tasks. While previous approaches have used earlier implementations of BERT to train a Dutch version of BERT, we used RoBERTa, a robustly optimized BERT approach, to train a Dutch language model called RobBERT. We measured its performance on various tasks as well as the importance of the fine-tuning dataset size. We also evaluated the importance of language-specific tokenizers and the model’s fairness. We found that RobBERT improves state-of-the-art results for various tasks, and especially significantly outperforms other models when dealing with smaller datasets. These results indicate that it is a powerful pre-trained model for a large variety of Dutch language tasks. The pre-trained and fine-tuned models are publicly available to support further downstream Dutch NLP applications. + 2020.findings-emnlp.292 + + + Regularization of Distinct Strategies for Unsupervised Question Generation + JunmoKang + GiwonHong + HaritzPuerto San Roman + Sung-HyonMyaeng + 3266–3277 + Unsupervised question answering (UQA) has been proposed to avoid the high cost of creating high-quality datasets for QA. One approach to UQA is to train a QA model with questions generated automatically. However, the generated questions are either too similar to a word sequence in the context or too drifted from the semantics of the context, thereby making it difficult to train a robust QA model. We propose a novel regularization method based on teacher-student architecture to avoid bias toward a particular question generation strategy and modulate the process of generating individual words when a question is generated. 
Our experiments demonstrate that we have achieved the goal of generating higher-quality questions for UQA across diverse QA datasets and tasks. We also show that this method can be useful for creating a QA model with few-shot learning. + 2020.findings-emnlp.293 + + + Graph-to-Graph Transformer for Transition-based Dependency Parsing + AlirezaMohammadshahi + JamesHenderson + 3278–3289 + We propose the Graph2Graph Transformer architecture for conditioning on and predicting arbitrary graphs, and apply it to the challenging task of transition-based dependency parsing. After proposing two novel Transformer models of transition-based dependency parsing as strong baselines, we show that adding the proposed mechanisms for conditioning on and predicting graphs of Graph2Graph Transformer results in significant improvements, both with and without BERT pre-training. The novel baselines and their integration with Graph2Graph Transformer significantly outperform the state-of-the-art in traditional transition-based dependency parsing on both English Penn Treebank, and 13 languages of Universal Dependencies Treebanks. Graph2Graph Transformer can be integrated with many previous structured prediction methods, making it easy to apply to a wide range of NLP tasks. + 2020.findings-emnlp.294 + 2020.findings-emnlp.294.OptionalSupplementaryMaterial.zip + + + <fixed-case>WER</fixed-case> we are and <fixed-case>WER</fixed-case> we think we are + PiotrSzymański + PiotrŻelasko + MikolajMorzy + AdrianSzymczak + MarzenaŻyła-Hoppe + JoannaBanaszczak + LukaszAugustyniak + JanMizgajski + YishayCarmiel + 3290–3295 + Natural language processing of conversational speech requires the availability of high-quality transcripts. In this paper, we express our skepticism towards the recent reports of very low Word Error Rates (WERs) achieved by modern Automatic Speech Recognition (ASR) systems on benchmark datasets. 
We outline several problems with popular benchmarks and compare three state-of-the-art commercial ASR systems on an internal dataset of real-life spontaneous human conversations and HUB’05 public benchmark. We show that WERs are significantly higher than the best reported results. We formulate a set of guidelines which may aid in the creation of real-life, multi-domain datasets with high quality annotations for training and testing of robust ASR systems. + 2020.findings-emnlp.295 + + + <fixed-case>D</fixed-case>e<fixed-case>SMOG</fixed-case>: Detecting Stance in Media On Global Warming + YiweiLuo + DallasCard + DanJurafsky + 3296–3315 + Citing opinions is a powerful yet understudied strategy in argumentation. For example, an environmental activist might say, “Leading scientists agree that global warming is a serious concern,” framing a clause which affirms their own stance (“that global warming is serious”) as an opinion endorsed (“[scientists] agree”) by a reputable source (“leading”). In contrast, a global warming denier might frame the same clause as the opinion of an untrustworthy source with a predicate connoting doubt: “Mistaken scientists claim [...].” Our work studies opinion-framing in the global warming (GW) debate, an increasingly partisan issue that has received little attention in NLP. We introduce DeSMOG, a dataset of stance-labeled GW sentences, and train a BERT classifier to study novel aspects of argumentation in how different sides of a debate represent their own and each other’s opinions. From 56K news articles, we find that similar linguistic devices for self-affirming and opponent-doubting discourse are used across GW-accepting and skeptic media, though GW-skeptical media shows more opponent-doubt. We also find that authors often characterize sources as hypocritical, by ascribing opinions expressing the author’s own view to source entities known to publicly endorse the opposing view. 
We release our stance dataset, model, and lexicons of framing devices for future work on opinion-framing and the automatic detection of GW stance. + 2020.findings-emnlp.296 + + + A Novel Challenge Set for <fixed-case>H</fixed-case>ebrew Morphological Disambiguation and Diacritics Restoration + AviShmidman + JoshuaGuedalia + ShaltielShmidman + MosheKoppel + ReutTsarfaty + 3316–3326 + One of the primary tasks of morphological parsers is the disambiguation of homographs. Particularly difficult are cases of unbalanced ambiguity, where one of the possible analyses is far more frequent than the others. In such cases, there may not exist sufficient examples of the minority analyses in order to properly evaluate performance, nor to train effective classifiers. In this paper we address the issue of unbalanced morphological ambiguities in Hebrew. We offer a challenge set for Hebrew homographs — the first of its kind — containing substantial attestation of each analysis of 21 Hebrew homographs. We show that the current SOTA of Hebrew disambiguation performs poorly on cases of unbalanced ambiguity. Leveraging our new dataset, we achieve a new state-of-the-art for all 21 words, improving the overall average F1 score from 0.67 to 0.95. Our resulting annotated datasets are made publicly available for further research. + 2020.findings-emnlp.297 + + + Improve Transformer Models with Better Relative Position Embeddings + ZhihengHuang + DavisLiang + PengXu + BingXiang + 3327–3335 + The transformer model has demonstrated superior results on NLP tasks including machine translation and question answering. In this paper, we argue that the position information is not fully utilized in existing work. For example, the initial proposal of a sinusoid embedding is fixed and not learnable. In this paper, we first review the absolute position embeddings and existing relative position embedding methods. 
We then propose new methods to encourage increased interaction between query, key and relative position embeddings in the self-attention mechanism. Our most promising approach is a generalization of the absolute position embedding. Our method results in increased accuracy compared to previous approaches in absolute and relative position embeddings on the SQuAD1.1 dataset. In addition, we address the inductive property of whether a position embedding can be robust enough to handle long sequences. We demonstrate empirically that our relative embedding method can be reasonably generalized to and is robust in the inductive perspective. Finally, we show that our proposed method can be effectively and efficiently adopted as a near drop-in replacement for improving the accuracy of large models with little computational overhead. + 2020.findings-emnlp.298 + + + A Sentiment-Controllable Topic-to-Essay Generator with Topic Knowledge Graph + LinQiao + JianhaoYan + FandongMeng + ZhendongYang + JieZhou + 3336–3344 + Generating a vivid, novel, and diverse essay with only several given topic words is a promising task of natural language generation. Previous work on this task faces two challenging problems: neglect of sentiment beneath the text and insufficient utilization of topic-related knowledge. Therefore, we propose a novel Sentiment Controllable topic-to-essay generator with a Topic Knowledge Graph enhanced decoder, named SCTKG, which is based on the conditional variational auto-encoder (CVAE) framework. We firstly inject the sentiment information into the generator for controlling sentiment for each sentence, which leads to various generated essays. Then we design a Topic Knowledge Graph enhanced decoder. Unlike existing models that use knowledge entities separately, our model treats knowledge graph as a whole and encodes more structured, connected semantic information in the graph to generate a more relevant essay. 
Experimental results show that our SCTKG can generate sentiment controllable essays and outperform the state-of-the-art approach in terms of topic relevance, fluency, and diversity on both automatic and human evaluation. + 2020.findings-emnlp.299 + + + What-if <fixed-case>I</fixed-case> ask you to explain: Explaining the effects of perturbations in procedural text + DheerajRajagopal + NiketTandon + PeterClark + BhavanaDalvi + EduardHovy + 3345–3355 + Our goal is to explain the effects of perturbations in procedural text, e.g., given a passage describing a rabbit’s life cycle, explain why illness (the perturbation) may reduce the rabbit population (the effect). Although modern systems are able to solve the original prediction task well (e.g., illness results in less rabbits), the explanation task - identifying the causal chain of events from perturbation to effect - remains largely unaddressed, and is the goal of this research. We present QUARTET, a system that constructs such explanations from paragraphs, by modeling the explanation task as a multitask learning problem. QUARTET constructs explanations from the sentences in the procedural text, achieving ~18 points better on explanation accuracy compared to several strong baselines on a recent process comprehension benchmark. On an end task on this benchmark, we show a surprising finding that good explanations do not have to come at the expense of end task performance, in fact leading to a 7% F1 improvement over SOTA. + 2020.findings-emnlp.300 + + + <fixed-case>R</fixed-case>eal<fixed-case>T</fixed-case>oxicity<fixed-case>P</fixed-case>rompts: Evaluating Neural Toxic Degeneration in Language Models + SamuelGehman + SuchinGururangan + MaartenSap + YejinChoi + Noah A.Smith + 3356–3369 + Pretrained neural language models (LMs) are prone to generating racist, sexist, or otherwise toxic language which hinders their safe deployment. 
We investigate the extent to which pretrained LMs can be prompted to generate toxic language, and the effectiveness of controllable text generation algorithms at preventing such toxic degeneration. We create and release RealToxicityPrompts, a dataset of 100K naturally occurring, sentence-level prompts derived from a large corpus of English web text, paired with toxicity scores from a widely-used toxicity classifier. Using RealToxicityPrompts, we find that pretrained LMs can degenerate into toxic text even from seemingly innocuous prompts. We empirically assess several controllable generation methods, and find that while data- or compute-intensive methods (e.g., adaptive pretraining on non-toxic data) are more effective at steering away from toxicity than simpler solutions (e.g., banning “bad” words), no current method is failsafe against neural toxic degeneration. To pinpoint the potential cause of such persistent toxic degeneration, we analyze two web text corpora used to pretrain several LMs (including GPT-2; Radford et al., 2019), and find a significant amount of offensive, factually unreliable, and otherwise toxic content. Our work provides a test bed for evaluating toxic generations by LMs and stresses the need for better data selection processes for pretraining. + 2020.findings-emnlp.301 + + + Improving Event Duration Prediction via Time-aware Pre-training + ZonglinYang + XinyaDu + AlexanderRush + ClaireCardie + 3370–3378 + End-to-end models in NLP rarely encode external world knowledge about length of time. We introduce two effective models for duration prediction, which incorporate external knowledge by reading temporal-related news sentences (time-aware pre-training). Specifically, one model predicts the range/unit where the duration value falls in (R-PRED); and the other predicts the exact duration value (E-PRED). Our best model – E-PRED, substantially outperforms previous work, and captures duration information more accurately than R-PRED. 
We also demonstrate our models are capable of duration prediction in the unsupervised setting, outperforming the baselines. + 2020.findings-emnlp.302 + + + Composed Variational Natural Language Generation for Few-shot Intents + CongyingXia + CaimingXiong + PhilipYu + RichardSocher + 3379–3388 + In this paper, we focus on generating training examples for few-shot intents in the realistic imbalanced scenario. To build connections between existing many-shot intents and few-shot intents, we consider an intent as a combination of a domain and an action, and propose a composed variational natural language generator (CLANG), a transformer-based conditional variational autoencoder. CLANG utilizes two latent variables to represent the utterances corresponding to two different independent parts (domain and action) in the intent, and the latent variables are composed together to generate natural examples. Additionally, to improve the generator learning, we adopt the contrastive regularization loss that contrasts the in-class with the out-of-class utterance generation given the intent. To evaluate the quality of the generated utterances, experiments are conducted on the generalized few-shot intent detection task. Empirical results show that our proposed model achieves state-of-the-art performances on two real-world intent detection datasets. + 2020.findings-emnlp.303 + + + Document Reranking for Precision Medicine with Neural Matching and Faceted Summarization + JihoNoh + RamakanthKavuluru + 3389–3399 + Information retrieval (IR) for precision medicine (PM) often involves looking for multiple pieces of evidence that characterize a patient case. This typically includes at least the name of a condition and a genetic variation that applies to the patient. Other factors such as demographic attributes, comorbidities, and social determinants may also be pertinent. 
As such, the retrieval problem is often formulated as ad hoc search but with multiple facets (e.g., disease, mutation) that may need to be incorporated. In this paper, we present a document reranking approach that combines neural query-document matching and text summarization toward such retrieval scenarios. Our architecture builds on the basic BERT model with three specific components for reranking: (a) document-query matching, (b) keyword extraction, and (c) facet-conditioned abstractive summarization. The outcomes of (b) and (c) are used to essentially transform a candidate document into a concise summary that can be compared with the query at hand to compute a relevance score. Component (a) directly generates a matching score of a candidate document for a query. The full architecture benefits from the complementary potential of document-query matching and the novel document transformation approach based on summarization along PM facets. Evaluations using NIST’s TREC-PM track datasets (2017–2019) show that our model achieves state-of-the-art performance. To foster reproducibility, our code is made available here: https://github.com/bionlproc/text-summ-for-doc-retrieval. + 2020.findings-emnlp.304 + + + On the Importance of Adaptive Data Collection for Extremely Imbalanced Pairwise Tasks + StephenMussmann + RobinJia + PercyLiang + 3400–3413 + Many pairwise classification tasks, such as paraphrase detection and open-domain question answering, naturally have extreme label imbalance (e.g., 99.99% of examples are negatives). In contrast, many recent datasets heuristically choose examples to ensure label balance. We show that these heuristics lead to trained models that generalize poorly: State-of-the-art models trained on QQP and WikiQA each have only 2.4% average precision when evaluated on realistically imbalanced test data. 
We instead collect training data with active learning, using a BERT-based embedding model to efficiently retrieve uncertain points from a very large pool of unlabeled utterance pairs. By creating balanced training data with more informative negative examples, active learning greatly improves average precision to 32.5% on QQP and 20.1% on WikiQA. + 2020.findings-emnlp.305 + + + A Dual-Attention Network for Joint Named Entity Recognition and Sentence Classification of Adverse Drug Events + SusmithaWunnava + XiaoQin + TabassumKakar + XiangnanKong + ElkeRundensteiner + 3414–3423 + An adverse drug event (ADE) is an injury resulting from medical intervention related to a drug. Automatic ADE detection from text is either fine-grained (ADE entity recognition) or coarse-grained (ADE assertive sentence classification), with limited efforts leveraging inter-dependencies among the two granularities. We instead propose a multi-grained joint deep network to concurrently learn the ADE entity recognition and ADE sentence classification tasks. Our joint approach takes advantage of their symbiotic relationship, with a transfer of knowledge between the two levels of granularity. Our dual-attention mechanism constructs multiple distinct representations of a sentence that capture both task-specific and semantic information in the sentence, providing stronger emphasis on the key elements essential for sentence classification. Our model improves state-of-the-art F1-score for both tasks: (i) entity recognition of ADE words (12.5% increase) and (ii) ADE sentence classification (13.6% increase) on MADE 1.0 benchmark of EHR notes. + 2020.findings-emnlp.306 + + + <fixed-case>BERT</fixed-case>-k<fixed-case>NN</fixed-case>: Adding a k<fixed-case>NN</fixed-case> Search Component to Pretrained Language Models for Better <fixed-case>QA</fixed-case> + NoraKassner + HinrichSchütze + 3424–3430 + Khandelwal et al. (2020) use a k-nearest-neighbor (kNN) component to improve language model performance. 
We show that this idea is beneficial for open-domain question answering (QA). To improve the recall of facts encountered during training, we combine BERT (Devlin et al., 2019) with a traditional information retrieval step (IR) and a kNN search over a large datastore of an embedded text collection. Our contributions are as follows: i) BERT-kNN outperforms BERT on cloze-style QA by large margins without any further training. ii) We show that BERT often identifies the correct response category (e.g., US city), but only kNN recovers the factually correct answer (e.g., “Miami”). iii) Compared to BERT, BERT-kNN excels for rare facts. iv) BERT-kNN can easily handle facts not covered by BERT’s training set, e.g., recent events. + 2020.findings-emnlp.307 + + + Identifying spurious correlations for robust text classification + ZhaoWang + AronCulotta + 3431–3440 + The predictions of text classifiers are often driven by spurious correlations – e.g., the term “Spielberg” correlates with positively reviewed movies, even though the term itself does not semantically convey a positive sentiment. In this paper, we propose a method to distinguish spurious and genuine correlations in text classification. We treat this as a supervised classification problem, using features derived from treatment effect estimators to distinguish spurious correlations from “genuine” ones. Due to the generic nature of these features and their small dimensionality, we find that the approach works well even with limited training examples, and that it is possible to transport the word classifier to new domains. Experiments on four datasets (sentiment classification and toxicity detection) suggest that using this approach to inform feature selection also leads to more robust classification, as measured by improved worst-case accuracy on the samples affected by spurious correlations. 
+ 2020.findings-emnlp.308 + + + <fixed-case>H</fixed-case>o<fixed-case>V</fixed-case>er: A Dataset for Many-Hop Fact Extraction And Claim Verification + YichenJiang + ShikhaBordia + ZhengZhong + CharlesDognin + ManeeshSingh + MohitBansal + 3441–3460 + We introduce HoVer (HOppy VERification), a dataset for many-hop evidence extraction and fact verification. It challenges models to extract facts from several Wikipedia articles that are relevant to a claim and classify whether the claim is supported or not-supported by the facts. In HoVer, the claims require evidence to be extracted from as many as four English Wikipedia articles and embody reasoning graphs of diverse shapes. Moreover, most of the 3/4-hop claims are written in multiple sentences, which adds to the complexity of understanding long-range dependency relations such as coreference. We show that the performance of an existing state-of-the-art semantic-matching model degrades significantly on our dataset as the number of reasoning hops increases, hence demonstrating the necessity of many-hop reasoning to achieve strong results. We hope that the introduction of this challenging dataset and the accompanying evaluation task will encourage research in many-hop fact retrieval and information verification. + 2020.findings-emnlp.309 + + + Continual Learning for Natural Language Generation in Task-oriented Dialog Systems + FeiMi + LiangweiChen + MengjieZhao + MinlieHuang + BoiFaltings + 3461–3474 + Natural language generation (NLG) is an essential component of task-oriented dialog systems. Despite the recent success of neural approaches for NLG, they are typically developed in an offline manner for particular domains. To better fit real-life applications where new data come in a stream, we study NLG in a “continual learning” setting to expand its knowledge to new domains or functionalities incrementally. 
The major challenge towards this goal is catastrophic forgetting, meaning that a continually trained model tends to forget the knowledge it has learned before. To this end, we propose a method called ARPER (Adaptively Regularized Prioritized Exemplar Replay) by replaying prioritized historical exemplars, together with an adaptive regularization technique based on Elastic Weight Consolidation. Extensive experiments to continually learn new domains and intents are conducted on MultiWoZ-2.0 to benchmark ARPER with a wide range of techniques. Empirical results demonstrate that ARPER significantly outperforms other methods by effectively mitigating the detrimental catastrophic forgetting issue. + 2020.findings-emnlp.310 + + + <fixed-case>UNQOVER</fixed-case>ing Stereotypical Biases via Underspecified Questions + TaoLi + DanielKhashabi + TusharKhot + AshishSabharwal + VivekSrikumar + 3475–3489 + While language embeddings have been shown to have stereotyping biases, how these biases affect downstream question answering (QA) models remains unexplored. We present UNQOVER, a general framework to probe and quantify biases through underspecified questions. We show that a naive use of model scores can lead to incorrect bias estimates due to two forms of reasoning errors: positional dependence and question independence. We design a formalism that isolates the aforementioned errors. As case studies, we use this metric to analyze four important classes of stereotypes: gender, nationality, ethnicity, and religion. We probe five transformer-based QA models trained on two QA datasets, along with their underlying language models. Our broad study reveals that (1) all these models, with and without fine-tuning, have notable stereotyping biases in these classes; (2) larger models often have higher bias; and (3) the effect of fine-tuning on bias varies strongly with the dataset and the model size. 
+ 2020.findings-emnlp.311 + + + A Semantics-based Approach to Disclosure Classification in User-Generated Online Content + ChandanAkiti + AnnaSquicciarini + SarahRajtmajer + 3490–3499 + As users engage in public discourse, the rate of voluntarily disclosed personal information has seen a steep increase. So-called self-disclosure can result in a number of privacy concerns. Users are often unaware of the sheer amount of personal information they share across online forums, commentaries, and social networks, as well as the power of modern AI to synthesize and gain insights from this data. This paper presents an approach to detect emotional and informational self-disclosure in natural language. We hypothesize that identifying frame semantics can meaningfully support this task. Specifically, we use Semantic Role Labeling to identify the lexical units and their semantic roles that signal self-disclosure. Experimental results on Reddit data show the performance gain of our method when compared to standard text classification methods based on BiLSTM, and BERT. In addition to improved performance, our approach provides insights into the drivers of disclosure behaviors. + 2020.findings-emnlp.312 + 2020.findings-emnlp.312.OptionalSupplementaryMaterial.zip + + + Mining Knowledge for Natural Language Inference from <fixed-case>W</fixed-case>ikipedia Categories + MingdaChen + ZeweiChu + KarlStratos + KevinGimpel + 3500–3511 + Accurate lexical entailment (LE) and natural language inference (NLI) often require large quantities of costly annotations. To alleviate the need for labeled data, we introduce WikiNLI: a resource for improving model performance on NLI and LE tasks. It contains 428,899 pairs of phrases constructed from naturally annotated category hierarchies in Wikipedia. We show that we can improve strong baselines such as BERT and RoBERTa by pretraining them on WikiNLI and transferring the models on downstream tasks. 
We conduct systematic comparisons with phrases extracted from other knowledge bases such as WordNet and Wikidata to find that pretraining on WikiNLI gives the best performance. In addition, we construct WikiNLI in other languages, and show that pretraining on them improves performance on NLI tasks of corresponding languages. + 2020.findings-emnlp.313 + 2020.findings-emnlp.313.OptionalSupplementaryMaterial.zip + + + <fixed-case>OCNLI</fixed-case>: Original <fixed-case>C</fixed-case>hinese Natural Language Inference + HaiHu + KyleRichardson + LiangXu + LuLi + SandraKübler + LawrenceMoss + 3512–3526 + Despite the tremendous recent progress on natural language inference (NLI), driven largely by large-scale investment in new datasets (e.g., SNLI, MNLI) and advances in modeling, most progress has been limited to English due to a lack of reliable datasets for most of the world’s languages. In this paper, we present the first large-scale NLI dataset (consisting of ~56,000 annotated sentence pairs) for Chinese called the Original Chinese Natural Language Inference dataset (OCNLI). Unlike recent attempts at extending NLI to other languages, our dataset does not rely on any automatic translation or non-expert annotation. Instead, we elicit annotations from native speakers specializing in linguistics. We follow closely the annotation protocol used for MNLI, but create new strategies for eliciting diverse hypotheses. We establish several baseline results on our dataset using state-of-the-art pre-trained models for Chinese, and find even the best performing models to be far outpaced by human performance (~12% absolute performance gap), making it a challenging new resource that we hope will help to accelerate progress in Chinese NLU. To the best of our knowledge, this is the first human-elicited MNLI-style corpus for a non-English language. 
+ 2020.findings-emnlp.314 + + + Unsupervised Domain Adaptation for Cross-lingual Text Labeling + DejiaoZhang + RameshNallapati + HenghuiZhu + FengNan + CiceroNogueira dos Santos + KathleenMcKeown + BingXiang + 3527–3536 + Unsupervised domain adaptation addresses the problem of leveraging labeled data in a source domain to learn a well-performing model in a target domain where labels are unavailable. In this paper, we improve upon a recent theoretical work (Zhang et al., 2019b) and adopt the Margin Disparity Discrepancy (MDD) unsupervised domain adaptation algorithm to solve the cross-lingual text labeling problems. Experiments on cross-lingual document classification and NER demonstrate the proposed domain adaptation approach advances the state-of-the-art results by a large margin. Specifically, we improve MDD by efficiently optimizing the margin loss on the source domain via Virtual Adversarial Training (VAT). This bridges the gap between theory and the loss function used in the original work Zhang et al. (2019b), and thereby significantly boosts the performance. Our numerical results also indicate that VAT can remarkably improve the generalization performance of both domains for various domain adaptation approaches. + 2020.findings-emnlp.315 + + + Rethinking Supervised Learning and Reinforcement Learning in Task-Oriented Dialogue Systems + ZimingLi + JuliaKiseleva + Maartende Rijke + 3537–3546 + Dialogue policy learning for task-oriented dialogue systems has enjoyed great progress recently mostly through employing reinforcement learning methods. However, these approaches have become very sophisticated. It is time to re-evaluate it. Are we really making progress developing dialogue agents only based on reinforcement learning? We demonstrate how (1) traditional supervised learning together with (2) a simulator-free adversarial learning method can be used to achieve performance comparable to state-of-the-art reinforcement learning-based methods. 
First, we introduce a simple dialogue action decoder to predict the appropriate actions. Then, the traditional multi-label classification solution for dialogue policy learning is extended by adding dense layers to improve the dialogue agent performance. Finally, we employ the Gumbel-Softmax estimator to alternatively train the dialogue agent and the dialogue reward model without using reinforcement learning. Based on our extensive experimentation, we can conclude the proposed methods can achieve more stable and higher performance with fewer efforts, such as the domain knowledge required to design a user simulator and the intractable parameter tuning in reinforcement learning. Our main goal is not to beat RL with supervised learning, but to demonstrate the value of rethinking the role of reinforcement learning and supervised learning in optimizing task-oriented dialogue systems. + 2020.findings-emnlp.316 + + + What do we expect from Multiple-choice <fixed-case>QA</fixed-case> Systems? + KrunalShah + NitishGupta + DanRoth + 3547–3553 + The recent success of machine learning systems on various QA datasets could be interpreted as a significant improvement in models’ language understanding abilities. However, using various perturbations, multiple recent works have shown that good performance on a dataset might not indicate performance that correlates well with human’s expectations from models that “understand” language. In this work we consider a top performing model on several Multiple Choice Question Answering (MCQA) datasets, and evaluate it against a set of expectations one might have from such a model, using a series of zero-information perturbations of the model’s inputs. Our results show that the model clearly falls short of our expectations, and motivates a modified training approach that forces the model to better attend to the inputs. 
We show that the new training paradigm leads to a model that performs on par with the original model while better satisfying our expectations. + 2020.findings-emnlp.317 + + + Resource-Enhanced Neural Model for Event Argument Extraction + JieMa + ShuaiWang + RishitaAnubhai + MiguelBallesteros + YaserAl-Onaizan + 3554–3559 + Event argument extraction (EAE) aims to identify the arguments of an event and classify the roles that those arguments play. Despite great efforts made in prior work, there remain many challenges: (1) Data scarcity. (2) Capturing the long-range dependency, specifically, the connection between an event trigger and a distant event argument. (3) Integrating event trigger information into candidate argument representation. For (1), we explore using unlabeled data. For (2), we use Transformer that uses dependency parses to guide the attention mechanism. For (3), we propose a trigger-aware sequence encoder with several types of trigger-dependent sequence representations. We also support argument extraction either from text annotated with gold entities or from plain text. Experiments on the English ACE 2005 benchmark show that our approach achieves a new state-of-the-art. + 2020.findings-emnlp.318 + + + Improving Target-side Lexical Transfer in Multilingual Neural Machine Translation + LuyuGao + XinyiWang + GrahamNeubig + 3560–3566 + To improve the performance of Neural Machine Translation (NMT) for low-resource languages (LRL), one effective strategy is to leverage parallel data from a related high-resource language (HRL). However, multilingual data has been found more beneficial for NMT models that translate from the LRL to a target language than the ones that translate into the LRLs. In this paper, we aim to improve the effectiveness of multilingual transfer for NMT models that translate into the LRL, by designing a better decoder word embedding. 
Extending upon a general-purpose multilingual encoding method Soft Decoupled Encoding (Wang et al., 2019), we propose DecSDE, an efficient character n-gram based embedding specifically designed for the NMT decoder. Our experiments show that DecSDE leads to consistent gains of up to 1.8 BLEU on translation from English to four different languages. + 2020.findings-emnlp.319 + + + Accurate Polyglot Semantic Parsing With <fixed-case>DAG</fixed-case> Grammars + FedericoFancellu + ÁkosKádár + RanZhang + AfsanehFazly + 3567–3580 + Semantic parses are directed acyclic graphs (DAGs), but in practice most parsers treat them as strings or trees, mainly because models that predict graphs are far less understood. This simplification, however, comes at a cost: there is no guarantee that the output is a well-formed graph. A recent work by Fancellu et al. (2019) addressed this problem by proposing a graph-aware sequence model that utilizes a DAG grammar to guide graph generation. We significantly improve upon this work, by proposing a simpler architecture as well as more efficient training and inference algorithms that can always guarantee the well-formedness of the generated graphs. Importantly, unlike Fancellu et al., our model does not require language-specific features, and hence can harness the inherent ability of DAG-grammar parsing in multilingual settings. We perform monolingual as well as multilingual experiments on the Parallel Meaning Bank (Abzianidze et al., 2017). Our parser outperforms previous graph-aware models by a large margin, and closes the performance gap between string-based and DAG-grammar parsing. + 2020.findings-emnlp.320 + + + Approximation of Response Knowledge Retrieval in Knowledge-grounded Dialogue Generation + WenZheng + NatasaMilic-Frayling + KeZhou + 3581–3591 + This paper is concerned with improving dialogue generation models through injection of knowledge, e.g., content relevant to the post that can increase the quality of responses. 
Past research extends the training of the generative models by incorporating statistical properties of posts, responses and related knowledge, without explicitly assessing the knowledge quality. In our work, we demonstrate the importance of knowledge relevance and adopt a two-phase approach. We first apply a novel method, Transformer & Post based Posterior Approximation (TPPA) to select knowledge, and then use the Transformer with Expanded Decoder (TED) model to generate responses from both the post and the knowledge. TPPA method processes posts, post related knowledge, and response related knowledge at both word and sentence level. Our experiments with the TED generative model demonstrate the effectiveness of TPPA as it outperforms a set of strong baseline models. Our TPPA method is extendable and supports further optimization of knowledge retrieval and injection. + 2020.findings-emnlp.321 + + + Evaluating Factuality in Generation with Dependency-level Entailment + TanyaGoyal + GregDurrett + 3592–3603 + Despite significant progress in text generation models, a serious limitation is their tendency to produce text that is factually inconsistent with information in the input. Recent work has studied whether textual entailment systems can be used to identify factual errors; however, these sentence-level entailment models are trained to solve a different problem than generation filtering and they do not localize which part of a generation is non-factual. In this paper, we propose a new formulation of entailment that decomposes it at the level of dependency arcs. Rather than focusing on aggregate decisions, we instead ask whether the semantic relationship manifested by individual dependency arcs in the generated output is supported by the input. Human judgments on this task are difficult to obtain; we therefore propose a method to automatically create data based on existing entailment or paraphrase corpora. 
Experiments show that our dependency arc entailment model trained on this data can identify factual inconsistencies in paraphrasing and summarization better than sentence-level methods or those based on question generation, while additionally localizing the erroneous parts of the generation. + 2020.findings-emnlp.322 + + + Cross-Lingual Text Classification with Minimal Resources by Transferring a Sparse Teacher + GiannisKaramanolakis + DanielHsu + LuisGravano + 3604–3622 + Cross-lingual text classification alleviates the need for manually labeled documents in a target language by leveraging labeled documents from other languages. Existing approaches for transferring supervision across languages require expensive cross-lingual resources, such as parallel corpora, while less expensive cross-lingual representation learning approaches train classifiers without target labeled documents. In this work, we propose a cross-lingual teacher-student method, CLTS, that generates “weak” supervision in the target language using minimal cross-lingual resources, in the form of a small number of word translations. Given a limited translation budget, CLTS extracts and transfers only the most important task-specific seed words across languages and initializes a teacher classifier based on the translated seed words. Then, CLTS iteratively trains a more powerful student that also exploits the context of the seed words in unlabeled target documents and outperforms the teacher. CLTS is simple and surprisingly effective in 18 diverse languages: by transferring just 20 seed words, even a bag-of-words logistic regression student outperforms state-of-the-art cross-lingual methods (e.g., based on multilingual BERT). Moreover, CLTS can accommodate any type of student classifier: leveraging a monolingual BERT student leads to further improvements and outperforms even more expensive approaches by up to 12% in accuracy. 
Finally, CLTS addresses emerging tasks in low-resource languages using just a small number of word translations. + 2020.findings-emnlp.323 + + + A Multi-Persona Chatbot for Hotline Counselor Training + OriannaDemasi + YuLi + ZhouYu + 3623–3636 + Suicide prevention hotline counselors aid individuals during difficult times through millions of calls and chats. A chatbot cannot safely replace a counselor, but we explore whether a chatbot can be developed to help train human counselors. Such a system needs to simulate intimate situations across multiple practice sessions. Open-domain dialogue systems frequently suffer from generic responses that do not characterize personal stories, so we look to infuse conversations with persona information by mimicking prototype conversations. Towards building a “Crisisbot” hotline visitor simulation, we propose a counseling strategy annotation scheme and a multi-task framework that leverages these counselor strategies to retrieve similar examples, generate diverse sub-utterances, and interleave prototype and generated sub-utterances into complex responses. We evaluate this framework with crowdworkers and experienced hotline counselors. The framework considerably increases response diversity and specificity, with limited impact to coherence. Our results also show a considerable discrepancy between crowdworker and counselor judgements, which emphasizes the importance of including target populations in system development and evaluation. + 2020.findings-emnlp.324 + + + Narrative Text Generation with a Latent Discrete Plan + HarshJhamtani + TaylorBerg-Kirkpatrick + 3637–3650 + Past work on story generation has demonstrated the usefulness of conditioning on a generation plan to generate coherent stories. However, these approaches have used heuristics or off-the-shelf models to first tag training stories with the desired type of plan, and then train generation models in a supervised fashion. 
In this paper, we propose a deep latent variable model that first samples a sequence of anchor words, one per sentence in the story, as part of its generative process. During training, our model treats the sequence of anchor words as a latent variable and attempts to induce anchoring sequences that help guide generation in an unsupervised fashion. We conduct experiments with several types of sentence decoder distributions – left-to-right and non-monotonic, with different degrees of restriction. Further, since we use amortized variational inference to train our model, we introduce two corresponding types of inference network for predicting the posterior on anchor words. We conduct human evaluations which demonstrate that the stories produced by our model are rated better in comparison with baselines which do not consider story plans, and are similar or better in quality relative to baselines which use external supervision for plans. Additionally, the proposed model gets favorable scores when evaluated on perplexity, diversity, and control of story via discrete plan. + 2020.findings-emnlp.325 + + + Graph Transformer Networks with Syntactic and Semantic Structures for Event Argument Extraction + AmirPouran Ben Veyseh + Tuan NgoNguyen + Thien HuuNguyen + 3651–3661 + The goal of Event Argument Extraction (EAE) is to find the role of each entity mention for a given event trigger word. It has been shown in the previous works that the syntactic structures of the sentences are helpful for the deep learning models for EAE. However, a major problem in such prior works is that they fail to exploit the semantic structures of the sentences to induce effective representations for EAE. Consequently, in this work, we propose a novel model for EAE that exploits both syntactic and semantic structures of the sentences with the Graph Transformer Networks (GTNs) to learn more effective sentence structures for EAE. 
In addition, we introduce a novel inductive bias based on information bottleneck to improve generalization of the EAE models. Extensive experiments are performed to demonstrate the benefits of the proposed model, leading to state-of-the-art performance for EAE on standard datasets. + 2020.findings-emnlp.326 + + + The Box is in the Pen: Evaluating Commonsense Reasoning in Neural Machine Translation + JieHe + TaoWang + DeyiXiong + QunLiu + 3662–3672 + Does neural machine translation yield translations that are congenial with common sense? In this paper, we present a test suite to evaluate the commonsense reasoning capability of neural machine translation. The test suite consists of three test sets, covering lexical and contextless/contextual syntactic ambiguity that requires commonsense knowledge to resolve. We manually create 1,200 triples, each of which contain a source sentence and two contrastive translations, involving 7 different common sense types. Language models pretrained on large-scale corpora, such as BERT, GPT-2, achieve a commonsense reasoning accuracy of lower than 72% on target translations of this test suite. We conduct extensive experiments on the test suite to evaluate commonsense reasoning in neural machine translation and investigate factors that have impact on this capability. Our experiments and analyses demonstrate that neural machine translation performs poorly on commonsense reasoning of the three ambiguity types in terms of both reasoning accuracy (≤ 60.1%) and reasoning consistency (≤ 31%). We will release our test suite as a machine translation commonsense reasoning testbed to promote future work in this direction. + 2020.findings-emnlp.327 + + + Using Visual Feature Space as a Pivot Across Languages + ZiyanYang + LeticiaPinto-Alva + FranckDernoncourt + VicenteOrdonez + 3673–3678 + Our work aims to leverage visual feature space to pass information across languages. 
We show that models trained to generate textual captions in more than one language conditioned on an input image can leverage their jointly trained feature space during inference to pivot across languages. We particularly demonstrate improved quality on a caption generated from an input image, by leveraging a caption in a second language. More importantly, we demonstrate that even without conditioning on any visual input, the model demonstrates to have learned implicitly to perform to some extent machine translation from one language to another in their shared visual feature space. We show results in German-English, and Japanese-English language pairs that pave the way for using the visual world to learn a common representation for language. + 2020.findings-emnlp.328 + + + An Empirical Study of Cross-Dataset Evaluation for Neural Summarization Systems + YiranChen + PengfeiLiu + MingZhong + Zi-YiDou + DanqingWang + XipengQiu + XuanjingHuang + 3679–3691 + Neural network-based models augmented with unsupervised pre-trained knowledge have achieved impressive performance on text summarization. However, most existing evaluation methods are limited to an in-domain setting, where summarizers are trained and evaluated on the same dataset. We argue that this approach can narrow our understanding of the generalization ability for different summarization systems. In this paper, we perform an in-depth analysis of characteristics of different datasets and investigate the performance of different summarization models under a cross-dataset setting, in which a summarizer trained on one corpus will be evaluated on a range of out-of-domain corpora. A comprehensive study of 11 representative summarization systems on 5 datasets from different domains reveals the effect of model architectures and generation ways (i.e. abstractive and extractive) on model generalization ability. Further, experimental results shed light on the limitations of existing summarizers. 
Brief introduction and supplementary code can be found in https://github.com/zide05/CDEvalSumm. + 2020.findings-emnlp.329 + 2020.findings-emnlp.329.OptionalSupplementaryMaterial.zip + + + Attending to Long-Distance Document Context for Sequence Labeling + MatthewJörke + JonGillick + MatthewSims + DavidBamman + 3692–3704 + We present in this work a method for incorporating global context in long documents when making local decisions in sequence labeling problems like NER. Inspired by work in featurized log-linear models (Chieu and Ng, 2002; Sutton and McCallum, 2004), our model learns to attend to multiple mentions of the same word type in generating a representation for each token in context, extending that work to learning representations that can be incorporated into modern neural models. Attending to broader context at test time provides complementary information to pretraining (Gururangan et al., 2020), yields strong gains over equivalently parameterized models lacking such context, and performs best at recognizing entities with high TF-IDF scores (i.e., those that are important within a document). + 2020.findings-emnlp.330 + + + Global Bootstrapping Neural Network for Entity Set Expansion + LingyongYan + XianpeiHan + BenHe + LeSun + 3705–3714 + Bootstrapping for entity set expansion (ESE) has been studied for a long period, which expands new entities using only a few seed entities as supervision. Recent end-to-end bootstrapping approaches have shown their advantages in information capturing and bootstrapping process modeling. However, due to the sparse supervision problem, previous end-to-end methods often only leverage information from near neighborhoods (local semantics) rather than those propagated from the co-occurrence structure of the whole corpus (global semantics). To address this issue, this paper proposes Global Bootstrapping Network (GBN) with the “pre-training and fine-tuning” strategies for effective learning. 
Specifically, it contains a global-sighted encoder to capture and encode both local and global semantics into entity embedding, and an attention-guided decoder to sequentially expand new entities based on these embeddings. The experimental results show that the GBN learned by “pre-training and fine-tuning” strategies achieves state-of-the-art performance on two bootstrapping datasets. + 2020.findings-emnlp.331 + + + Document Classification for <fixed-case>COVID</fixed-case>-19 Literature + BernalJimenez Gutierrez + JuchengZeng + DongdongZhang + PingZhang + YuSu + 3715–3722 + The global pandemic has made it more important than ever to quickly and accurately retrieve relevant scientific literature for effective consumption by researchers in a wide range of fields. We provide an analysis of several multi-label document classification models on the LitCovid dataset, a growing collection of 23,000 research papers regarding the novel 2019 coronavirus. We find that pre-trained language models fine-tuned on this dataset outperform all other baselines and that BioBERT surpasses the others by a small margin with micro-F1 and accuracy scores of around 86% and 75% respectively on the test set. We evaluate the data efficiency and generalizability of these models as essential features of any system prepared to deal with an urgent situation like the current health crisis. We perform a data ablation study to determine how important article titles are for achieving reasonable performance on this dataset. Finally, we explore 50 errors made by the best performing models on LitCovid documents and find that they often (1) correlate certain labels too closely together and (2) fail to focus on discriminative sections of the articles; both of which are important issues to address in future work. Both data and code are available on GitHub. 
+ 2020.findings-emnlp.332 + + + Adversarial Augmentation Policy Search for Domain and Cross-Lingual Generalization in Reading Comprehension + AdyashaMaharana + MohitBansal + 3723–3738 + Reading comprehension models often overfit to nuances of training datasets and fail at adversarial evaluation. Training with adversarially augmented dataset improves robustness against those adversarial attacks but hurts generalization of the models. In this work, we present several effective adversaries and automated data augmentation policy search methods with the goal of making reading comprehension models more robust to adversarial evaluation, but also improving generalization to the source domain as well as new domains and languages. We first propose three new methods for generating QA adversaries, that introduce multiple points of confusion within the context, show dependence on insertion location of the distractor, and reveal the compounding effect of mixing adversarial strategies with syntactic and semantic paraphrasing methods. Next, we find that augmenting the training datasets with uniformly sampled adversaries improves robustness to the adversarial attacks but leads to decline in performance on the original unaugmented dataset. We address this issue via RL and more efficient Bayesian policy search methods for automatically learning the best augmentation policy combinations of the transformation probability for each adversary in a large search space. Using these learned policies, we show that adversarial training can lead to significant improvements in in-domain, out-of-domain, and cross-lingual (German, Russian, Turkish) generalization. 
+ 2020.findings-emnlp.333 + + + Denoising Multi-Source Weak Supervision for Neural Text Classification + WendiRen + YinghaoLi + HantingSu + DavidKartchner + CassieMitchell + ChaoZhang + 3739–3754 + We study the problem of learning neural text classifiers without using any labeled data, but only easy-to-provide rules as multiple weak supervision sources. This problem is challenging because rule-induced weak labels are often noisy and incomplete. To address these two challenges, we design a label denoiser, which estimates the source reliability using a conditional soft attention mechanism and then reduces label noise by aggregating rule-annotated weak labels. The denoised pseudo labels then supervise a neural classifier to predict soft labels for unmatched samples, which address the rule coverage issue. We evaluate our model on five benchmarks for sentiment, topic, and relation classifications. The results show that our model outperforms state-of-the-art weakly-supervised and semi-supervised methods consistently, and achieves comparable performance with fully-supervised methods even without any labeled data. Our code can be found at https://github.com/weakrules/Denoise-multi-weak-sources. + 2020.findings-emnlp.334 + + + Dr. Summarize: Global Summarization of Medical Dialogue by Exploiting Local Structures. + AnirudhJoshi + NamitKatariya + XavierAmatriain + AnithaKannan + 3755–3763 + Understanding a medical conversation between a patient and a physician poses unique natural language understanding challenge since it combines elements of standard open-ended conversation with very domain-specific elements that require expertise and medical knowledge. Summarization of medical conversations is a particularly important aspect of medical conversation understanding since it addresses a very real need in medical practice: capturing the most important aspects of a medical encounter so that they can be used for medical decision making and subsequent follow ups. 
In this paper we present a novel approach to medical conversation summarization that leverages the unique and independent local structures created when gathering a patient’s medical history. Our approach is a variation of the pointer generator network where we introduce a penalty on the generator distribution, and we explicitly model negations. The model also captures important properties of medical conversations such as medical knowledge coming from standardized medical ontologies better than when those concepts are introduced explicitly. Through evaluation by doctors, we show that our approach is preferred on twice the number of summaries to the baseline pointer generator model and captures most or all of the information in 80% of the conversations making it a realistic alternative to costly manual summarization by medical experts. + 2020.findings-emnlp.335 + 2020.findings-emnlp.335.OptionalSupplementaryMaterial.pdf + + + Generating Accurate <fixed-case>EHR</fixed-case> Assessment from Medical Graph + ZhichaoYang + HongYu + 3764–3773 + One of the fundamental goals of artificial intelligence is to build computer-based expert systems. Inferring clinical diagnoses to generate a clinical assessment during a patient encounter is a crucial step towards building a medical diagnostic system. Previous works were mainly based on either medical domain-specific knowledge, or patients’ prior diagnoses and clinical encounters. In this paper, we propose a novel model for automated clinical assessment generation (MCAG). MCAG is built on an innovative graph neural network, where rich clinical knowledge is incorporated into an end-to-end corpus-learning system. Our evaluation results against physician generated gold standard show that MCAG significantly improves the BLEU and rouge score compared with competitive baseline models. Further, physicians’ evaluation showed that MCAG could generate high-quality assessments. 
+ 2020.findings-emnlp.336 + 2020.findings-emnlp.336.OptionalSupplementaryMaterial.zip + + + Do Models of Mental Health Based on Social Media Data Generalize? + KeithHarrigian + CarlosAguirre + MarkDredze + 3774–3788 + Proxy-based methods for annotating mental health status in social media have grown popular in computational research due to their ability to gather large training samples. However, an emerging body of literature has raised new concerns regarding the validity of these types of methods for use in clinical applications. To further understand the robustness of distantly supervised mental health models, we explore the generalization ability of machine learning classifiers trained to detect depression in individuals across multiple social media platforms. Our experiments not only reveal that substantial loss occurs when transferring between platforms, but also that there exist several unreliable confounding factors that may enable researchers to overestimate classification performance. Based on these results, we enumerate recommendations for future mental health dataset construction. + 2020.findings-emnlp.337 + + + Context Analysis for Pre-trained Masked Language Models + Yi-AnLai + GarimaLalwani + YiZhang + 3789–3804 + Pre-trained language models that learn contextualized word representations from a large un-annotated corpus have become a standard component for many state-of-the-art NLP systems. Despite their successful applications in various downstream NLP tasks, the extent of contextual impact on the word representation has not been explored. In this paper, we present a detailed analysis of contextual impact in Transformer- and BiLSTM-based masked language models. We follow two different approaches to evaluate the impact of context: a masking based approach that is architecture agnostic, and a gradient based approach that requires back-propagation through networks. 
The findings suggest significant differences on the contextual impact between the two model architectures. Through further breakdown of analysis by syntactic categories, we find the contextual impact in Transformer-based MLM aligns well with linguistic intuition. We further explore the Transformer attention pruning based on our findings in contextual analysis. + 2020.findings-emnlp.338 + + + Controllable Text Generation with Focused Variation + LeiShu + AlexandrosPapangelis + Yi-ChiaWang + GokhanTur + HuXu + ZhalehFeizollahi + BingLiu + PieroMolino + 3805–3817 + This work introduces Focused-Variation Network (FVN), a novel model to control language generation. The main problems in previous controlled language generation models range from the difficulty of generating text according to the given attributes, to the lack of diversity of the generated texts. FVN addresses these issues by learning disjoint discrete latent spaces for each attribute inside codebooks, which allows for both controllability and diversity, while at the same time generating fluent text. We evaluate FVN on two text generation datasets with annotated content and style, and show state-of-the-art performance as assessed by automatic and human evaluations. + 2020.findings-emnlp.339 + + + Modeling Preconditions in Text with a Crowd-sourced Dataset + HeeyoungKwon + MahnazKoupaee + PratyushSingh + GargiSawhney + AnmolShukla + Keerthi KumarKallur + NathanaelChambers + NiranjanBalasubramanian + 3818–3828 + Preconditions provide a form of logical connection between events that explains why some events occur together and information that is complementary to the more widely studied relations such as causation, temporal ordering, entailment, and discourse relations. Modeling preconditions in text has been hampered in part due to the lack of large scale labeled data grounded in text. 
This paper introduces PeKo, a crowd-sourced annotation of preconditions between event pairs in newswire, an order of magnitude larger than prior text annotations. To complement this new corpus, we also introduce two challenge tasks aimed at modeling preconditions: (i) Precondition Identification – a standard classification task defined over pairs of event mentions, and (ii) Precondition Generation – a generative task aimed at testing a more general ability to reason about a given event. Evaluation on both tasks shows that modeling preconditions is challenging even for today’s large language models (LM). This suggests that precondition knowledge is not easily accessible in LM-derived representations alone. Our generation results show that fine-tuning an LM on PeKo yields better conditional relations than when trained on raw text or temporally-ordered corpora. + 2020.findings-emnlp.340 + 2020.findings-emnlp.340.OptionalSupplementaryMaterial.zip + + + Reevaluating Adversarial Examples in Natural Language + JohnMorris + EliLifland + JackLanchantin + YangfengJi + YanjunQi + 3829–3839 + State-of-the-art attacks on NLP models lack a shared definition of what constitutes a successful attack. We distill ideas from past work into a unified framework: a successful natural language adversarial example is a perturbation that fools the model and follows some linguistic constraints. We then analyze the outputs of two state-of-the-art synonym substitution attacks. We find that their perturbations often do not preserve semantics, and 38% introduce grammatical errors. Human surveys reveal that to successfully preserve semantics, we need to significantly increase the minimum cosine similarities between the embeddings of swapped words and between the sentence encodings of original and perturbed sentences. With constraints adjusted to better preserve semantics and grammaticality, the attack success rate drops by over 70 percentage points.
+ 2020.findings-emnlp.341 + 2020.findings-emnlp.341.OptionalSupplementaryMaterial.pdf + + + Question Answering with Long Multiple-Span Answers + MingZhu + AmanAhuja + Da-ChengJuan + WeiWei + Chandan K.Reddy + 3840–3849 + Answering questions in many real-world applications often requires complex and precise information excerpted from texts spanned across a long document. However, currently no such annotated dataset is publicly available, which hinders the development of neural question-answering (QA) systems. To this end, we present MASH-QA, a Multiple Answer Spans Healthcare Question Answering dataset from the consumer health domain, where answers may need to be excerpted from multiple, non-consecutive parts of text spanned across a long document. We also propose MultiCo, a neural architecture that is able to capture the relevance among multiple answer spans, by using a query-based contextualized sentence selection approach, for forming the answer to the given question. We also demonstrate that conventional QA models are not suitable for this type of task and perform poorly in this setting. Extensive experiments are conducted, and the experimental results confirm the proposed model significantly outperforms the state-of-the-art QA models in this multi-span QA setting. + 2020.findings-emnlp.342 + + + Inserting Information Bottleneck for Attribution in Transformers + ZhiyingJiang + RaphaelTang + JiXin + JimmyLin + 3850–3857 + Pretrained transformers achieve the state of the art across tasks in natural language processing, motivating researchers to investigate their inner mechanisms. One common direction is to understand what features are important for prediction. In this paper, we apply information bottlenecks to analyze the attribution of each feature for prediction on a black-box model. We use BERT as the example and evaluate our approach both quantitatively and qualitatively. 
We show the effectiveness of our method in terms of attribution and the ability to provide insight into how information flows through layers. We demonstrate that our technique outperforms two competitive methods in degradation tests on four datasets. Code is available at https://github.com/bazingagin/IBA. + 2020.findings-emnlp.343 + + + Event-Related Bias Removal for Real-time Disaster Events + SalvadorMedina maza + EvangeliaSpiliopoulou + EduardHovy + AlexanderHauptmann + 3858–3868 + Social media has become an important tool to share information about crisis events such as natural disasters and mass attacks. Detecting actionable posts that contain useful information requires rapid analysis of huge volumes of data in real-time. This poses a complex problem due to the large amount of posts that do not contain any actionable information. Furthermore, the classification of information in real-time systems requires training on out-of-domain data, as we do not have any data from a new emerging crisis. Prior work focuses on models pre-trained on similar event types. However, those models capture unnecessary event-specific biases, like the location of the event, which affect the generalizability and performance of the classifiers on new unseen data from an emerging new event. In our work, we train an adversarial neural model to remove latent event-specific biases and improve the performance on tweet importance classification. + 2020.findings-emnlp.344 + + + It’s not a Non-Issue: Negation as a Source of Error in Machine Translation + Md MosharafHossain + AntoniosAnastasopoulos + EduardoBlanco + AlexisPalmer + 3869–3885 + As machine translation (MT) systems progress at a rapid pace, questions of their adequacy linger. In this study we focus on negation, a universal, core property of human language that significantly affects the semantics of an utterance. 
We investigate whether translating negation is an issue for modern MT systems using 17 translation directions as test bed. Through thorough analysis, we find that indeed the presence of negation can significantly impact downstream quality, in some cases resulting in quality reductions of more than 60%. We also provide a linguistically motivated analysis that directly explains the majority of our findings. We release our annotations and code to replicate our analysis here: https://github.com/mosharafhossain/negation-mt. + 2020.findings-emnlp.345 + + + Incremental Text-to-Speech Synthesis with Prefix-to-Prefix Framework + MingboMa + BaigongZheng + KaiboLiu + RenjieZheng + HairongLiu + KainanPeng + KennethChurch + LiangHuang + 3886–3896 + Text-to-speech synthesis (TTS) has witnessed rapid progress in recent years, where neural methods became capable of producing audios with high naturalness. However, these efforts still suffer from two types of latencies: (a) the computational latency (synthesizing time), which grows linearly with the sentence length, and (b) the input latency in scenarios where the input text is incrementally available (such as in simultaneous translation, dialog generation, and assistive technologies). To reduce these latencies, we propose a neural incremental TTS approach using the prefix-to-prefix framework from simultaneous translation. We synthesize speech in an online fashion, playing a segment of audio while generating the next, resulting in an O(1) rather than O(n) latency. Experiments on English and Chinese TTS show that our approach achieves similar speech naturalness compared to full sentence TTS, but only with a constant (1-2 words) latency. 
+ 2020.findings-emnlp.346 + + + Joint Turn and Dialogue level User Satisfaction Estimation on Multi-Domain Conversations + Praveen KumarBodigutla + AdityaTiwari + SpyrosMatsoukas + JosepValls-Vargas + LazarosPolymenakos + 3897–3909 + Dialogue level quality estimation is vital for optimizing data driven dialogue management. Current automated methods to estimate turn and dialogue level user satisfaction employ hand-crafted features and rely on complex annotation schemes, which reduce the generalizability of the trained models. We propose a novel user satisfaction estimation approach which minimizes an adaptive multi-task loss function in order to jointly predict turn-level Response Quality labels provided by experts and explicit dialogue-level ratings provided by end users. The proposed BiLSTM based deep neural net model automatically weighs each turn’s contribution towards the estimated dialogue-level rating, implicitly encodes temporal dependencies, and removes the need to hand-craft features. On dialogues sampled from 28 Alexa domains, two dialogue systems and three user groups, the joint dialogue-level satisfaction estimation model achieved up to an absolute 27% (0.43 -> 0.70) and 7% (0.63 -> 0.70) improvement in linear correlation performance over baseline deep neural net and benchmark Gradient boosting regression models, respectively. + 2020.findings-emnlp.347 + + + <fixed-case>A</fixed-case>rra<fixed-case>M</fixed-case>on: A Joint Navigation-Assembly Instruction Interpretation Task in Dynamic Environments + HyounghunKim + AbhaysinhZala + GrahamBurri + HaoTan + MohitBansal + 3910–3927 + For embodied agents, navigation is an important ability but not an isolated goal. Agents are also expected to perform specific tasks after reaching the target location, such as picking up objects and assembling them into a particular arrangement.
We combine Vision-and-Language Navigation, assembling of collected objects, and object referring expression comprehension, to create a novel joint navigation-and-assembly task, named ARRAMON. During this task, the agent (similar to a PokeMON GO player) is asked to find and collect different target objects one-by-one by navigating based on natural language (English) instructions in a complex, realistic outdoor environment, but then also ARRAnge the collected objects part-by-part in an egocentric grid-layout environment. To support this task, we implement a 3D dynamic environment simulator and collect a dataset with human-written navigation and assembling instructions, and the corresponding ground truth trajectories. We also filter the collected instructions via a verification stage, leading to a total of 7.7K task instances (30.8K instructions and paths). We present results for several baseline models (integrated and biased) and metrics (nDTW, CTC, rPOD, and PTC), and the large model-human performance gap demonstrates that our task is challenging and presents a wide scope for future work. + 2020.findings-emnlp.348 + + + Fluent and Low-latency Simultaneous Speech-to-Speech Translation with Self-adaptive Training + RenjieZheng + MingboMa + BaigongZheng + KaiboLiu + JiahongYuan + KennethChurch + LiangHuang + 3928–3937 + Simultaneous speech-to-speech translation is an extremely challenging but widely useful scenario that aims to generate target-language speech only a few seconds behind the source-language speech. In addition, we have to continuously translate a speech of multiple sentences, but all recent solutions merely focus on the single-sentence scenario. As a result, current approaches will accumulate more and more latencies in later sentences when the speaker talks faster and introduce unnatural pauses into translated speech when the speaker talks slower.
To overcome these issues, we propose Self-Adaptive Translation which flexibly adjusts the length of translations to accommodate different source speech rates. At similar levels of translation quality (as measured by BLEU), our method generates more fluent target speech with lower latency than the baseline, in both Zh<->En directions. + 2020.findings-emnlp.349 + + + Towards Context-Aware Code Comment Generation + XiaohanYu + QuzheHuang + ZhengWang + YansongFeng + DongyanZhao + 3938–3947 + Code comments are vital for software maintenance and comprehension, but many software projects suffer from the lack of meaningful and up-to-date comments in practice. This paper presents a novel approach to automatically generate code comments at a function level by targeting object-oriented programming languages. Unlike prior work that only uses information locally available within the target function, our approach leverages broader contextual information by considering all other functions of the same class. To propagate and integrate information beyond the scope of the target function, we design a novel learning framework based on the bidirectional gated recurrent unit and a graph attention network with a pointer mechanism. We apply our approach to produce code comments for Java methods and compare it against four strong baseline methods. Experimental results show that our approach outperforms most methods by a large margin and achieves a comparable result with the state-of-the-art method. + 2020.findings-emnlp.350 + 2020.findings-emnlp.350.OptionalSupplementaryMaterial.zip + + + <fixed-case>MCMH</fixed-case>: Learning Multi-Chain Multi-Hop Rules for Knowledge Graph Reasoning + LuZhang + MoYu + TianGao + YueYu + 3948–3954 + Multi-hop reasoning approaches over knowledge graphs infer a missing relationship between entities with a multi-hop rule, which corresponds to a chain of relationships.
We extend existing works to consider a generalized form of multi-hop rules, where each rule is a set of relation chains. To learn such generalized rules efficiently, we propose a two-step approach that first selects a small set of relation chains as a rule and then evaluates the confidence of the target relationship by jointly scoring the selected chains. A game-theoretical framework is proposed to this end to simultaneously optimize the rule selection and prediction steps. Empirical results show that our multi-chain multi-hop (MCMH) rules result in superior results compared to the standard single-chain approaches, justifying both our formulation of generalized rules and the effectiveness of the proposed learning framework. + 2020.findings-emnlp.351 + 2020.findings-emnlp.351.OptionalSupplementaryMaterial.pdf + + + Finding the Optimal Vocabulary Size for Neural Machine Translation + ThammeGowda + JonathanMay + 3955–3964 + We cast neural machine translation (NMT) as a classification task in an autoregressive setting and analyze the limitations of both classification and autoregression components. Classifiers are known to perform better with balanced class distributions during training. Since the Zipfian nature of languages causes imbalanced classes, we explore its effect on NMT. We analyze the effect of various vocabulary sizes on NMT performance on multiple languages with many data sizes, and reveal an explanation for why certain vocabulary sizes are better than others. + 2020.findings-emnlp.352 + 2020.findings-emnlp.352.OptionalSupplementaryMaterial.zip + + + Weakly- and Semi-supervised Evidence Extraction + DanishPruthi + BhuwanDhingra + GrahamNeubig + Zachary C.Lipton + 3965–3970 + For many prediction tasks, stakeholders desire not only predictions but also supporting evidence that a human can use to verify its correctness. However, in practice, evidence annotations may only be available for a minority of training examples (if available at all). 
In this paper, we propose new methods to combine few evidence annotations (strong semi-supervision) with abundant document-level labels (weak supervision) for the task of evidence extraction. Evaluating on two classification tasks that feature evidence annotations, we find that our methods outperform baselines adapted from the interpretability literature to our task. Our approach yields gains with as few as a hundred evidence annotations. + 2020.findings-emnlp.353 + + + Making Information Seeking Easier: An Improved Pipeline for Conversational Search + VaibhavKumar + JamieCallan + 3971–3980 + This paper presents a highly effective pipeline for passage retrieval in a conversational search setting. The pipeline comprises two components: Conversational Term Selection (CTS) and Multi-View Reranking (MVR). CTS is responsible for performing the first-stage of passage retrieval. Given an input question, it uses a BERT-based classifier (trained with weak supervision) to de-contextualize the input by selecting relevant terms from the dialog history. Using the question and the selected terms, it issues a query to a search engine to perform the first-stage of passage retrieval. On the other hand, MVR is responsible for contextualized passage reranking. It first constructs multiple views of the information need embedded within an input question. The views are based on the dialog history and the top documents obtained in the first-stage of retrieval. It then uses each view to rerank passages using BERT (fine-tuned for passage ranking). Finally, MVR performs a fusion over the rankings produced by the individual views. Experiments show that the above combination improves first-stage retrieval as well as the overall accuracy in a reranking pipeline. On the key metric of NDCG@3, the proposed combination achieves a relative performance improvement of 14.8% over the state-of-the-art baseline and is also able to surpass the Oracle.
+ 2020.findings-emnlp.354 + + + Generalizable and Explainable Dialogue Generation via Explicit Action Learning + XintingHuang + JianzhongQi + YuSun + RuiZhang + 3981–3991 + Response generation for task-oriented dialogues implicitly optimizes two objectives at the same time: task completion and language quality. Conditioned response generation serves as an effective approach to separately and better optimize these two objectives. Such an approach relies on system action annotations which are expensive to obtain. To alleviate the need of action annotations, latent action learning is introduced to map each utterance to a latent representation. However, this approach is prone to over-dependence on the training data, and the generalization capability is thus restricted. To address this issue, we propose to learn natural language actions that represent utterances as a span of words. This explicit action representation promotes generalization via the compositional structure of language. It also enables an explainable generation process. Our proposed unsupervised approach learns a memory component to summarize system utterances into a short span of words. To further promote a compact action representation, we propose an auxiliary task that restores state annotations as the summarized dialogue context using the memory component. Our proposed approach outperforms latent action baselines on MultiWOZ, a benchmark multi-domain dataset. + 2020.findings-emnlp.355 + + + More Embeddings, Better Sequence Labelers? + XinyuWang + YongJiang + NguyenBach + TaoWang + ZhongqiangHuang + FeiHuang + KeweiTu + 3992–4006 + Recent work proposes a family of contextual embeddings that significantly improves the accuracy of sequence labelers over non-contextual embeddings. However, there is no definite conclusion on whether we can build better sequence labelers by combining different kinds of embeddings in various settings. 
In this paper, we conduct extensive experiments on 3 tasks over 18 datasets and 8 languages to study the accuracy of sequence labeling with various embedding concatenations and make three observations: (1) concatenating more embedding variants leads to better accuracy in rich-resource and cross-domain settings and some conditions of low-resource settings; (2) concatenating contextual sub-word embeddings with contextual character embeddings hurts the accuracy in extremely low-resource settings; (3) based on the conclusion of (1), concatenating additional similar contextual embeddings cannot lead to further improvements. We hope these conclusions can help people build stronger sequence labelers in various settings. + 2020.findings-emnlp.356 + + + <fixed-case>NLP</fixed-case> Service <fixed-case>API</fixed-case>s and Models for Efficient Registration of New Clients + SahilShah + VihariPiratla + SoumenChakrabarti + SunitaSarawagi + 4007–4012 + State-of-the-art NLP inference uses enormous neural architectures and models trained for GPU-months, well beyond the reach of most consumers of NLP. This has led to one-size-fits-all public API-based NLP service models by major AI companies, serving millions of clients. They cannot afford traditional fine tuning for individual clients. Many clients cannot even afford significant fine tuning, and own little or no labeled data. Recognizing that word usage and salience diversity across clients leads to reduced accuracy, we initiate a study of practical and lightweight adaptation of centralized NLP services to clients. Each client uses an unsupervised, corpus-based sketch to register to the service. The server modifies its network mildly to accommodate client sketches, and occasionally trains the augmented network over existing clients. When a new client registers with its sketch, it gets immediate accuracy benefits. We demonstrate the proposed architecture using sentiment labeling, NER, and predictive language modeling. 
+ 2020.findings-emnlp.357 + 2020.findings-emnlp.357.OptionalSupplementaryMaterial.pdf + + + Effects of Naturalistic Variation in Goal-Oriented Dialog + JatinGanhotra + RobertMoore + SachindraJoshi + KahiniWadhawan + 4013–4020 + Existing benchmarks used to evaluate the performance of end-to-end neural dialog systems lack a key component: natural variation present in human conversations. Most datasets are constructed through crowdsourcing, where the crowd workers follow a fixed template of instructions while enacting the role of a user/agent. This results in straight-forward, somewhat routine, and mostly trouble-free conversations, as crowd workers do not think to represent the full range of actions that occur naturally with real users. In this work, we investigate the impact of naturalistic variation on two goal-oriented datasets: bAbI dialog task and Stanford Multi-Domain Dataset (SMD). We also propose new and more effective testbeds for both datasets, by introducing naturalistic variation by the user. We observe that there is a significant drop in performance (more than 60% in Ent. F1 on SMD and 85% in per-dialog accuracy on bAbI task) of recent state-of-the-art end-to-end neural methods such as BossNet and GLMP on both datasets. + 2020.findings-emnlp.358 + + + Determining Event Outcomes: The Case of #fail + SrikalaMurugan + DhivyaChinnappa + EduardoBlanco + 4021–4033 + This paper targets the task of determining event outcomes in social media. We work with tweets containing either #cookingFail or #bakingFail, and show that many of the events described in them resulted in something edible. Tweets that contain images are more likely to result in edible albeit imperfect outcomes. Experimental results show that edibility is easier to predict than outcome quality. 
+ 2020.findings-emnlp.359 + 2020.findings-emnlp.359.OptionalSupplementaryMaterial.txt + + + <fixed-case>W</fixed-case>iki<fixed-case>L</fixed-case>ingua: A New Benchmark Dataset for Multilingual Abstractive Summarization + FaisalLadhak + EsinDurmus + ClaireCardie + KathleenMcKeown + 4034–4048 + We introduce WikiLingua, a large-scale, multilingual dataset for the evaluation of cross-lingual abstractive summarization systems. We extract article and summary pairs in 18 languages from WikiHow, a high quality, collaborative resource of how-to guides on a diverse set of topics written by human authors. We create gold-standard article-summary alignments across languages by aligning the images that are used to describe each how-to step in an article. As a set of baselines for further studies, we evaluate the performance of existing cross-lingual abstractive summarization methods on our dataset. We further propose a method for direct cross-lingual summarization (i.e., without requiring translation at inference time) by leveraging synthetic data and Neural Machine Translation as a pre-training step. Our method significantly outperforms the baseline approaches, while being more cost efficient during inference. + 2020.findings-emnlp.360 + + + Adversarial Training for Code Retrieval with Question-Description Relevance Regularization + JieZhao + HuanSun + 4049–4059 + Code retrieval is a key task aiming to match natural and programming languages. In this work, we propose adversarial learning for code retrieval, that is regularized by question-description relevance. First, we adapt a simple adversarial learning technique to generate difficult code snippets given the input question, which can help the learning of code retrieval that faces bi-modal and data-scarce challenges. 
Second, we propose to leverage question-description relevance to regularize adversarial learning, such that a generated code snippet should contribute more to the code retrieval training loss, only if its paired natural language description is predicted to be less relevant to the user given question. Experiments on large-scale code retrieval datasets of two programming languages show that our adversarial learning method is able to improve the performance of state-of-the-art models. Moreover, using an additional duplicated question detection model to regularize adversarial learning further improves the performance, and this is more effective than using the duplicated questions in strong multi-task learning baselines. + 2020.findings-emnlp.361 + + + Large Product Key Memory for Pre-trained Language Models + GyuwanKim + Tae HwanJung + 4060–4069 + Product key memory (PKM) proposed by Lample et al. (2019) enables to improve prediction accuracy by increasing model capacity efficiently with insignificant computational overhead. However, their empirical application is only limited to causal language modeling. Motivated by the recent success of pretrained language models (PLMs), we investigate how to incorporate large PKM into PLMs that can be finetuned for a wide variety of downstream NLP tasks. We define a new memory usage metric, and careful observation using this metric reveals that most memory slots remain outdated during the training of PKM-augmented models. To train better PLMs by tackling this issue, we propose simple but effective solutions: (1) initialization from the model weights pretrained without memory and (2) augmenting PKM by addition rather than replacing a feed-forward network. We verify that both of them are crucial for the pretraining of PKM-augmented PLMs, enhancing memory utilization and downstream performance. Code and pretrained weights are available at https://github.com/clovaai/pkm-transformers. 
+ 2020.findings-emnlp.362 + + + Temporal Reasoning in Natural Language Inference + SiddharthVashishtha + AdamPoliak + Yash KumarLal + BenjaminVan Durme + Aaron StevenWhite + 4070–4078 + We introduce five new natural language inference (NLI) datasets focused on temporal reasoning. We recast four existing datasets annotated for event duration—how long an event lasts—and event ordering—how events are temporally arranged—into more than one million NLI examples. We use these datasets to investigate how well neural models trained on a popular NLI corpus capture these forms of temporal reasoning. + 2020.findings-emnlp.363 + + + A Pilot Study of Text-to-<fixed-case>SQL</fixed-case> Semantic Parsing for <fixed-case>V</fixed-case>ietnamese + AnhTuan Nguyen + Mai HoangDao + Dat QuocNguyen + 4079–4085 + Semantic parsing is an important NLP task. However, Vietnamese is a low-resource language in this research area. In this paper, we present the first public large-scale Text-to-SQL semantic parsing dataset for Vietnamese. We extend and evaluate two strong semantic parsing baselines EditSQL (Zhang et al., 2019) and IRNet (Guo et al., 2019) on our dataset. We compare the two baselines with key configurations and find that: automatic Vietnamese word segmentation improves the parsing results of both baselines; the normalized pointwise mutual information (NPMI) score (Bouma, 2009) is useful for schema linking; latent syntactic features extracted from a neural dependency parser for Vietnamese also improve the results; and the monolingual language model PhoBERT for Vietnamese (Nguyen and Nguyen, 2020) helps produce higher performances than the recent best multilingual language model XLM-R (Conneau et al., 2020). 
+ 2020.findings-emnlp.364 + + + <fixed-case>STANDER</fixed-case>: An Expert-Annotated Dataset for News Stance Detection and Evidence Retrieval + CostanzaConforti + JakobBerndt + Mohammad TaherPilehvar + ChryssiGiannitsarou + FlavioToxvaerd + NigelCollier + 4086–4101 + We present a new challenging news dataset that targets both stance detection (SD) and fine-grained evidence retrieval (ER). With its 3,291 expert-annotated articles, the dataset constitutes a high-quality benchmark for future research in SD and multi-task learning. We provide a detailed description of the corpus collection methodology and carry out an extensive analysis on the sources of disagreement between annotators, observing a correlation between their disagreement and the diffusion of uncertainty around a target in the real world. Our experiments show that the dataset poses a strong challenge to recent state-of-the-art models. Notably, our dataset aligns with an existing Twitter SD dataset: their union thus addresses a key shortcoming of previous works, by providing the first dedicated resource to study multi-genre SD as well as the interplay of signals from social media and news sources in rumour verification. + 2020.findings-emnlp.365 + + + An Empirical Methodology for Detecting and Prioritizing Needs during Crisis Events + M. JaninaSarol + LyDinh + RezvanehRezapour + Chieh-LiChin + PingjingYang + JanaDiesner + 4102–4107 + In times of crisis, identifying essential needs is crucial to providing appropriate resources and services to affected entities. Social media platforms such as Twitter contain a vast amount of information about the general public’s needs. However, the sparsity of information and the amount of noisy content present a challenge for practitioners to effectively identify relevant information on these platforms. 
This study proposes two novel methods for two needs detection tasks: 1) extracting a list of needed resources, such as masks and ventilators, and 2) detecting sentences that specify who-needs-what resources (e.g., we need testing). We evaluate our methods on a set of tweets about the COVID-19 crisis. For extracting a list of needs, we compare our results against two official lists of resources, achieving 0.64 precision. For detecting who-needs-what sentences, we compared our results against a set of 1,000 annotated tweets and achieved a 0.68 F1-score. + 2020.findings-emnlp.366 + + + <fixed-case>S</fixed-case>up<fixed-case>MMD</fixed-case>: A Sentence Importance Model for Extractive Summarisation using Maximum Mean Discrepancy + UmangaBista + AlexanderMathews + AdityaMenon + LexingXie + 4108–4122 + Most work on multi-document summarization has focused on generic summarization of information present in each individual document set. However, the under-explored setting of update summarization, where the goal is to identify the new information present in each set, is of equal practical interest (e.g., presenting readers with updates on an evolving news topic). In this work, we present SupMMD, a novel technique for generic and update summarization based on the maximum mean discrepancy from kernel two-sample testing. SupMMD combines both supervised learning for salience and unsupervised learning for coverage and diversity. Further, we adapt multiple kernel learning to make use of similarity across multiple information sources (e.g., text features and knowledge based concepts). We show the efficacy of SupMMD in both generic and update summarization tasks by meeting or exceeding the current state-of-the-art on the DUC-2004 and TAC-2009 datasets. 
+ 2020.findings-emnlp.367 + + + Towards Low-Resource Semi-Supervised Dialogue Generation with Meta-Learning + YiHuang + JunlanFeng + ShuoMa + XiaoyuDu + XiaotingWu + 4123–4128 + In this paper, we propose a meta-learning based semi-supervised explicit dialogue state tracker (SEDST) for neural dialogue generation, denoted as MEDST. Our main motivation is to further bridge the chasm between the need for high accuracy dialogue state tracker and the common reality that only scarce annotated data is available for most real-life dialogue tasks. Specifically, MEDST has two core steps: meta-training with adequate unlabelled data in an automatic way and meta-testing with a few annotated data by supervised learning. In particular, we enhance SEDST via entropy regularization, and investigate semi-supervised learning frameworks based on model-agnostic meta-learning (MAML) that are able to reduce the amount of required intermediate state labelling. We find that by leveraging un-annotated data in meta-way instead, the amount of dialogue state annotations can be reduced below 10% while maintaining equivalent system performance. Experimental results show MEDST outperforms SEDST substantially by 18.7% joint goal accuracy and 14.3% entity match rate on the KVRET corpus with 2% labelled data in semi-supervision. + 2020.findings-emnlp.368 + + + Connecting the Dots: A Knowledgeable Path Generator for Commonsense Question Answering + PeifengWang + NanyunPeng + FilipIlievski + PedroSzekely + XiangRen + 4129–4140 + Commonsense question answering (QA) requires background knowledge which is not explicitly stated in a given context. Prior works use commonsense knowledge graphs (KGs) to obtain this knowledge for reasoning. However, relying entirely on these KGs may not suffice, considering their limited coverage and the contextual dependence of their knowledge. In this paper, we augment a general commonsense QA framework with a knowledgeable path generator. 
By extrapolating over existing paths in a KG with a state-of-the-art language model, our generator learns to connect a pair of entities in text with a dynamic, and potentially novel, multi-hop relational path. Such paths can provide structured evidence for solving commonsense questions without fine-tuning the path generator. Experiments on two datasets show the superiority of our method over previous works which fully rely on knowledge from KGs (with up to 6% improvement in accuracy), across various amounts of training data. Further evaluation suggests that the generated paths are typically interpretable, novel, and relevant to the task. + 2020.findings-emnlp.369 + + + No Answer is Better Than Wrong Answer: A Reflection Model for Document Level Machine Reading Comprehension + XuguangWang + LinjunShou + MingGong + NanDuan + DaxinJiang + 4141–4150 + The Natural Questions (NQ) benchmark set brings new challenges to Machine Reading Comprehension: the answers are not only at different levels of granularity (long and short), but also of richer types (including no-answer, yes/no, single-span and multi-span). In this paper, we target at this challenge and handle all answer types systematically. In particular, we propose a novel approach called Reflection Net which leverages a two-step training procedure to identify the no-answer and wrong-answer cases. Extensive experiments are conducted to verify the effectiveness of our approach. At the time of paper writing (May. 20, 2020), our approach achieved the top 1 on both long and short answer leaderboard, with F1 scores of 77.2 and 64.1, respectively. 
+ 2020.findings-emnlp.370 + 2020.findings-emnlp.370.OptionalSupplementaryMaterial.zip + + + Reference Language based Unsupervised Neural Machine Translation + ZuchaoLi + HaiZhao + RuiWang + MasaoUtiyama + EiichiroSumita + 4151–4162 + Exploiting a common language as an auxiliary for better translation has a long tradition in machine translation and lets supervised learning-based machine translation enjoy the enhancement delivered by the well-used pivot language in the absence of a source language to target language parallel corpus. The rise of unsupervised neural machine translation (UNMT) almost completely relieves the parallel corpus curse, though UNMT is still subject to unsatisfactory performance due to the vagueness of the clues available for its core back-translation training. Further enriching the idea of pivot translation by extending the use of parallel corpora beyond the source-target paradigm, we propose a new reference language-based framework for UNMT, RUNMT, in which the reference language only shares a parallel corpus with the source, but this corpus still indicates a signal clear enough to help the reconstruction training of UNMT through a proposed reference agreement mechanism. Experimental results show that our methods improve the quality of UNMT over that of a strong baseline that uses only one auxiliary language, demonstrating the usefulness of the proposed reference language-based UNMT and establishing a good start for the community. + 2020.findings-emnlp.371 + 2020.findings-emnlp.371.OptionalSupplementaryMaterial.zip + + + <fixed-case>T</fixed-case>iny<fixed-case>BERT</fixed-case>: Distilling <fixed-case>BERT</fixed-case> for Natural Language Understanding + XiaoqiJiao + YichunYin + LifengShang + XinJiang + XiaoChen + LinlinLi + FangWang + QunLiu + 4163–4174 + Language model pre-training, such as BERT, has significantly improved the performances of many natural language processing tasks. 
However, pre-trained language models are usually computationally expensive, so it is difficult to efficiently execute them on resource-restricted devices. To accelerate inference and reduce model size while maintaining accuracy, we first propose a novel Transformer distillation method that is specially designed for knowledge distillation (KD) of the Transformer-based models. By leveraging this new KD method, the plenty of knowledge encoded in a large “teacher” BERT can be effectively transferred to a small “student” TinyBERT. Then, we introduce a new two-stage learning framework for TinyBERT, which performs Transformer distillation at both the pre-training and task-specific learning stages. This framework ensures that TinyBERT can capture the general-domain as well as the task-specific knowledge in BERT. TinyBERT4 with 4 layers is empirically effective and achieves more than 96.8% the performance of its teacher BERT-Base on GLUE benchmark, while being 7.5x smaller and 9.4x faster on inference. TinyBERT4 is also significantly better than 4-layer state-of-the-art baselines on BERT distillation, with only ~28% parameters and ~31% inference time of them. Moreover, TinyBERT6 with 6 layers performs on-par with its teacher BERT-Base. + 2020.findings-emnlp.372 + 2020.findings-emnlp.372.OptionalSupplementaryMaterial.zip + + + Poison Attacks against Text Datasets with Conditional Adversarially Regularized Autoencoder + AlvinChan + YiTay + Yew-SoonOng + AstonZhang + 4175–4189 + This paper demonstrates a fatal vulnerability in natural language inference (NLI) and text classification systems. More concretely, we present a ‘backdoor poisoning’ attack on NLP models. Our poisoning attack utilizes conditional adversarially regularized autoencoder (CARA) to generate poisoned training samples by poison injection in latent space. 
Just by adding 1% poisoned data, our experiments show that a victim BERT finetuned classifier’s predictions can be steered to the poison target class with success rates of >80% when the input hypothesis is injected with the poison signature, demonstrating that NLI and text classification systems face a huge security risk. + 2020.findings-emnlp.373 + + + #Turki$h<fixed-case>T</fixed-case>weets: A Benchmark Dataset for <fixed-case>T</fixed-case>urkish Text Correction + Asiye TubaKoksal + OzgeBozal + EmreYürekli + GizemGezici + 4190–4198 + #Turki$hTweets is a benchmark dataset for the task of correcting the user misspellings, with the purpose of introducing the first public Turkish dataset in this area. #Turki$hTweets provides correct/incorrect word annotations with a detailed misspelling category formulation based on the real user data. We evaluated four state-of-the-art approaches on our dataset to present a preliminary analysis for the sake of reproducibility. + 2020.findings-emnlp.374 + 2020.findings-emnlp.374.OptionalSupplementaryMaterial.zip + + + Assessing Human-Parity in Machine Translation on the Segment Level + YvetteGraham + ChristianFedermann + MariaEskevich + BarryHaddow + 4199–4207 + Recent machine translation shared tasks have shown top-performing systems to tie or in some cases even outperform human translation. Such conclusions about system and human performance are, however, based on estimates aggregated from scores collected over large test sets of translations and unfortunately leave some remaining questions unanswered. For instance, simply because a system significantly outperforms the human translator on average may not necessarily mean that it has done so for every translation in the test set. Firstly, are there remaining source segments present in evaluation test sets that cause significant challenges for top-performing systems and can such challenging segments go unnoticed due to the opacity of current human evaluation procedures? 
To provide insight into these issues we carefully inspect the outputs of top-performing systems in the most recent WMT-19 news translation shared task for all language pairs in which a system either tied or outperformed human translation. Our analysis provides a new method of identifying the remaining segments for which either machine or human perform poorly. For example, in our close inspection of WMT-19 English to German and German to English we discover the segments that disjointly proved a challenge for human and machine. For English to Russian, there were no segments included in our sample of translations that caused a significant challenge for the human translator, while we again identify the set of segments that caused issues for the top-performing system. + 2020.findings-emnlp.375 + 2020.findings-emnlp.375.OptionalSupplementaryMaterial.pdf + + + Multichannel Generative Language Model: Learning All Possible Factorizations Within and Across Channels + HarrisChan + JamieKiros + WilliamChan + 4208–4220 + A channel corresponds to a viewpoint or transformation of an underlying meaning. A pair of parallel sentences in English and French express the same underlying meaning, but through two separate channels corresponding to their languages. In this work, we present the Multichannel Generative Language Model (MGLM). MGLM is a generative joint distribution model over channels. MGLM marginalizes over all possible factorizations within and across all channels. MGLM endows flexible inference, including unconditional generation, conditional generation (where 1 channel is observed and other channels are generated), and partially observed generation (where incomplete observations are spread across all the channels). We experiment with the Multi30K dataset containing English, French, Czech, and German. We demonstrate experiments with unconditional, conditional, and partially conditional generation. 
We provide qualitative samples sampled unconditionally from the generative joint distribution. We also quantitatively analyze the quality-diversity trade-offs and find MGLM outperforms traditional bilingual discriminative models. + 2020.findings-emnlp.376 + + + Factorized Transformer for Multi-Domain Neural Machine Translation + YongchaoDeng + HongfeiYu + HengYu + XiangyuDuan + WeihuaLuo + 4221–4230 + Multi-Domain Neural Machine Translation (NMT) aims at building a single system that performs well on a range of target domains. However, along with the extreme diversity of cross-domain wording and phrasing style, the imperfections of training data distribution and the inherent defects of the current sequential learning process all contribute to making the task of multi-domain NMT very challenging. To mitigate these problems, we propose the Factorized Transformer, which consists of an in-depth factorization of the parameters of an NMT model, namely Transformer in this paper, into two categories: domain-shared ones that encode common cross-domain knowledge and domain-specific ones that are private for each constituent domain. We experiment with various designs of our model and conduct extensive validations on English to French open multi-domain dataset. Our approach achieves state-of-the-art performance and opens up new perspectives for multi-domain and open-domain applications. + 2020.findings-emnlp.377 + + + Improving Named Entity Recognition with Attentive Ensemble of Syntactic Information + YuyangNie + YuanheTian + YanSong + XiangAo + XiangWan + 4231–4245 + Named entity recognition (NER) is highly sensitive to sentential syntactic and semantic properties where entities may be extracted according to how they are used and placed in the running text. 
To model such properties, one could rely on existing resources to providing helpful knowledge to the NER task; some existing studies proved the effectiveness of doing so, and yet are limited in appropriately leveraging the knowledge such as distinguishing the important ones for particular context. In this paper, we improve NER by leveraging different types of syntactic information through attentive ensemble, which functionalizes by the proposed key-value memory networks, syntax attention, and the gate mechanism for encoding, weighting and aggregating such syntactic information, respectively. Experimental results on six English and Chinese benchmark datasets suggest the effectiveness of the proposed model and show that it outperforms previous studies on all experiment datasets. + 2020.findings-emnlp.378 + + + Query-Key Normalization for Transformers + AlexHenry + Prudhvi RajDachapally + Shubham ShantaramPawar + YuxuanChen + 4246–4253 + Low-resource language translation is a challenging but socially valuable NLP task. Building on recent work adapting the Transformer’s normalization to this setting, we propose QKNorm, a normalization technique that modifies the attention mechanism to make the softmax function less prone to arbitrary saturation without sacrificing expressivity. Specifically, we apply l2-normalization along the head dimension of each query and key matrix prior to multiplying them and then scale up by a learnable parameter instead of dividing by the square root of the embedding dimension. We show improvements averaging 0.928 BLEU over state-of-the-art bilingual benchmarks for 5 low-resource translation pairs from the TED Talks corpus and IWSLT’15. 
+ 2020.findings-emnlp.379 + 2020.findings-emnlp.379.OptionalSupplementaryMaterial.zip + + + Contract Discovery: Dataset and a Few-shot Semantic Retrieval Challenge with Competitive Baselines + ŁukaszBorchmann + DawidWisniewski + AndrzejGretkowski + IzabelaKosmala + DawidJurkiewicz + ŁukaszSzałkiewicz + GabrielaPałka + KarolKaczmarek + AgnieszkaKaliska + FilipGraliński + 4254–4268 + We propose a new shared task of semantic retrieval from legal texts, in which a so-called contract discovery is to be performed – where legal clauses are extracted from documents, given a few examples of similar clauses from other legal acts. The task differs substantially from conventional NLI and shared tasks on legal information extraction (e.g., one has to identify text span instead of a single document, page, or paragraph). The specification of the proposed task is followed by an evaluation of multiple solutions within the unified framework proposed for this branch of methods. It is shown that state-of-the-art pretrained encoders fail to provide satisfactory results on the task proposed. In contrast, Language Model-based solutions perform better, especially when unsupervised fine-tuning is applied. Besides the ablation studies, we addressed questions regarding detection accuracy for relevant text fragments depending on the number of examples available. In addition to the dataset and reference results, LMs specialized in the legal domain were made publicly available. + 2020.findings-emnlp.380 + + + Vocabulary Adaptation for Domain Adaptation in Neural Machine Translation + ShoetsuSato + JinSakuma + NaokiYoshinaga + MasashiToyoda + MasaruKitsuregawa + 4269–4279 + Neural network methods exhibit strong performance only in a few resource-rich domains. Practitioners therefore employ domain adaptation from resource-rich domains that are, in most cases, distant from the target domain. 
Domain adaptation between distant domains (e.g., movie subtitles and research papers), however, cannot be performed effectively due to mismatches in vocabulary; it will encounter many domain-specific words (e.g., “angstrom”) and words whose meanings shift across domains (e.g., “conductor”). In this study, aiming to solve these vocabulary mismatches in domain adaptation for neural machine translation (NMT), we propose vocabulary adaptation, a simple method for effective fine-tuning that adapts embedding layers in a given pretrained NMT model to the target domain. Prior to fine-tuning, our method replaces the embedding layers of the NMT model by projecting general word embeddings induced from monolingual data in a target domain onto a source-domain embedding space. Experimental results indicate that our method improves the performance of conventional fine-tuning by 3.86 and 3.28 BLEU points in En-Ja and De-En translation, respectively. + 2020.findings-emnlp.381 + + + A Shared-Private Representation Model with Coarse-to-Fine Extraction for Target Sentiment Analysis + PeiqinLin + MengYang + 4280–4289 + Target sentiment analysis aims to detect opinion targets along with recognizing their sentiment polarities from a sentence. Some models with span-based labeling have achieved promising results in this task. However, the relation between the target extraction task and the target classification task has not been well exploited. Besides, the span-based target extraction algorithm has a poor performance on target phrases due to the maximum target length setting or length penalty factor. To address these problems, we propose a novel framework of Shared-Private Representation Model (SPRM) with a coarse-to-fine extraction algorithm. For jointly learning target extraction and classification, we design a Shared-Private Network, which encodes not only shared information for both tasks but also private information for each task. 
To avoid missing correct target phrases, we also propose a heuristic coarse-to-fine extraction algorithm that first gets the approximate interval of the targets by matching the nearest predicted start and end indexes and then extracts the targets by adopting an extending strategy. Experimental results show that our model achieves state-of-the-art performance. + 2020.findings-emnlp.382 + + + Detecting Media Bias in News Articles using <fixed-case>G</fixed-case>aussian Bias Distributions + Wei-FanChen + KhalidAl Khatib + BennoStein + HenningWachsmuth + 4290–4300 + Media plays an important role in shaping public opinion. Biased media can influence people in undesirable directions and hence should be unmasked as such. We observe that feature-based and neural text classification approaches which rely only on the distribution of low-level lexical information fail to detect media bias. This weakness becomes most noticeable for articles on new events, where words appear in new contexts and hence their “bias predictiveness” is unclear. In this paper, we therefore study how second-order information about biased statements in an article helps to improve detection effectiveness. In particular, we utilize the probability distributions of the frequency, positions, and sequential order of lexical and informational sentence-level bias in a Gaussian Mixture Model. On an existing media bias dataset, we find that the frequency and positions of biased statements strongly impact article-level bias, whereas their exact sequential order is secondary. Using a standard model for sentence-level bias detection, we provide empirical evidence that article-level bias detectors that use second-order information clearly outperform those without. + 2020.findings-emnlp.383 + + + How Can Self-Attention Networks Recognize <fixed-case>D</fixed-case>yck-n Languages? 
+ JavidEbrahimi + DhruvGelda + WeiZhang + 4301–4306 + We focus on the recognition of Dyck-n (Dn) languages with self-attention (SA) networks, which has been deemed to be a difficult task for these networks. We compare the performance of two variants of SA, one with a starting symbol (SA+) and one without (SA-). Our results show that SA+ is able to generalize to longer sequences and deeper dependencies. For D2, we find that SA- completely breaks down on long sequences whereas the accuracy of SA+ is 58.82%. We find attention maps learned by SA+ to be amenable to interpretation and compatible with a stack-based language recognizer. Surprisingly, the performance of SA networks is at par with LSTMs, which provides evidence on the ability of SA to learn hierarchies without recursion. + 2020.findings-emnlp.384 + 2020.findings-emnlp.384.OptionalSupplementaryMaterial.zip + + + Training Flexible Depth Model by Multi-Task Learning for Neural Machine Translation + QiangWang + TongXiao + JingboZhu + 4307–4312 + The standard neural machine translation model can only decode with the same depth configuration as training. Restricted by this feature, we have to deploy models of various sizes to maintain the same translation latency, because the hardware conditions on different terminal devices (e.g., mobile phones) may vary greatly. Such individual training leads to increased model maintenance costs and slower model iterations, especially for the industry. In this work, we propose to use multi-task learning to train a flexible depth model that can adapt to different depth configurations during inference. Experimental results show that our approach can simultaneously support decoding in 24 depth configurations and is superior to the individual training and another flexible depth model training method, LayerDrop. 
+ 2020.findings-emnlp.385 + 2020.findings-emnlp.385.OptionalSupplementaryMaterial.zip + + + Looking inside Noun Compounds: Unsupervised Prepositional and Free Paraphrasing using Language Models + GirishkumarPonkiya + RudraMurthy + PushpakBhattacharyya + GirishPalshikar + 4313–4323 + A noun compound is a sequence of contiguous nouns that acts as a single noun, although the predicate denoting the semantic relation between its components is dropped. Noun Compound Interpretation is the task of uncovering the relation, in the form of a preposition or a free paraphrase. Prepositional paraphrasing refers to the use of preposition to explain the semantic relation, whereas free paraphrasing refers to invoking an appropriate predicate denoting the semantic relation. In this paper, we propose an unsupervised methodology for these two types of paraphrasing. We use pre-trained contextualized language models to uncover the ‘missing’ words (preposition or predicate). These language models are usually trained to uncover the missing word/words in a given input sentence. Our approach uses templates to prepare the input sequence for the language model. The template uses a special token to indicate the missing predicate. As the model has already been pre-trained to uncover a missing word (or a sequence of words), we exploit it to predict missing words for the input sequence. Our experiments using four datasets show that our unsupervised approach (a) performs comparably to supervised approaches for prepositional paraphrasing, and (b) outperforms supervised approaches for free paraphrasing. Paraphrasing (prepositional or free) using our unsupervised approach is potentially helpful for NLP tasks like machine translation and information extraction. 
+ 2020.findings-emnlp.386 + + + The birth of <fixed-case>R</fixed-case>omanian <fixed-case>BERT</fixed-case> + StefanDumitrescu + Andrei-MariusAvram + SampoPyysalo + 4324–4328 + Large-scale pretrained language models have become ubiquitous in Natural Language Processing. However, most of these models are available either in high-resource languages, in particular English, or as multilingual models that compromise performance on individual languages for coverage. This paper introduces Romanian BERT, the first purely Romanian transformer-based language model, pretrained on a large text corpus. We discuss corpus composition and cleaning, the model training process, as well as an extensive evaluation of the model on various Romanian datasets. We open-source not only the model itself, but also a repository that contains information on how to obtain the corpus, fine-tune and use this model in production (with practical examples), and how to fully replicate the evaluation process. + 2020.findings-emnlp.387 + + + <fixed-case>BERT</fixed-case> for Monolingual and Cross-Lingual Reverse Dictionary + HangYan + XiaonanLi + XipengQiu + BocaoDeng + 4329–4338 + Reverse dictionary is the task to find the proper target word given the word description. In this paper, we tried to incorporate BERT into this task. However, since BERT is based on the byte-pair-encoding (BPE) subword encoding, it is nontrivial to make BERT generate a word given the description. We propose a simple but effective method to make BERT generate the target word for this specific task. Besides, the cross-lingual reverse dictionary is the task to find the proper target word described in another language. Previous models have to keep two different word embeddings and learn to align these embeddings. Nevertheless, by using the Multilingual BERT (mBERT), we can efficiently conduct the cross-lingual reverse dictionary with one subword embedding, and the alignment between languages is not necessary. 
More importantly, mBERT can achieve remarkable cross-lingual reverse dictionary performance even without the parallel corpus, which means it can conduct the cross-lingual reverse dictionary with only corresponding monolingual data. Code is publicly available at https://github.com/yhcc/BertForRD.git. + 2020.findings-emnlp.388 + + + What’s so special about <fixed-case>BERT</fixed-case>’s layers? A closer look at the <fixed-case>NLP</fixed-case> pipeline in monolingual and multilingual models + Wietsede Vries + Andreasvan Cranenburgh + MalvinaNissim + 4339–4350 + Peeking into the inner workings of BERT has shown that its layers resemble the classical NLP pipeline, with progressively more complex tasks being concentrated in later layers. To investigate to what extent these results also hold for a language other than English, we probe a Dutch BERT-based model and the multilingual BERT model for Dutch NLP tasks. In addition, through a deeper analysis of part-of-speech tagging, we show that also within a given task, information is spread over different parts of the network and the pipeline might not be as neat as it seems. Each layer has different specialisations, so that it may be more useful to combine information from different layers, instead of selecting a single one based on the best overall performance. + 2020.findings-emnlp.389 + + + Leakage-Adjusted Simulatability: Can Models Generate Non-Trivial Explanations of Their Behavior in Natural Language? + PeterHase + ShiyueZhang + HarryXie + MohitBansal + 4351–4367 + Data collection for natural language (NL) understanding tasks has increasingly included human explanations alongside data points, allowing past works to introduce models that both perform a task and generate NL explanations for their outputs. Yet to date, model-generated explanations have been evaluated on the basis of surface-level similarities to human explanations, both through automatic metrics like BLEU and human evaluations. 
We argue that these evaluations are insufficient, since they fail to indicate whether explanations support actual model behavior (faithfulness), rather than simply match what a human would say (plausibility). In this work, we address the problem of evaluating explanations from the model simulatability perspective. Our contributions are as follows: (1) We introduce a leakage-adjusted simulatability (LAS) metric for evaluating NL explanations, which measures how well explanations help an observer predict a model’s output, while controlling for how explanations can directly leak the output. We use a model as a proxy for a human observer, and validate this choice with two human subject experiments. (2) Using the CoS-E and e-SNLI datasets, we evaluate two existing generative graphical models and two new approaches; one rationalizing method we introduce achieves roughly human-level LAS scores. (3) Lastly, we frame explanation generation as a multi-agent game and optimize explanations for simulatability while penalizing label leakage, which can improve LAS scores. + 2020.findings-emnlp.390 + + + A Pointer Network Architecture for Joint Morphological Segmentation and Tagging + AmitSeker + ReutTsarfaty + 4368–4378 + Morphologically Rich Languages (MRLs) such as Arabic, Hebrew and Turkish often require Morphological Disambiguation (MD), i.e., the prediction of morphological decomposition of tokens into morphemes, early in the pipeline. Neural MD may be addressed as a simple pipeline, where segmentation is followed by sequence tagging, or as an end-to-end model, predicting morphemes from raw tokens. Both approaches are sub-optimal; the former is heavily prone to error propagation, and the latter does not enjoy explicit access to the basic processing units called morphemes. This paper offers MD architecture that combines the symbolic knowledge of morphemes with the learning capacity of neural end-to-end modeling. 
We propose a new, general and easy-to-implement Pointer Network model where the input is a morphological lattice and the output is a sequence of indices pointing at a single disambiguated path of morphemes. We demonstrate the efficacy of the model on segmentation and tagging, for Hebrew and Turkish texts, based on their respective Universal Dependencies (UD) treebanks. Our experiments show that with complete lattices, our model outperforms all shared-task results on segmenting and tagging these languages. On the SPMRL treebank, our model outperforms all previously reported results for Hebrew MD in realistic scenarios. + 2020.findings-emnlp.391 + + + Beyond Language: Learning Commonsense from Images for Reasoning + WanqingCui + YanyanLan + LiangPang + JiafengGuo + XueqiCheng + 4379–4389 + This paper proposes a novel approach to learn commonsense from images, instead of limited raw texts or costly constructed knowledge bases, for the commonsense reasoning problem in NLP. Our motivation comes from the fact that an image is worth a thousand words, where richer scene information could be leveraged to help distill the commonsense knowledge, which is often hidden in languages. Our approach, namely Loire, consists of two stages. In the first stage, a bi-modal sequence-to-sequence approach is utilized to conduct the scene layout generation task, based on a text representation model ViBERT. In this way, the required visual scene knowledge, such as spatial relations, will be encoded in ViBERT by the supervised learning process with some bi-modal data like COCO. Then ViBERT is concatenated with a pre-trained language model to perform the downstream commonsense reasoning tasks. Experimental results on two commonsense reasoning problems, i.e., commonsense question answering and pronoun resolution, demonstrate that Loire outperforms traditional language-based methods. 
We also give some case studies to show what knowledge is learned from images and explain how the generated scene layout helps the commonsense reasoning process. + 2020.findings-emnlp.392 + + + A <fixed-case>BERT</fixed-case>-based Distractor Generation Scheme with Multi-tasking and Negative Answer Training Strategies. + Ho-LamChung + Ying-HongChan + Yao-ChungFan + 4390–4400 + In this paper, we investigate the following two limitations for the existing distractor generation (DG) methods. First, the quality of the existing DG methods is still far from practical use. There is still room for DG quality improvement. Second, the existing DG designs are mainly for single distractor generation. However, for practical MCQ preparation, multiple distractors are desired. Aiming at these goals, in this paper, we present a new distractor generation scheme with multi-tasking and negative answer training strategies for effectively generating multiple distractors. The experimental results show that (1) our model advances the state-of-the-art result from 28.65 to 39.81 (BLEU 1 score) and (2) the generated multiple distractors are diverse and show strong distracting power for multiple choice questions. + 2020.findings-emnlp.393 + + + How Effective is Task-Agnostic Data Augmentation for Pretrained Transformers? + ShayneLongpre + YuWang + ChrisDuBois + 4401–4411 + Task-agnostic forms of data augmentation have proven widely effective in computer vision, even on pretrained models. In NLP similar results are reported most commonly for low data regimes, non-pretrained models, or situationally for pretrained models. In this paper we ask how effective these techniques really are when applied to pretrained transformers. 
Using two popular varieties of task-agnostic data augmentation (not tailored to any particular task), Easy Data Augmentation (Wei and Zou, 2019) and Back-Translation (Sennrich et al., 2015), we conduct a systematic examination of their effects across 5 classification tasks, 6 datasets, and 3 variants of modern pretrained transformers, including BERT, XLNet, and RoBERTa. We observe a negative result, finding that techniques which previously reported strong improvements for non-pretrained models fail to consistently improve performance for pretrained transformers, even when training data is limited. We hope this empirical analysis helps inform practitioners where data augmentation techniques may confer improvements. + 2020.findings-emnlp.394 + + + Visually-Grounded Planning without Vision: Language Models Infer Detailed Plans from High-level Instructions + PeterJansen + 4412–4417 + The recently proposed ALFRED challenge task aims for a virtual robotic agent to complete complex multi-step everyday tasks in a virtual home environment from high-level natural language directives, such as “put a hot piece of bread on a plate”. Currently, the best-performing models are able to complete less than 1% of these tasks successfully. In this work we focus on modeling the translation problem of converting natural language directives into detailed multi-step sequences of actions that accomplish those goals in the virtual environment. We empirically demonstrate that it is possible to generate gold multi-step plans from language directives alone without any visual input in 26% of unseen cases. When a small amount of visual information, the starting location in the virtual environment, is incorporated, our best-performing GPT-2 model successfully generates gold command sequences in 58% of cases, suggesting contextualized language models may provide strong planning modules for grounded virtual agents. 
+ 2020.findings-emnlp.395 + + + Consistent Response Generation with Controlled Specificity + JunyaTakayama + YukiArase + 4418–4427 + We propose a method to control the specificity of responses while maintaining the consistency with the utterances. We first design a metric based on pointwise mutual information, which measures the co-occurrence degree between an utterance and a response. To control the specificity of generated responses, we add the distant supervision based on the co-occurrence degree and a PMI-based word prediction mechanism to a sequence-to-sequence model. With these mechanisms, our model outputs the words with optimal specificity for a given specificity control variable. In experiments with open-domain dialogue corpora, automatic and human evaluation results confirm that our model controls the specificity of the response more sensitively than the conventional model and can generate highly consistent responses. + 2020.findings-emnlp.396 + + + Internal and External Pressures on Language Emergence: Least Effort, Object Constancy and Frequency + DianaRodríguez Luna + Edoardo MariaPonti + DieuwkeHupkes + EliaBruni + 4428–4437 + In previous work, artificial agents were shown to achieve almost perfect accuracy in referential games where they have to communicate to identify images. Nevertheless, the resulting communication protocols rarely display salient features of natural languages, such as compositionality. In this paper, we propose some realistic sources of pressure on communication that avert this outcome. More specifically, we formalise the principle of least effort through an auxiliary objective. Moreover, we explore several game variants, inspired by the principle of object constancy, in which we alter the frequency, position, and luminosity of the objects in the images. We perform an extensive analysis on their effect through compositionality metrics, diagnostic classifiers, and zero-shot evaluation. 
Our findings reveal that the proposed sources of pressure result in emerging languages with less redundancy, more focus on high-level conceptual information, and better abilities of generalisation. Overall, our contributions reduce the gap between emergent and natural languages. + 2020.findings-emnlp.397 + + + Parsing All: Syntax and Semantics, Dependencies and Spans + JunruZhou + ZuchaoLi + HaiZhao + 4438–4449 + Both syntactic and semantic structures are key linguistic contextual clues, in which parsing the latter has been well shown beneficial from parsing the former. However, few works ever made an attempt to let semantic parsing help syntactic parsing. As linguistic representation formalisms, both syntax and semantics may be represented in either span (constituent/phrase) or dependency, on both of which joint learning was also seldom explored. In this paper, we propose a novel joint model of syntactic and semantic parsing on both span and dependency representations, which incorporates syntactic information effectively in the encoder of neural network and benefits from two representation formalisms in a uniform way. The experiments show that semantics and syntax can benefit each other by optimizing joint objectives. Our single model achieves new state-of-the-art or competitive results on both span and dependency semantic parsing on Propbank benchmarks and both dependency and constituent syntactic parsing on Penn Treebank. + 2020.findings-emnlp.398 + 2020.findings-emnlp.398.OptionalSupplementaryMaterial.zip + + + <fixed-case>LIMIT</fixed-case>-<fixed-case>BERT</fixed-case> : Linguistics Informed Multi-Task <fixed-case>BERT</fixed-case> + JunruZhou + ZhuoshengZhang + HaiZhao + ShuailiangZhang + 4450–4461 + In this paper, we present Linguistics Informed Multi-Task BERT (LIMIT-BERT) for learning language representations across multiple linguistics tasks by Multi-Task Learning. 
LIMIT-BERT includes five key linguistics tasks: Part-Of-Speech (POS) tags, constituent and dependency syntactic parsing, span and dependency semantic role labeling (SRL). Different from recent Multi-Task Deep Neural Networks (MT-DNN), our LIMIT-BERT is fully linguistics motivated and thus is capable of adopting an improved masked training objective according to syntactic and semantic constituents. Besides, LIMIT-BERT takes a semi-supervised learning strategy to offer the same large amount of linguistics task data as that for the language model training. As a result, LIMIT-BERT not only improves linguistics tasks performance but also benefits from a regularization effect and linguistics information that leads to more general representations to help adapt to new tasks and domains. LIMIT-BERT outperforms the strong baseline Whole Word Masking BERT on both dependency and constituent syntactic/semantic parsing, GLUE benchmark, and SNLI task. Our practice on the proposed LIMIT-BERT also enables us to release a well pre-trained model for multi-purpose of natural language processing tasks once for all. + 2020.findings-emnlp.399 + 2020.findings-emnlp.399.OptionalSupplementaryMaterial.zip + + + Improving Limited Labeled Dialogue State Tracking with Self-Supervision + Chien-ShengWu + Steven C.H.Hoi + CaimingXiong + 4462–4472 + Existing dialogue state tracking (DST) models require plenty of labeled data. However, collecting high-quality labels is costly, especially when the number of domains increases. In this paper, we address a practical DST problem that is rarely discussed, i.e., learning efficiently with limited labeled data. We present and investigate two self-supervised objectives: preserving latent consistency and modeling conversational behavior. We encourage a DST model to have consistent latent distributions given a perturbed input, making it more robust to an unseen scenario. 
We also add an auxiliary utterance generation task, modeling a potential correlation between conversational behavior and dialogue states. The experimental results show that our proposed self-supervised signals can improve joint goal accuracy by 8.95% when only 1% labeled data is used on the MultiWOZ dataset. We can achieve an additional 1.76% improvement if some unlabeled data is jointly trained as semi-supervised learning. We analyze and visualize how our proposed self-supervised signals help the DST task and hope to stimulate future data-efficient DST research. + 2020.findings-emnlp.400 + 2020.findings-emnlp.400.OptionalSupplementaryMaterial.pdf + + + On the Branching Bias of Syntax Extracted from Pre-trained Language Models + HuayangLi + LemaoLiu + GuopingHuang + ShumingShi + 4473–4478 + Many efforts have been devoted to extracting constituency trees from pre-trained language models, often proceeding in two stages: feature definition and parsing. However, this kind of methods may suffer from the branching bias issue, which will inflate the performances on languages with the same branch it biases to. In this work, we propose quantitatively measuring the branching bias by comparing the performance gap on a language and its reversed language, which is agnostic to both language models and extracting methods. Furthermore, we analyze the impacts of three factors on the branching bias, namely feature definitions, parsing algorithms, and language models. Experiments show that several existing works exhibit branching biases, and some implementations of these three factors can introduce the branching bias. 
+ 2020.findings-emnlp.401 + + + The Pragmatics behind Politics: Modelling Metaphor, Framing and Emotion in Political Discourse + Pere-LluísHuguet Cabot + VernaDankers + DavidAbadi + AgnetaFischer + EkaterinaShutova + 4479–4488 + There has been an increased interest in modelling political discourse within the natural language processing (NLP) community, in tasks such as political bias and misinformation detection, among others. Metaphor-rich and emotion-eliciting communication strategies are ubiquitous in political rhetoric, according to social science research. Yet, none of the existing computational models of political discourse has incorporated these phenomena. In this paper, we present the first joint models of metaphor, emotion and political rhetoric, and demonstrate that they advance performance in three tasks: predicting political perspective of news articles, party affiliation of politicians and framing of policy issues. + 2020.findings-emnlp.402 + + + <fixed-case>SMRT</fixed-case>er Chatbots: Improving Non-Task-Oriented Dialog with Simulated Multi-Reference Training + HudaKhayrallah + JoãoSedoc + 4489–4505 + Non-task-oriented dialog models suffer from poor quality and non-diverse responses. To overcome limited conversational data, we apply Simulated Multiple Reference Training (SMRT; Khayrallah et al., 2020), and use a paraphraser to simulate multiple responses per training prompt. We find SMRT improves over a strong Transformer baseline as measured by human and automatic quality scores and lexical diversity. We also find SMRT is comparable to pretraining in human evaluation quality, and outperforms pretraining on automatic quality and lexical diversity, without requiring related-domain dialog data. 
+ 2020.findings-emnlp.403 + + + <fixed-case>P</fixed-case>riv<fixed-case>N</fixed-case>et: Safeguarding Private Attributes in Transfer Learning for Recommendation + GuangnengHu + QiangYang + 4506–4516 + Transfer learning is an effective technique to improve a target recommender system with the knowledge from a source domain. Existing research focuses on the recommendation performance of the target domain while ignores the privacy leakage of the source domain. The transferred knowledge, however, may unintendedly leak private information of the source domain. For example, an attacker can accurately infer user demographics from their historical purchase provided by a source domain data owner. This paper addresses the above privacy-preserving issue by learning a privacy-aware neural representation by improving target performance while protecting source privacy. The key idea is to simulate the attacks during the training for protecting unseen users’ privacy in the future, modeled by an adversarial game, so that the transfer learning model becomes robust to attacks. Experiments show that the proposed PrivNet model can successfully disentangle the knowledge benefitting the transfer from leaking the privacy. + 2020.findings-emnlp.404 + 2020.findings-emnlp.404.OptionalSupplementaryMaterial.zip + + + Learning to Learn to Disambiguate: Meta-Learning for Few-Shot Word Sense Disambiguation + NithinHolla + PushkarMishra + HelenYannakoudakis + EkaterinaShutova + 4517–4533 + The success of deep learning methods hinges on the availability of large training datasets annotated for the task of interest. In contrast to human intelligence, these methods lack versatility and struggle to learn and adapt quickly to new tasks, where labeled data is scarce. Meta-learning aims to solve this problem by training a model on a large number of few-shot tasks, with an objective to learn new tasks quickly from a small number of examples. 
In this paper, we propose a meta-learning framework for few-shot word sense disambiguation (WSD), where the goal is to learn to disambiguate unseen words from only a few labeled instances. Meta-learning approaches have so far been typically tested in an N-way, K-shot classification setting where each task has N classes with K examples per class. Owing to its nature, WSD deviates from this controlled setup and requires the models to handle a large number of highly unbalanced classes. We extend several popular meta-learning approaches to this scenario, and analyze their strengths and weaknesses in this new challenging setting. + 2020.findings-emnlp.405 + + + An Empirical Investigation of Beam-Aware Training in Supertagging + RenatoNegrinho + Matthew R.Gormley + GeoffGordon + 4534–4542 + Structured prediction is often approached by training a locally normalized model with maximum likelihood and decoding approximately with beam search. This approach leads to mismatches as, during training, the model is not exposed to its mistakes and does not use beam search. Beam-aware training aims to address these problems, but unfortunately, it is not yet widely used due to a lack of understanding about how it impacts performance, when it is most useful, and whether it is stable. Recently, Negrinho et al. (2018) proposed a meta-algorithm that captures beam-aware training algorithms and suggests new ones, but unfortunately did not provide empirical results. In this paper, we begin an empirical investigation: we train the supertagging model of Vaswani et al. (2018) and a simpler model with instantiations of the meta-algorithm. We explore the influence of various design choices and make recommendations for choosing them. We observe that beam-aware training improves performance for both models, with large improvements for the simpler model which must effectively manage uncertainty during decoding. 
Our results suggest that a model must be learned with search to maximize its effectiveness. + 2020.findings-emnlp.406 + 2020.findings-emnlp.406.OptionalSupplementaryMaterial.zip + + + Improving Aspect-based Sentiment Analysis with Gated Graph Convolutional Networks and Syntax-based Regulation + AmirPouran Ben Veyseh + NasimNouri + FranckDernoncourt + Quan HungTran + DejingDou + Thien HuuNguyen + 4543–4548 + Aspect-based Sentiment Analysis (ABSA) seeks to predict the sentiment polarity of a sentence toward a specific aspect. Recently, it has been shown that dependency trees can be integrated into deep learning models to produce the state-of-the-art performance for ABSA. However, these models tend to compute the hidden/representation vectors without considering the aspect terms and fail to benefit from the overall contextual importance scores of the words that can be obtained from the dependency tree for ABSA. In this work, we propose a novel graph-based deep learning model to overcome these two issues of the prior work on ABSA. In our model, gate vectors are generated from the representation vectors of the aspect terms to customize the hidden vectors of the graph-based models toward the aspect terms. In addition, we propose a mechanism to obtain the importance scores for each word in the sentences based on the dependency trees that are then injected into the model to improve the representation vectors for ABSA. The proposed model achieves the state-of-the-art performance on three benchmark datasets. + 2020.findings-emnlp.407 + + + Decoding language spatial relations to 2<fixed-case>D</fixed-case> spatial arrangements + GorjanRadevski + GuillemCollell + Marie-FrancineMoens + TinneTuytelaars + 4549–4560 + We address the problem of multimodal spatial understanding by decoding a set of language-expressed spatial relations to a set of 2D spatial arrangements in a multi-object and multi-relationship setting. 
We frame the task as arranging a scene of clip-arts given a textual description. We propose a simple and effective model architecture Spatial-Reasoning Bert (SR-Bert), trained to decode text to 2D spatial arrangements in a non-autoregressive manner. SR-Bert can decode both explicit and implicit language to 2D spatial arrangements, generalizes to out-of-sample data to a reasonable extent and can generate complete abstract scenes if paired with a clip-arts predictor. Finally, we qualitatively evaluate our method with a user study, validating that our generated spatial arrangements align with human expectation. + 2020.findings-emnlp.408 + 2020.findings-emnlp.408.OptionalSupplementaryMaterial.zip + + + The Dots Have Their Values: Exploiting the Node-Edge Connections in Graph-based Neural Models for Document-level Relation Extraction + HieuMinh Tran + Minh TrungNguyen + Thien HuuNguyen + 4561–4567 + The goal of Document-level Relation Extraction (DRE) is to recognize the relations between entity mentions that can span beyond sentence boundary. The current state-of-the-art method for this problem has involved the graph-based edge-oriented model where the entity mentions, entities, and sentences in the documents are used as the nodes of the document graphs for representation learning. However, this model does not capture the representations for the nodes in the graphs, thus preventing it from effectively encoding the specific and relevant information of the nodes for DRE. To address this issue, we propose to explicitly compute the representations for the nodes in the graph-based edge-oriented model for DRE. These node representations allow us to introduce two novel representation regularization mechanisms to improve the representation vectors for DRE. The experiments show that our model achieves state-of-the-art performance on two benchmark datasets. + 2020.findings-emnlp.409 + + + Why and when should you pool? 
Analyzing Pooling in Recurrent Architectures + PratyushMaini + KeshavKolluru + DanishPruthi + Mausam + 4568–4586 + Pooling-based recurrent neural architectures consistently outperform their counterparts without pooling on sequence classification tasks. However, the reasons for their enhanced performance are largely unexamined. In this work, we examine three commonly used pooling techniques (mean-pooling, max-pooling, and attention), and propose *max-attention*, a novel variant that captures interactions among predictive tokens in a sentence. Using novel experiments, we demonstrate that pooling architectures substantially differ from their non-pooling equivalents in their learning ability and positional biases: (i) pooling facilitates better gradient flow than BiLSTMs in initial training epochs, and (ii) BiLSTMs are biased towards tokens at the beginning and end of the input, whereas pooling alleviates this bias. Consequently, we find that pooling yields large gains in low resource scenarios, and instances when salient words lie towards the middle of the input. Across several text classification tasks, we find max-attention to frequently outperform other pooling techniques. + 2020.findings-emnlp.410 + + + Structural and Functional Decomposition for Personality Image Captioning in a Communication Game + Minh ThuNguyen + DuyPhung + MinhHoai + Thien HuuNguyen + 4587–4593 + Personality image captioning (PIC) aims to describe an image with a natural language caption given a personality trait. In this work, we introduce a novel formulation for PIC based on a communication game between a speaker and a listener. The speaker attempts to generate natural language captions while the listener encourages the generated captions to contain discriminative information about the input images and personality traits. In this way, we expect that the generated captions can be improved to naturally represent the images and express the traits.
In addition, we propose to adapt the language model GPT2 to perform caption generation for PIC. This enables the speaker and listener to benefit from the language encoding capacity of GPT2. Our experiments show that the proposed model achieves the state-of-the-art performance for PIC. + 2020.findings-emnlp.411 + 2020.findings-emnlp.411.OptionalSupplementaryMaterial.zip + + + Long Document Ranking with Query-Directed Sparse Transformer + Jyun-YuJiang + ChenyanXiong + Chia-JungLee + WeiWang + 4594–4605 + The computing cost of transformer self-attention often necessitates breaking long documents to fit in pretrained models in document ranking tasks. In this paper, we design Query-Directed Sparse attention that induces IR-axiomatic structures in transformer self-attention. Our model, QDS-Transformer, enforces the principle properties desired in ranking: local contextualization, hierarchical representation, and query-oriented proximity matching, while it also enjoys efficiency from sparsity. Experiments on four fully supervised and few-shot TREC document ranking benchmarks demonstrate the consistent and robust advantage of QDS-Transformer over previous approaches, as they either retrofit long documents into BERT or use sparse attention without emphasizing IR principles. We further quantify the computing complexity and demonstrate that our sparse attention with TVM implementation is twice as efficient as the fully-connected self-attention. All source codes, trained model, and predictions of this work are available at https://github.com/hallogameboy/QDS-Transformer. + 2020.findings-emnlp.412 + + + Visuo-Linguistic Question Answering (<fixed-case>VLQA</fixed-case>) Challenge + Shailaja KeyurSampat + YezhouYang + ChittaBaral + 4606–4616 + Understanding images and text together is an important aspect of cognition and building advanced Artificial Intelligence (AI) systems.
As a community, we have achieved good benchmarks over language and vision domains separately, however joint reasoning is still a challenge for state-of-the-art computer vision and natural language processing (NLP) systems. We propose a novel task to derive joint inference about a given image-text modality and compile the Visuo-Linguistic Question Answering (VLQA) challenge corpus in a question answering setting. Each dataset item consists of an image and a reading passage, where questions are designed to combine both visual and textual information i.e., ignoring either modality would make the question unanswerable. We first explore the best existing vision-language architectures to solve VLQA subsets and show that they are unable to reason well. We then develop a modular method with slightly better baseline performance, but it is still far behind human performance. We believe that VLQA will be a good benchmark for reasoning over a visuo-linguistic context. The dataset, code and leaderboard is available at https://shailaja183.github.io/vlqa/. + 2020.findings-emnlp.413 + 2020.findings-emnlp.413.OptionalSupplementaryMaterial.pdf + + + Byte Pair Encoding is Suboptimal for Language Model Pretraining + KajBostrom + GregDurrett + 4617–4624 + The success of pretrained transformer language models (LMs) in natural language processing has led to a wide range of pretraining setups. In particular, these models employ a variety of subword tokenization methods, most notably byte-pair encoding (BPE) (Sennrich et al., 2016; Gage, 1994), the WordPiece method (Schuster and Nakajima, 2012), and unigram language modeling (Kudo, 2018), to segment text. However, to the best of our knowledge, the literature does not contain a direct evaluation of the impact of tokenization on language model pretraining. 
We analyze differences between BPE and unigram LM tokenization, finding that the latter method recovers subword units that align more closely with morphology and avoids problems stemming from BPE’s greedy construction procedure. We then compare the fine-tuned task performance of identical transformer masked language models pretrained with these tokenizations. Across downstream tasks and two languages (English and Japanese), we find that the unigram LM tokenization method matches or outperforms BPE. We hope that developers of future pretrained LMs will consider adopting the unigram LM method over the more prevalent BPE. + 2020.findings-emnlp.414 + + + Exploring <fixed-case>BERT</fixed-case>’s sensitivity to lexical cues using tests from semantic priming + KanishkaMisra + AllysonEttinger + JuliaRayz + 4625–4635 + Models trained to estimate word probabilities in context have become ubiquitous in natural language processing. How do these models use lexical cues in context to inform their word probabilities? To answer this question, we present a case study analyzing the pre-trained BERT model with tests informed by semantic priming. Using English lexical stimuli that show priming in humans, we find that BERT too shows “priming”, predicting a word with greater probability when the context includes a related word versus an unrelated one. This effect decreases as the amount of information provided by the context increases. Follow-up analysis shows BERT to be increasingly distracted by related prime words as context becomes more informative, assigning lower probabilities to related words. Our findings highlight the importance of considering contextual constraint effects when studying word prediction in these models, and highlight possible parallels with human processing. 
+ 2020.findings-emnlp.415 + + + Multi-hop Question Generation with Graph Convolutional Network + DanSu + YanXu + WenliangDai + ZiweiJi + TiezhengYu + PascaleFung + 4636–4647 + Multi-hop Question Generation (QG) aims to generate answer-related questions by aggregating and reasoning over multiple scattered evidence from different paragraphs. It is a more challenging yet under-explored task compared to conventional single-hop QG, where the questions are generated from the sentence containing the answer or nearby sentences in the same paragraph without complex reasoning. To address the additional challenges in multi-hop QG, we propose Multi-Hop Encoding Fusion Network for Question Generation (MulQG), which does context encoding in multiple hops with Graph Convolutional Network and encoding fusion via an Encoder Reasoning Gate. To the best of our knowledge, we are the first to tackle the challenge of multi-hop reasoning over paragraphs without any sentence-level information. Empirical results on HotpotQA dataset demonstrate the effectiveness of our method, in comparison with baselines on automatic evaluation metrics. Moreover, from the human evaluation, our proposed model is able to generate fluent questions with high completeness and outperforms the strongest baseline by 20.8% in the multi-hop evaluation. The code is publicly available at https://github.com/HLTCHKU + 2020.findings-emnlp.416 + + + <fixed-case>MMFT</fixed-case>-<fixed-case>BERT</fixed-case>: Multimodal Fusion Transformer with <fixed-case>BERT</fixed-case> Encodings for Visual Question Answering + AishaUrooj + AmirMazaheri + NielsDa vitoria lobo + MubarakShah + 4648–4660 + We present MMFT-BERT (MultiModal Fusion Transformer with BERT encodings), to solve Visual Question Answering (VQA) ensuring individual and combined processing of multiple input modalities.
Our approach benefits from processing multimodal data (video and text) adopting the BERT encodings individually and using a novel transformer-based fusion method to fuse them together. Our method decomposes the different sources of modalities, into different BERT instances with similar architectures, but variable weights. This achieves SOTA results on the TVQA dataset. Additionally, we provide TVQA-Visual, an isolated diagnostic subset of TVQA, which strictly requires the knowledge of visual (V) modality based on a human annotator’s judgment. This set of questions helps us to study the model’s behavior and the challenges TVQA poses to prevent the achievement of super human performance. Extensive experiments show the effectiveness and superiority of our method. + 2020.findings-emnlp.417 + + + Thinking Like a Skeptic: Defeasible Inference in Natural Language + RachelRudinger + VeredShwartz + Jena D.Hwang + ChandraBhagavatula + MaxwellForbes + RonanLe Bras + Noah A.Smith + YejinChoi + 4661–4675 + Defeasible inference is a mode of reasoning in which an inference (X is a bird, therefore X flies) may be weakened or overturned in light of new evidence (X is a penguin). Though long recognized in classical AI and philosophy, defeasible inference has not been extensively studied in the context of contemporary data-driven research on natural language inference and commonsense reasoning. We introduce Defeasible NLI (abbreviated δ-NLI), a dataset for defeasible inference in natural language. Defeasible NLI contains extensions to three existing inference datasets covering diverse modes of reasoning: common sense, natural language inference, and social norms. From Defeasible NLI, we develop both a classification and generation task for defeasible inference, and demonstrate that the generation task is much more challenging.
Despite lagging human performance, however, generative models trained on this data are capable of writing sentences that weaken or strengthen a specified inference up to 68% of the time. + 2020.findings-emnlp.418 + + + Guiding Attention for Self-Supervised Learning with Transformers + AmeetDeshpande + KarthikNarasimhan + 4676–4686 + In this paper, we propose a simple and effective technique to allow for efficient self-supervised learning with bi-directional Transformers. Our approach is motivated by recent studies demonstrating that self-attention patterns in trained models contain a majority of non-linguistic regularities. We propose a computationally efficient auxiliary loss function to guide attention heads to conform to such patterns. Our method is agnostic to the actual pre-training objective and results in faster convergence of models as well as better performance on downstream tasks compared to the baselines, achieving state of the art results in low-resource settings. Surprisingly, we also find that linguistic properties of attention heads are not necessarily correlated with language modeling performance. + 2020.findings-emnlp.419 + 2020.findings-emnlp.419.OptionalSupplementaryMaterial.zip + + + Language-Conditioned Feature Pyramids for Visual Selection Tasks + TaichiIki + AkikoAizawa + 4687–4697 + Referring expression comprehension, which is the ability to locate language to an object in an image, plays an important role in creating common ground. Many models that fuse visual and linguistic features have been proposed. However, few models consider the fusion of linguistic features with multiple visual features with different sizes of receptive fields, though the proper size of the receptive field of visual features intuitively varies depending on expressions. In this paper, we introduce a neural network architecture that modulates visual features with varying sizes of receptive field by linguistic features. 
We evaluate our architecture on tasks related to referring expression comprehension in two visual dialogue games. The results show the advantages and broad applicability of our architecture. Source code is available at https://github.com/Alab-NII/lcfp . + 2020.findings-emnlp.420 + + + Learning to Classify Human Needs of Events from Category Definitions with Prototypical Instantiation + HaiboDing + ZheFeng + 4698–4704 + We study the problem of learning an event classifier from human needs category descriptions, which is challenging due to: (1) the use of highly abstract concepts in natural language descriptions, (2) the difficulty of choosing key concepts. To tackle these two challenges, we propose LeaPI, a zero-shot learning method that first automatically generate weak labels by instantiating high-level concepts with prototypical instances and then trains a human needs classifier with the weakly labeled data. To filter noisy concepts, we design a reinforced selection algorithm to choose high-quality concepts for instantiation. Experimental results on the human needs categorization task show that our method outperforms baseline methods, producing substantially better precision. + 2020.findings-emnlp.421 + 2020.findings-emnlp.421.OptionalSupplementaryMaterial.pdf + + + Automatic Term Name Generation for Gene Ontology: Task and Dataset + YanjianZhang + QinChen + YitengZhang + ZhongyuWei + YixuGao + JiajiePeng + ZengfengHuang + WeijianSun + XuanjingHuang + 4705–4710 + Terms contained in Gene Ontology (GO) have been widely used in biology and bio-medicine. Most previous research focuses on inferring new GO terms, while the term names that reflect the gene function are still named by the experts. To fill this gap, we propose a novel task, namely term name generation for GO, and build a large-scale benchmark dataset. 
Furthermore, we present a graph-based generative model that incorporates the relations between genes, words and terms for term name generation, which exhibits great advantages over the strong baselines. + 2020.findings-emnlp.422 + + + Compressing Transformer-Based Semantic Parsing Models using Compositional Code Embeddings + PrafullPrakash + Saurabh KumarShashidhar + WenlongZhao + SubendhuRongali + HaidarKhan + MichaelKayser + 4711–4717 + The current state-of-the-art task-oriented semantic parsing models use BERT or RoBERTa as pretrained encoders; these models have huge memory footprints. This poses a challenge to their deployment for voice assistants such as Amazon Alexa and Google Assistant on edge devices with limited memory budgets. We propose to learn compositional code embeddings to greatly reduce the sizes of BERT-base and RoBERTa-base. We also apply the technique to DistilBERT, ALBERT-base, and ALBERT-large, three already compressed BERT variants which attain similar state-of-the-art performances on semantic parsing with much smaller model sizes. We observe 95.15%–98.46% embedding compression rates and 20.47%–34.22% encoder compression rates, while preserving >97.5% semantic parsing performances. We provide the recipe for training and analyze the trade-off between code embedding sizes and downstream performances. + 2020.findings-emnlp.423 + 2020.findings-emnlp.423.OptionalSupplementaryMaterial.zip + + + <fixed-case>BERT</fixed-case>-<fixed-case>QE</fixed-case>: Contextualized Query Expansion for Document Re-ranking + ZhiZheng + KaiHui + BenHe + XianpeiHan + LeSun + AndrewYates + 4718–4728 + Query expansion aims to mitigate the mismatch between the language used in a query and in a document. However, query expansion methods can suffer from introducing non-relevant information when expanding the query.
To bridge this gap, inspired by recent advances in applying contextualized models like BERT to the document retrieval task, this paper proposes a novel query expansion model that leverages the strength of the BERT model to select relevant document chunks for expansion. In evaluation on the standard TREC Robust04 and GOV2 test collections, the proposed BERT-QE model significantly outperforms BERT-Large models. + 2020.findings-emnlp.424 + + + <fixed-case>ZEN</fixed-case>: Pre-training <fixed-case>C</fixed-case>hinese Text Encoder Enhanced by N-gram Representations + ShizheDiao + JiaxinBai + YanSong + TongZhang + YonggangWang + 4729–4740 + The pre-training of text encoders normally processes text as a sequence of tokens corresponding to small text units, such as word pieces in English and characters in Chinese. It omits information carried by larger text granularity, and thus the encoders cannot easily adapt to certain combinations of characters. This leads to a loss of important semantic information, which is especially problematic for Chinese because the language does not have explicit word boundaries. In this paper, we propose ZEN, a BERT-based Chinese text encoder enhanced by n-gram representations, where different combinations of characters are considered during training, thus potential word or phrase boundaries are explicitly pre-trained and fine-tuned with the character encoder (BERT). Therefore ZEN incorporates the comprehensive information of both the character sequence and words or phrases it contains. Experimental results illustrated the effectiveness of ZEN on a series of Chinese NLP tasks, where state-of-the-art results is achieved on most tasks with requiring less resource than other published encoders. It is also shown that reasonable performance is obtained when ZEN is trained on a small corpus, which is important for applying pre-training techniques to scenarios with limited data. 
The code and pre-trained models of ZEN are available at https://github.com/sinovation/ZEN. + 2020.findings-emnlp.425 + + + Finding <fixed-case>F</fixed-case>riends and Flipping Frenemies: Automatic Paraphrase Dataset Augmentation Using Graph Theory + HannahChen + YangfengJi + DavidEvans + 4741–4751 + Most NLP datasets are manually labeled, so suffer from inconsistent labeling or limited size. We propose methods for automatically improving datasets by viewing them as graphs with expected semantic properties. We construct a paraphrase graph from the provided sentence pair labels, and create an augmented dataset by directly inferring labels from the original sentence pairs using a transitivity property. We use structural balance theory to identify likely mislabelings in the graph, and flip their labels. We evaluate our methods on paraphrase models trained using these datasets starting from a pretrained BERT model, and find that the automatically-enhanced training sets result in more accurate models. + 2020.findings-emnlp.426 + + + Probabilistic Case-based Reasoning in Knowledge Bases + RajarshiDas + AmeyaGodbole + NicholasMonath + ManzilZaheer + AndrewMcCallum + 4752–4765 + A case-based reasoning (CBR) system solves a new problem by retrieving ‘cases’ that are similar to the given problem. If such a system can achieve high accuracy, it is appealing owing to its simplicity, interpretability, and scalability. In this paper, we demonstrate that such a system is achievable for reasoning in knowledge-bases (KBs). Our approach predicts attributes for an entity by gathering reasoning paths from similar entities in the KB. Our probabilistic model estimates the likelihood that a path is effective at answering a query about the given entity. The parameters of our model can be efficiently computed using simple path statistics and require no iterative optimization. Our model is non-parametric, growing dynamically as new entities and relations are added to the KB. 
On several benchmark datasets our approach significantly outperforms other rule learning approaches and performs comparably to state-of-the-art embedding-based approaches. Furthermore, we demonstrate the effectiveness of our model in an “open-world” setting where new entities arrive in an online fashion, significantly outperforming state-of-the-art approaches and nearly matching the best offline method. + 2020.findings-emnlp.427 + + + <fixed-case>TLDR</fixed-case>: Extreme Summarization of Scientific Documents + IsabelCachola + KyleLo + ArmanCohan + DanielWeld + 4766–4777 + We introduce TLDR generation, a new form of extreme summarization, for scientific papers. TLDR generation involves high source compression and requires expert background knowledge and understanding of complex domain-specific language. To facilitate study on this task, we introduce SCITLDR, a new multi-target dataset of 5.4K TLDRs over 3.2K papers. SCITLDR contains both author-written and expert-derived TLDRs, where the latter are collected using a novel annotation protocol that produces high-quality summaries while minimizing annotation burden. We propose CATTS, a simple yet effective learning strategy for generating TLDRs that exploits titles as an auxiliary training signal. CATTS improves upon strong baselines under both automated metrics and human evaluations. Data and code are publicly available at https://github.com/allenai/scitldr. + 2020.findings-emnlp.428 + 2020.findings-emnlp.428.OptionalSupplementaryMaterial.zip + + + Tri-Train: Automatic Pre-Fine Tuning between Pre-Training and Fine-Tuning for <fixed-case>S</fixed-case>ci<fixed-case>NER</fixed-case> + QingkaiZeng + WenhaoYu + MengxiaYu + TianwenJiang + TimWeninger + MengJiang + 4778–4787 + The training process of scientific NER models is commonly performed in two steps: i) Pre-training a language model by self-supervised tasks on huge data and ii) fine-tune training with small labelled data. 
The success of the strategy depends on the relevance between the data domains and between the tasks. However, gaps are found in practice when the target domains are specific and small. We propose a novel framework to introduce a “pre-fine tuning” step between pre-training and fine-tuning. It constructs a corpus by selecting sentences from unlabeled documents that are the most relevant with the labelled training data. Instead of predicting tokens in random spans, the pre-fine tuning task is to predict tokens in entity candidates identified by text mining methods. Pre-fine tuning is automatic and light-weight because the corpus size can be much smaller than pre-training data to achieve a better performance. Experiments on seven benchmarks demonstrate the effectiveness. + 2020.findings-emnlp.429 + 2020.findings-emnlp.429.OptionalSupplementaryMaterial.zip + + + Hierarchical Region Learning for Nested Named Entity Recognition + XinweiLong + ShuziNiu + YuchengLi + 4788–4793 + Named Entity Recognition (NER) is deeply explored and widely used in various tasks. Usually, some entity mentions are nested in other entities, which leads to the nested NER problem. Leading region based models face both the efficiency and effectiveness challenge due to the high subsequence enumeration complexity. To tackle these challenges, we propose a hierarchical region learning framework to automatically generate a tree hierarchy of candidate regions with nearly linear complexity and incorporate structure information into the region representation for better classification. Experiments on benchmark datasets ACE-2005, GENIA and JNLPBA demonstrate competitive or better results than state-of-the-art baselines. + 2020.findings-emnlp.430 + + + Understanding User Resistance Strategies in Persuasive Conversations + YouzhiTian + WeiyanShi + ChenLi + ZhouYu + 4794–4798 + Persuasive dialog systems have various usages, such as donation persuasion and physical exercise persuasion. 
Previous persuasive dialog systems research mostly focused on analyzing the persuader’s strategies and paid little attention to the persuadee (user). However, understanding and addressing users’ resistance strategies is an essential job of a persuasive dialog system. So, we adopt a preliminary framework on persuasion resistance in psychology and design a fine-grained resistance strategy annotation scheme. We annotate the PersuasionForGood dataset with the scheme. With the enriched annotations, we build a classifier to predict the resistance strategies. Furthermore, we analyze the relationships between persuasion strategies and persuasion resistance strategies. Our work lays the ground for developing a persuasive dialogue system that can understand and address user resistance strategy appropriately. The code and data will be released. + 2020.findings-emnlp.431 + + + On the Sub-Layer Functionalities of Transformer Decoder + YilinYang + LongyueWang + ShumingShi + PrasadTadepalli + StefanLee + ZhaopengTu + 4799–4811 + There have been significant efforts to interpret the encoder of Transformer-based encoder-decoder architectures for neural machine translation (NMT); meanwhile, the decoder remains largely unexamined despite its critical role. During translation, the decoder must predict output tokens by considering both the source-language text from the encoder and the target-language prefix produced in previous steps. In this work, we study how Transformer-based decoders leverage information from the source and target languages – developing a universal probe task to assess how information is propagated through each module of each decoder layer. We perform extensive experiments on three major translation datasets (WMT En-De, En-Fr, and En-Zh). Our analysis provides insight on when and where decoders leverage different sources. 
Based on these insights, we demonstrate that the residual feed-forward module in each Transformer decoder layer can be dropped with minimal loss of performance – a significant reduction in computation and number of parameters, and consequently a significant boost to both training and inference speed. + 2020.findings-emnlp.432 + + + Extremely Low Bit Transformer Quantization for On-Device Neural Machine Translation + InsooChung + ByeongwookKim + YoonjungChoi + Se JungKwon + YongkweonJeon + BaeseongPark + SanghaKim + DongsooLee + 4812–4826 + The deployment of widely used Transformer architecture is challenging because of heavy computation load and memory overhead during inference, especially when the target device is limited in computational resources such as mobile or edge devices. Quantization is an effective technique to address such challenges. Our analysis shows that for a given number of quantization bits, each block of Transformer contributes to translation quality and inference computations in different manners. Moreover, even inside an embedding block, each word presents vastly different contributions. Correspondingly, we propose a mixed precision quantization strategy to represent Transformer weights by an extremely low number of bits (e.g., under 3 bits). For example, for each word in an embedding block, we assign different quantization bits based on statistical property. Our quantized Transformer model achieves 11.8× smaller model size than the baseline model, with less than -0.5 BLEU. We achieve 8.3× reduction in run-time memory footprints and 3.5× speed up (Galaxy N10+) such that our proposed compression strategy enables efficient implementation for on-device NMT. + 2020.findings-emnlp.433 + + + Robust Backed-off Estimation of Out-of-Vocabulary Embeddings + NobukazuFukuda + NaokiYoshinaga + MasaruKitsuregawa + 4827–4838 + Out-of-vocabulary (oov) words cause serious troubles in solving natural language tasks with a neural network. 
Existing approaches to this problem resort to using subwords, which are shorter and more ambiguous units than words, in order to represent oov words with a bag of subwords. In this study, inspired by the processes for creating words from known words, we propose a robust method of estimating oov word embeddings by referring to pre-trained word embeddings for known words with similar surfaces to target oov words. We collect known words by segmenting oov words and by approximate string matching, and we then aggregate their pre-trained embeddings. Experimental results show that the obtained oov word embeddings improve not only word similarity tasks but also downstream tasks in Twitter and biomedical domains where oov words often appear, even when the computed oov embeddings are integrated into a bert-based strong baseline. + 2020.findings-emnlp.434 + + + Exploiting Unsupervised Data for Emotion Recognition in Conversations + WenxiangJiao + MichaelLyu + IrwinKing + 4839–4846 + Emotion Recognition in Conversations (ERC) aims to predict the emotional state of speakers in conversations, which is essentially a text classification task. Unlike the sentence-level text classification problem, the available supervised data for the ERC task is limited, which potentially prevents the models from playing their maximum effect. In this paper, we propose a novel approach to leverage unsupervised conversation data, which is more accessible. Specifically, we propose the Conversation Completion (ConvCom) task, which attempts to select the correct answer from candidate answers to fill a masked utterance in a conversation. Then, we Pre-train a basic COntext-Dependent Encoder (Pre-CODE) on the ConvCom task. Finally, we fine-tune the Pre-CODE on the datasets of ERC. Experimental results demonstrate that pre-training on unsupervised data achieves significant improvement of performance on the ERC datasets, particularly on the minority emotion classes. 
+ 2020.findings-emnlp.435 + + + Tensorized Embedding Layers + OleksiiHrinchuk + ValentinKhrulkov + LeylaMirvakhabova + ElenaOrlova + IvanOseledets + 4847–4860 + The embedding layers transforming input words into real vectors are the key components of deep neural networks used in natural language processing. However, when the vocabulary is large, the corresponding weight matrices can be enormous, which precludes their deployment in a limited resource setting. We introduce a novel way of parameterizing embedding layers based on the Tensor Train decomposition, which allows compressing the model significantly at the cost of a negligible drop or even a slight gain in performance. We evaluate our method on a wide range of benchmarks in natural language processing and analyze the trade-off between performance and compression ratios for a wide range of architectures, from MLPs to LSTMs and Transformers. + 2020.findings-emnlp.436 + + + Speaker or Listener? The Role of a Dialogue Agent + YafeiLiu + HongjinQian + HengpengXu + JinmaoWei + 4861–4869 + For decades, chitchat bots are designed as a listener to passively answer what people ask. This passive and relatively simple dialogue mechanism gains less attention from humans and consumes the interests of human beings rapidly. Therefore some recent researches attempt to endow the bots with proactivity through external knowledge to transform the role from a listener to a speaker with a hypothesis that the speaker expresses more just like a knowledge disseminator. However, along with the proactive manner introduced into a dialogue agent, an issue arises that, with too many knowledge facts to express, the agent starts to talk endlessly, and even completely ignores what the other expresses in dialogue sometimes, which greatly harms the interest of the other chatter to continue the conversation. To this end, we propose a novel model named Initiative-Imitate to interact with adaptive initiative throughout a dialogue.
It forces the agent to express in parallel with the appropriate role during the whole conversation. The corresponding experiments show the proposed Initiative-Imitate obtains competitive results both on the automatic and manual metrics. And the fluency and engagement of the chatbot have also been improved significantly. Besides, the case study indicates the Initiative-Imitate can constantly transfer to appropriate role timely and response more properly during the whole continuous conversation. + 2020.findings-emnlp.437 + + + Bridging Textual and Tabular Data for Cross-Domain Text-to-<fixed-case>SQL</fixed-case> Semantic Parsing + Xi VictoriaLin + RichardSocher + CaimingXiong + 4870–4888 + We present BRIDGE, a powerful sequential architecture for modeling dependencies between natural language questions and relational databases in cross-DB semantic parsing. BRIDGE represents the question and DB schema in a tagged sequence where a subset of the fields are augmented with cell values mentioned in the question. The hybrid sequence is encoded by BERT with minimal subsequent layers and the text-DB contextualization is realized via the fine-tuned deep attention in BERT. Combined with a pointer-generator decoder with schema-consistency driven search space pruning, BRIDGE attained state-of-the-art performance on the well-studied Spider benchmark (65.5% dev, 59.2% test), despite being much simpler than most recently proposed models for this task. Our analysis shows that BRIDGE effectively captures the desired cross-modal dependencies and has the potential to generalize to more text-DB related tasks. Our model implementation is available at https://github.com/salesforce/TabularSemanticParsing. + 2020.findings-emnlp.438 + + + Do Language Embeddings capture Scales? + XikunZhang + DeepakRamachandran + IanTenney + YanaiElazar + DanRoth + 4889–4896 + Pretrained Language Models (LMs) have been shown to possess significant linguistic, common sense and factual knowledge.
One form of knowledge that has not been studied yet in this context is information about the scalar magnitudes of objects. We show that pretrained language models capture a significant amount of this information but are short of the capability required for general common-sense reasoning. We identify contextual information in pre-training and numeracy as two key factors affecting their performance, and show that a simple method of canonicalizing numbers can have a significant effect on the results. + 2020.findings-emnlp.439 + + + Paraphrasing vs Coreferring: Two Sides of the Same Coin + YehuditMeged + AviCaciularu + VeredShwartz + IdoDagan + 4897–4907 + We study the potential synergy between two different NLP tasks, both confronting predicate lexical variability: identifying predicate paraphrases, and event coreference resolution. First, we used annotations from an event coreference dataset as distant supervision to re-score heuristically-extracted predicate paraphrases. The new scoring gained more than 18 points in average precision upon their ranking by the original scoring method. Then, we used the same re-ranking features as additional inputs to a state-of-the-art event coreference resolution model, which yielded modest but consistent improvements to the model’s performance. The results suggest a promising direction to leverage data and models for each of the tasks to the benefit of the other. + 2020.findings-emnlp.440 + + + Active Sentence Learning by Adversarial Uncertainty Sampling in Discrete Space + DongyuRu + JiangtaoFeng + LinQiu + HaoZhou + MingxuanWang + WeinanZhang + YongYu + LeiLi + 4908–4917 + Active learning for sentence understanding aims at discovering informative unlabeled data for annotation and therefore reducing the demand for labeled data. We argue that the typical uncertainty sampling method for active learning is time-consuming and can hardly work in real-time, which may lead to ineffective sample selection. 
We propose adversarial uncertainty sampling in discrete space (AUSDS) to retrieve informative unlabeled samples more efficiently. AUSDS maps sentences into latent space generated by the popular pre-trained language models, and discover informative unlabeled text samples for annotation via adversarial attack. The proposed approach is extremely efficient compared with traditional uncertainty sampling with more than 10x speedup. Experimental results on five datasets show that AUSDS outperforms strong baselines on effectiveness. + 2020.findings-emnlp.441 + + + Coming to Terms: Automatic Formation of Neologisms in <fixed-case>H</fixed-case>ebrew + MoranMizrahi + StavYardeni Seelig + DafnaShahaf + 4918–4929 + Spoken languages are ever-changing, with new words entering them all the time. However, coming up with new words (neologisms) today relies exclusively on human creativity. In this paper we propose a system to automatically suggest neologisms. We focus on the Hebrew language as a test case due to the unusual regularity of its noun formation. User studies comparing our algorithm to experts and non-experts demonstrate that our algorithm is capable of generating high-quality outputs, as well as enhance human creativity. More broadly, we seek to inspire more computational work around the topic of linguistic creativity, which we believe offers numerous unexplored opportunities. + 2020.findings-emnlp.442 + + + Dual Inference for Improving Language Understanding and Generation + Shang-YuSu + Yung-SungChuang + Yun-NungChen + 4930–4936 + Natural language understanding (NLU) and Natural language generation (NLG) tasks hold a strong dual relationship, where NLU aims at predicting semantic labels based on natural language utterances and NLG does the opposite. The prior work mainly focused on exploiting the duality in model training in order to obtain the models with better performance. 
However, regarding the fast-growing scale of models in the current NLP area, sometimes we may have difficulty retraining whole NLU and NLG models. To better address the issue, this paper proposes to leverage the duality in the inference stage without the need of retraining. The experiments on three benchmark datasets demonstrate the effectiveness of the proposed method in both NLU and NLG, providing the great potential of practical usage. + 2020.findings-emnlp.443 + + + Joint Intent Detection and Entity Linking on Spatial Domain Queries + LeiZhang + RunzeWang + JingboZhou + JingsongYu + ZhenhuaLing + HuiXiong + 4937–4947 + Continuous efforts have been devoted to language understanding (LU) for conversational queries with the fast and wide-spread popularity of voice assistants. In this paper, we first study the LU problem in the spatial domain, which is a critical problem for providing location-based services by voice assistants but is without in-depth investigation in existing studies. Spatial domain queries have several unique properties making them be more challenging for language understanding than common conversational queries, including lexical-similar but diverse intents and highly ambiguous words. Thus, a special tailored LU framework for spatial domain queries is necessary. To the end, a dataset was extracted and annotated based on the real-life queries from a voice assistant service. We then proposed a new multi-task framework that jointly learns the intent detection and entity linking tasks on the with invented hierarchical intent detection method and triple-scoring mechanism for entity linking. A specially designed spatial GCN is also utilized to model spatial context information among entities. We have conducted extensive experimental evaluations with state-of-the-art entity linking and intent detection methods, which demonstrated that can outperform all baselines with a significant margin. 
+ 2020.findings-emnlp.444 + 2020.findings-emnlp.444.OptionalSupplementaryMaterial.pdf + + + i<fixed-case>NLPS</fixed-case>uite: Monolingual Corpora, Evaluation Benchmarks and Pre-trained Multilingual Language Models for <fixed-case>I</fixed-case>ndian Languages + DivyanshuKakwani + AnoopKunchukuttan + SatishGolla + GokulN.C. + AvikBhattacharyya + Mitesh M.Khapra + PratyushKumar + 4948–4961 + In this paper, we introduce NLP resources for 11 major Indian languages from two major language families. These resources include: (a) large-scale sentence-level monolingual corpora, (b) pre-trained word embeddings, (c) pre-trained language models, and (d) multiple NLU evaluation datasets (IndicGLUE benchmark). The monolingual corpora contain a total of 8.8 billion tokens across all 11 languages and Indian English, primarily sourced from news crawls. The word embeddings are based on FastText, hence suitable for handling morphological complexity of Indian languages. The pre-trained language models are based on the compact ALBERT model. Lastly, we compile the IndicGLUE benchmark for Indian language NLU. To this end, we create datasets for the following tasks: Article Genre Classification, Headline Prediction, Wikipedia Section-Title Prediction, Cloze-style Multiple choice QA, Winograd NLI and COPA. We also include publicly available datasets for some Indic languages for tasks like Named Entity Recognition, Cross-lingual Sentence Retrieval, Paraphrase detection, etc. Our embeddings are competitive or better than existing pre-trained embeddings on multiple tasks. We hope that the availability of the dataset will accelerate Indic NLP research which has the potential to impact more than a billion people. It can also help the community in evaluating advances in NLP over a more diverse pool of languages. The data and models are available at https://indicnlp.ai4bharat.org.
+ 2020.findings-emnlp.445 + + + Weakly-Supervised Modeling of Contextualized Event Embedding for Discourse Relations + I-TaLee + Maria LeonorPacheco + DanGoldwasser + 4962–4972 + Representing, and reasoning over, long narratives requires models that can deal with complex event structures connected through multiple relationship types. This paper suggests to represent this type of information as a narrative graph and learn contextualized event representations over it using a relational graph neural network model. We train our model to capture event relations, derived from the Penn Discourse Tree Bank, on a huge corpus, and show that our multi-relational contextualized event representation can improve performance when learning script knowledge without direct supervision and provide a better representation for the implicit discourse sense classification task. + 2020.findings-emnlp.446 + + + Enhancing Generalization in Natural Language Inference by Syntax + QiHe + HanWang + YueZhang + 4973–4978 + Pre-trained language models such as BERT have achieved the state-of-the-art performance on natural language inference (NLI). However, it has been shown that such models can be tricked by variations of surface patterns such as syntax. We investigate the use of dependency trees to enhance the generalization of BERT in the NLI task, leveraging on a graph convolutional network to represent a syntax-based matching graph with heterogeneous matching patterns. Experimental results show that, our syntax-based method largely enhance generalization of BERT on a test set where the sentence pair has high lexical overlap but diverse syntactic structures, and do not degrade performance on the standard test set. In other words, the proposed method makes BERT more robust on syntactic changes. + 2020.findings-emnlp.447 + +
+
diff --git a/data/xml/2020.insights.xml b/data/xml/2020.insights.xml new file mode 100644 index 0000000000..4cf67c7d3a --- /dev/null +++ b/data/xml/2020.insights.xml @@ -0,0 +1,193 @@ + + + + + Proceedings of the First Workshop on Insights from Negative Results in NLP + AnnaRogers + JoãoSedoc + AnnaRumshisky + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.insights-1.0 + + + Domain adaptation challenges of <fixed-case>BERT</fixed-case> in tokenization and sub-word representations of Out-of-Vocabulary words + AnmolNayak + HariprasadTimmapathini + KarthikeyanPonnalagu + VijendranGopalan Venkoparao + 1–5 + BERT model (Devlin et al., 2019) has achieved significant progress in several Natural Language Processing (NLP) tasks by leveraging the multi-head self-attention mechanism (Vaswani et al., 2017) in its architecture. However, it still has several research challenges which are not tackled well for domain specific corpus found in industries. In this paper, we have highlighted these problems through detailed experiments involving analysis of the attention scores and dynamic word embeddings with the BERT-Base-Uncased model. Our experiments have led to interesting findings that showed: 1) Largest substring from the left that is found in the vocabulary (in-vocab) is always chosen at every sub-word unit that can lead to suboptimal tokenization choices, 2) Semantic meaning of a vocabulary word deteriorates when found as a substring in an Out-Of-Vocabulary (OOV) word, and 3) Minor misspellings in words are inadequately handled. We believe that if these challenges are tackled, it will significantly help the domain adaptation aspect of BERT. + 2020.insights-1.1 + + + <fixed-case>Q</fixed-case>. Can Knowledge Graphs be used to Answer <fixed-case>B</fixed-case>oolean Questions? <fixed-case>A</fixed-case>. It’s complicated! + DariaDzendzik + CarlVogel + JenniferFoster + 6–14 + In this paper we explore the problem of machine reading comprehension, focusing on the BoolQ dataset of Yes/No questions. We carry out an error analysis of a BERT-based machine reading comprehension model on this dataset, revealing issues such as unstable model behaviour and some noise within the dataset itself.
We then experiment with two approaches for integrating information from knowledge graphs: (i) concatenating knowledge graph triples to text passages and (ii) encoding knowledge with a Graph Neural Network. Neither of these approaches shows a clear improvement and we hypothesize that this may be due to a combination of inaccuracies in the knowledge graph, imprecision in entity linking, and the models’ inability to capture additional information from knowledge graphs. + 2020.insights-1.2 + + + How Far Can We Go with Data Selection? A Case Study on Semantic Sequence Tagging Tasks + SamuelLouvan + BernardoMagnini + 15–21 + Although several works have addressed the role of data selection to improve transfer learning for various NLP tasks, there is no consensus about its real benefits and, more generally, there is a lack of shared practices on how it can be best applied. We propose a systematic approach aimed at evaluating data selection in scenarios of increasing complexity. Specifically, we compare the case in which source and target tasks are the same while source and target domains are different, against the more challenging scenario where both tasks and domains are different. We run a number of experiments on semantic sequence tagging tasks, which are relatively less investigated in data selection, and conclude that data selection has more benefit on the scenario when the tasks are the same, while in case of different (although related) tasks from distant domains, a combination of data selection and multi-task learning is ineffective for most cases.
+ 2020.insights-1.3 + 2020.insights-1.3.OptionalSupplementaryMaterial.zip + + + Evaluating the Effectiveness of Efficient Neural Architecture Search for Sentence-Pair Tasks + AnselMacLaughlin + JwalaDhamala + AnoopKumar + SriramVenkatapathy + RagavVenkatesan + RahulGupta + 22–31 + Neural Architecture Search (NAS) methods, which automatically learn entire neural model or individual neural cell architectures, have recently achieved competitive or state-of-the-art (SOTA) performance on variety of natural language processing and computer vision tasks, including language modeling, natural language inference, and image classification. In this work, we explore the applicability of a SOTA NAS algorithm, Efficient Neural Architecture Search (ENAS) (Pham et al., 2018) to two sentence pair tasks, paraphrase detection and semantic textual similarity. We use ENAS to perform a micro-level search and learn a task-optimized RNN cell architecture as a drop-in replacement for an LSTM. We explore the effectiveness of ENAS through experiments on three datasets (MRPC, SICK, STS-B), with two different models (ESIM, BiLSTM-Max), and two sets of embeddings (Glove, BERT). In contrast to prior work applying ENAS to NLP tasks, our results are mixed – we find that ENAS architectures sometimes, but not always, outperform LSTMs and perform similarly to random architecture search. + 2020.insights-1.4 + + + Which Matters Most? Comparing the Impact of Concept and Document Relationships in Topic Models + SilviaTerragni + DeboraNozza + ElisabettaFersini + MessinaEnza + 32–40 + Topic models have been widely used to discover hidden topics in a collection of documents. In this paper, we propose to investigate the role of two different types of relational information, i.e. document relationships and concept relationships. 
While exploiting the document network significantly improves topic coherence, the introduction of concepts and their relationships does not influence the results both quantitatively and qualitatively. + 2020.insights-1.5 + 2020.insights-1.5.OptionalSupplementaryMaterial.zip + + + On Task-Level Dialogue Composition of Generative Transformer Model + PrasannaParthasarathi + SharanNarang + ArvindNeelakantan + 41–47 + Task-oriented dialogue systems help users accomplish tasks such as booking a movie ticket and ordering food via conversation. Generative models parameterized by a deep neural network are widely used for next turn response generation in such systems. It is natural for users of the system to want to accomplish multiple tasks within the same conversation, but the ability of generative models to compose multiple tasks is not well studied. In this work, we begin by studying the effect of training human-human task-oriented dialogues towards improving the ability to compose multiple tasks on Transformer generative models. To that end, we propose and explore two solutions: (1) creating synthetic multiple task dialogue data for training from human-human single task dialogue and (2) forcing the encoder representation to be invariant to single and multiple task dialogues using an auxiliary loss. The results from our experiments highlight the difficulty of even the sophisticated variant of transformer model in learning to compose multiple tasks from single task dialogues. + 2020.insights-1.6 + 2020.insights-1.6.OptionalSupplementaryMaterial.pdf + + + How Effectively Can Machines Defend Against Machine-Generated Fake News? An Empirical Study + Meghana MoorthyBhat + SrinivasanParthasarathy + 48–53 + We empirically study the effectiveness of machine-generated fake news detectors by understanding the model’s sensitivity to different synthetic perturbations during test time. 
The current machine-generated fake news detectors rely on provenance to determine the veracity of news. Our experiments find that the success of these detectors can be limited since they are rarely sensitive to semantic perturbations and are very sensitive to syntactic perturbations. Also, we would like to open-source our code and believe it could be a useful diagnostic tool for evaluating models aimed at fighting machine-generated fake news. + 2020.insights-1.7 + + + Label Propagation-Based Semi-Supervised Learning for Hate Speech Classification + Ashwin GeetD’Sa + IrinaIllina + DominiqueFohr + DietrichKlakow + DanaRuiter + 54–59 + Research on hate speech classification has received increased attention. In real-life scenarios, a small amount of labeled hate speech data is available to train a reliable classifier. Semi-supervised learning takes advantage of a small amount of labeled data and a large amount of unlabeled data. In this paper, label propagation-based semi-supervised learning is explored for the task of hate speech classification. The quality of labeling the unlabeled set depends on the input representations. In this work, we show that pre-trained representations are label agnostic, and when used with label propagation yield poor results. Neural network-based fine-tuning can be adopted to learn task-specific representations using a small amount of labeled data. We show that fully fine-tuned representations may not always be the best representations for the label propagation and intermediate representations may perform better in a semi-supervised setup. + 2020.insights-1.8 + + + Layout-Aware Text Representations Harm Clustering Documents by Type + CatherineFinegan-Dollak + AshishVerma + 60–65 + Clustering documents by type—grouping invoices with invoices and articles with articles—is a desirable first step for organizing large collections of document scans. 
 Humans approaching this task use both the semantics of the text and the document layout to assist in grouping like documents. LayoutLM (Xu et al., 2019), a layout-aware transformer built on top of BERT with state-of-the-art performance on document-type classification, could reasonably be expected to outperform regular BERT (Devlin et al., 2018) for document-type clustering. However, we find experimentally that BERT significantly outperforms LayoutLM on this task (p <0.001). We analyze clusters to show where layout awareness is an asset and where it is a liability. + 2020.insights-1.9 + + + An Analysis of Capsule Networks for Part of Speech Tagging in High- and Low-resource Scenarios + AndrewZupon + FaizRafique + MihaiSurdeanu + 66–70 + Neural networks are a common tool in NLP, but it is not always clear which architecture to use for a given task. Different tasks, different languages, and different training conditions can all affect how a neural network will perform. Capsule Networks (CapsNets) are a relatively new architecture in NLP. Due to their novelty, CapsNets are being used more and more in NLP tasks. However, their usefulness is still mostly untested. In this paper, we compare three neural network architectures—LSTM, CNN, and CapsNet—on a part of speech tagging task. We compare these architectures in both high- and low-resource training conditions and find that no architecture consistently performs the best. Our analysis shows that our CapsNet performs nearly as well as a more complex LSTM under certain training conditions, but not others, and that our CapsNet almost always outperforms our CNN. We also find that our CapsNet implementation shows faster prediction times than the LSTM for Scottish Gaelic but not for Spanish, highlighting the effect that the choice of languages can have on the models. + 2020.insights-1.10 + + + Can Knowledge Graph Embeddings Tell Us What Fact-checked Claims Are About? 
+ ValentinaBeretta + SébastienHarispe + KatarinaBoland + LukeLo Seen + KonstantinTodorov + AndonTchechmedjiev + 71–75 + The web offers a wealth of discourse data that help researchers from various fields analyze debates about current societal issues and gauge the effects on society of important phenomena such as misinformation spread. Such analyses often revolve around claims made by people about a given topic of interest. Fact-checking portals offer partially structured information that can assist such analysis. However, exploiting the network structure of such online discourse data is as of yet under-explored. We study the effectiveness of using neural-graph embedding features for claim topic prediction and their complementarity with text embeddings. We show that graph embeddings are modestly complementary with text embeddings, but the low performance of graph embedding features alone indicate that the model fails to capture topological features pertinent of the topic prediction task. + 2020.insights-1.11 + + + Do Transformers Dream of Inference, or Can Pretrained Generative Models Learn Implicit Inferential Rules? + ZhengzhongLiang + MihaiSurdeanu + 76–81 + Large pretrained language models (LM) have been used successfully for multi-hop question answering. However, most of these directions are not interpretable, as they do not make the inference hops necessary to explain a candidate answer explicitly. In this work, we investigate the capability of a state-of-the-art transformer LM to generate explicit inference hops, i.e., to infer a new statement necessary to answer a question given some premise input statements. Our analysis shows that such LMs can generate new statements for some simple inference types, but performance remains poor for complex, real-world inference types such as those that require monotonicity, composition, and commonsense knowledge. 
+ 2020.insights-1.12 + 2020.insights-1.12.OptionalSupplementaryMaterial.zip + + + Counterfactually-Augmented <fixed-case>SNLI</fixed-case> Training Data Does Not Yield Better Generalization Than Unaugmented Data + WilliamHuang + HaokunLiu + Samuel R.Bowman + 82–87 + A growing body of work shows that models exploit annotation artifacts to achieve state-of-the-art performance on standard crowdsourced benchmarks—datasets collected from crowdworkers to create an evaluation task—while still failing on out-of-domain examples for the same task. Recent work has explored the use of counterfactually-augmented data—data built by minimally editing a set of seed examples to yield counterfactual labels—to augment training data associated with these benchmarks and build more robust classifiers that generalize better. However, Khashabi et al. (2020) find that this type of augmentation yields little benefit on reading comprehension tasks when controlling for dataset size and cost of collection. We build upon this work by using English natural language inference data to test model generalization and robustness and find that models trained on a counterfactually-augmented SNLI dataset do not generalize better than unaugmented datasets of similar size and that counterfactual augmentation can hurt performance, yielding models that are less robust to challenge examples. Counterfactual augmentation of natural language understanding data through standard crowdsourcing techniques does not appear to be an effective way of collecting training data and further innovation is required to make this general line of work viable. + 2020.insights-1.13 + + + <fixed-case>NMF</fixed-case> Ensembles? Not for Text Summarization! + AlkaKhurana + VasudhaBhatnagar + 88–93 + Non-negative Matrix Factorization (NMF) has been used for text analytics with promising results. Instability of results arising due to stochastic variations during initialization makes a case for use of ensemble technology. 
However, our extensive empirical investigation indicates otherwise. In this paper, we establish that ensemble summary for single document using NMF is no better than the best base model summary. + 2020.insights-1.14 + 2020.insights-1.14.OptionalSupplementaryMaterial.zip + + + If You Build Your Own <fixed-case>NER</fixed-case> Scorer, Non-replicable Results Will Come + ConstantineLignos + MarjanKamyab + 94–99 + We attempt to replicate a named entity recognition (NER) model implemented in a popular toolkit and discover that a critical barrier to doing so is the inconsistent evaluation of improper label sequences. We define these sequences and examine how two scorers differ in their handling of them, finding that one approach produces F1 scores approximately 0.5 points higher on the CoNLL 2003 English development and test sets. We propose best practices to increase the replicability of NER evaluations by increasing transparency regarding the handling of improper label sequences. + 2020.insights-1.15 + + + <fixed-case>HINT</fixed-case>3: Raising the bar for Intent Detection in the Wild + GauravArora + ChiragJain + ManasChaturvedi + KrupalModi + 100–105 + Intent Detection systems in the real world are exposed to complexities of imbalanced datasets containing varying perception of intent, unintended correlations and domain-specific aberrations. To facilitate benchmarking which can reflect near real-world scenarios, we introduce 3 new datasets created from live chatbots in diverse domains. Unlike most existing datasets that are crowdsourced, our datasets contain real user queries received by the chatbots and facilitates penalising unwanted correlations grasped during the training process. We evaluate 4 NLU platforms and a BERT based classifier and find that performance saturates at inadequate levels on test sets because all systems latch on to unintended patterns in training data. 
+ 2020.insights-1.16 + + + The Extraordinary Failure of Complement Coercion Crowdsourcing + YanaiElazar + VictoriaBasmov + ShauliRavfogel + YoavGoldberg + ReutTsarfaty + 106–116 + Crowdsourcing has eased and scaled up the collection of linguistic annotation in recent years. In this work, we follow known methodologies of collecting labeled data for the complement coercion phenomenon. These are constructions with an implied action — e.g., “I started a new book I bought last week”, where the implied action is reading. We aim to collect annotated data for this phenomenon by reducing it to either of two known tasks: Explicit Completion and Natural Language Inference. However, in both cases, crowdsourcing resulted in low agreement scores, even though we followed the same methodologies as in previous work. Why does the same process fail to yield high agreement scores? We specify our modeling schemes, highlight the differences with previous work and provide some insights about the task and possible explanations for the failure. We conclude that specific phenomena require tailored solutions, not only in specialized algorithms, but also in data collection methods. + 2020.insights-1.17 + + + Embedding Structured Dictionary Entries + StevenWilson + WalidMagdy + BarbaraMcGillivray + GarethTyson + 117–125 + Previous work has shown how to effectively use external resources such as dictionaries to improve English-language word embeddings, either by manipulating the training process or by applying post-hoc adjustments to the embedding space. We experiment with a multi-task learning approach for explicitly incorporating the structured elements of dictionary entries, such as user-assigned tags and usage examples, when learning embeddings for dictionary headwords. Our work generalizes several existing models for learning word embeddings from dictionaries. 
However, we find that the most effective representations overall are learned by simply training with a skip-gram objective over the concatenated text of all entries in the dictionary, giving no particular focus to the structure of the entries. + 2020.insights-1.18 + +
+
diff --git a/data/xml/2020.intexsempar.xml b/data/xml/2020.intexsempar.xml new file mode 100644 index 0000000000..d6d988d873 --- /dev/null +++ b/data/xml/2020.intexsempar.xml @@ -0,0 +1,76 @@ + + + + + Proceedings of the First Workshop on Interactive and Executable Semantic Parsing + BenBogin + SrinivasanIyer + VictoriaLin + DragomirRadev + AlaneSuhr + Panupong + CaimingXiong + PengchengYin + TaoYu + RuiZhang + VictorZhong + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.intexsempar-1.0 + + + <fixed-case>QA</fixed-case>2<fixed-case>E</fixed-case>xplanation: Generating and Evaluating Explanations for Question Answering Systems over Knowledge Graph + SaeedehShekarpour + AbhishekNadgeri + KuldeepSingh + 1–11 + In the era of Big Knowledge Graphs, Question Answering (QA) systems have reached a milestone in their performance and feasibility. However, their applicability, particularly in specific domains such as the biomedical domain, has not gained wide acceptance due to their “black box” nature, which hinders transparency, fairness, and accountability of QA systems. Therefore, users are unable to understand how and why particular questions have been answered, whereas some others fail. To address this challenge, in this paper, we develop an automatic approach for generating explanations during various stages of a pipeline-based QA system. Our approach is a supervised and automatic approach which considers three classes (i.e., success, no answer, and wrong answer) for annotating the output of involved QA components. Upon our prediction, a template explanation is chosen and integrated into the output of the corresponding component. To measure the effectiveness of the approach, we conducted a user survey as to how non-expert users perceive our generated explanations. The results of our study show a significant increase in the four dimensions of the human factor from the Human-computer interaction community. + 2020.intexsempar-1.1 + + + Uncertainty and Traffic-Aware Active Learning for Semantic Parsing + PriyankaSen + EmineYilmaz + 12–17 + Collecting training data for semantic parsing is a time-consuming and expensive task. As a result, there is growing interest in industry to reduce the number of annotations required to train a semantic parser, both to cut down on costs and to limit customer data handled by annotators. 
In this paper, we propose uncertainty and traffic-aware active learning, a novel active learning method that uses model confidence and utterance frequencies from customer traffic to select utterances for annotation. We show that our method significantly outperforms baselines on an internal customer dataset and the Facebook Task Oriented Parsing (TOP) dataset. On our internal dataset, our method achieves the same accuracy as random sampling with 2,000 fewer annotations. + 2020.intexsempar-1.2 + + + Improving Sequence-to-Sequence Semantic Parser for Task Oriented Dialog + ChaotingXuan + 18–22 + Task Oriented Parsing (TOP) attempts to map utterances to compositional requests, including multiple intents and their slots. Previous work focus on a tree-based hierarchical meaning representation, and applying constituency parsing techniques to address TOP. In this paper, we propose a new format of meaning representation that is more compact and amenable to sequence-to-sequence (seq-to-seq) models. A simple copy-augmented seq-to-seq parser is built and evaluated over a public TOP dataset, resulting in 3.44% improvement over prior best seq-to-seq parser (exact match accuracy), which is also comparable to constituency parsers’ performance. + 2020.intexsempar-1.3 + + + Learning Adaptive Language Interfaces through Decomposition + SiddharthKaramcheti + DorsaSadigh + PercyLiang + 23–33 + Our goal is to create an interactive natural language interface that efficiently and reliably learns from users to complete tasks in simulated robotics settings. We introduce a neural semantic parsing system that learns new high-level abstractions through decomposition: users interactively teach the system by breaking down high-level utterances describing novel behavior into low-level steps that it can understand. 
Unfortunately, existing methods either rely on grammars which parse sentences with limited flexibility, or neural sequence-to-sequence models that do not learn efficiently or reliably from individual examples. Our approach bridges this gap, demonstrating the flexibility of modern neural systems, as well as the one-shot reliable generalization of grammar-based methods. Our crowdsourced interactive experiments suggest that over time, users complete complex tasks more efficiently while using our system by leveraging what they just taught. At the same time, getting users to trust the system enough to be incentivized to teach high-level utterances is still an ongoing challenge. We end with a discussion of some of the obstacles we need to overcome to fully realize the potential of the interactive paradigm. + 2020.intexsempar-1.4 + + + <fixed-case>C</fixed-case>ollo<fixed-case>QL</fixed-case>: Robust Text-to-<fixed-case>SQL</fixed-case> Over Search Queries + KarthikRadhakrishnan + ArvindSrikantan + Xi VictoriaLin + 34–45 + Translating natural language utterances to executable queries is a helpful technique in making the vast amount of data stored in relational databases accessible to a wider range of non-tech-savvy end users. Prior work in this area has largely focused on textual input that is linguistically correct and semantically unambiguous. However, real-world user queries are often succinct, colloquial, and noisy, resembling the input of a search engine. In this work, we introduce data augmentation techniques and a sampling-based content-aware BERT model (ColloQL) to achieve robust text-to-SQL modeling over natural language search (NLS) questions. Due to the lack of evaluation data, we curate a new dataset of NLS questions and demonstrate the efficacy of our approach. 
ColloQL’s superior performance extends to well-formed text, achieving an 84.9% (logical) and 90.7% (execution) accuracy on the WikiSQL dataset, making it, to the best of our knowledge, the highest performing model that does not use execution guided decoding. + 2020.intexsempar-1.5 + + + Natural Language Response Generation from <fixed-case>SQL</fixed-case> with Generalization and Back-translation + SaptarashmiBandyopadhyay + TianyangZhao + 46–49 + Generation of natural language responses to the queries of structured language like SQL is very challenging as it requires generalization to new domains and the ability to answer ambiguous queries among other issues. We have participated in the CoSQL shared task organized in the IntEx-SemPar workshop at EMNLP 2020. We have trained a number of Neural Machine Translation (NMT) models to efficiently generate the natural language responses from SQL. Our shuffled back-translation model has led to a BLEU score of 7.47 on the unknown test dataset. In this paper, we will discuss our methodologies to approach the problem and future directions to improve the quality of the generated natural language responses. + 2020.intexsempar-1.6 + +
+
diff --git a/data/xml/2020.iwslt.xml b/data/xml/2020.iwslt.xml index 2095f98dde..583c0925f1 100644 --- a/data/xml/2020.iwslt.xml +++ b/data/xml/2020.iwslt.xml @@ -275,11 +275,11 @@
<fixed-case>ISTIC</fixed-case>’s Neural Machine Translation System for <fixed-case>IWSLT</fixed-case>’2020 - jiazewei - wenbinliu - zhenfengwu - youpan - yanqinghe + JiazeWei + WenbinLiu + ZhenfengWu + YouPan + YanqingHe 158–165 This paper introduces technical details of machine translation system of Institute of Scientific and Technical Information of China (ISTIC) for the 17th International Conference on Spoken Language Translation (IWSLT 2020). ISTIC participated in both translation tasks of the Open Domain Translation track: Japanese-to-Chinese MT task and Chinese-to-Japanese MT task. The paper mainly elaborates on the model framework, data preprocessing methods and decoding strategies adopted in our system. In addition, the system performance on the development set are given under different settings. 2020.iwslt-1.19 diff --git a/data/xml/2020.louhi.xml b/data/xml/2020.louhi.xml new file mode 100644 index 0000000000..a8ba044a82 --- /dev/null +++ b/data/xml/2020.louhi.xml @@ -0,0 +1,195 @@ + + + + + Proceedings of the 11th International Workshop on Health Text Mining and Information Analysis + EbenHolderness + AntonioJimeno Yepes + AlbertoLavelli + Anne-LyseMinard + JamesPustejovsky + FabioRinaldi + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.louhi-1.0 + + + The Impact of De-identification on Downstream Named Entity Recognition in Clinical Text + HannaBerg + AronHenriksson + HerculesDalianis + 1–11 + The impact of de-identification on data quality and, in particular, utility for developing models for downstream tasks has been more thoroughly studied for structured data than for unstructured text. While previous studies indicate that text de-identification has a limited impact on models for downstream tasks, it remains unclear what the impact is with various levels and forms of de-identification, in particular concerning the trade-off between precision and recall. In this paper, the impact of de-identification is studied on downstream named entity recognition in Swedish clinical text. The results indicate that de-identification models with moderate to high precision lead to similar downstream performance, while low precision has a substantial negative impact. Furthermore, different strategies for concealing sensitive information affect performance to different degrees, ranging from pseudonymisation having a low impact to the removal of entire sentences with sensitive information having a high impact. This study indicates that it is possible to increase the recall of models for identifying sensitive information without negatively affecting the use of de-identified text data for training models for clinical named entity recognition; however, there is ultimately a trade-off between the level of de-identification and the subsequent utility of the data. + 2020.louhi-1.1 + 2020.louhi-1.1.OptionalSupplementaryMaterial.zip + + + Simple Hierarchical Multi-Task Neural End-To-End Entity Linking for Biomedical Text + MaciejWiatrak + JuhaIso-Sipila + 12–17 + Recognising and linking entities is a crucial first step to many tasks in biomedical text analysis, such as relation extraction and target identification. 
 Traditionally, biomedical entity linking methods rely heavily on heuristic rules and predefined, often domain-specific features. The features try to capture the properties of entities and complex multi-step architectures to detect, and subsequently link entity mentions. We propose a significant simplification to the biomedical entity linking setup that does not rely on any heuristic methods. The system performs all the steps of the entity linking task jointly in either single or two stages. We explore the use of hierarchical multi-task learning, using mention recognition and entity typing tasks as auxiliary tasks. We show that hierarchical multi-task models consistently outperform single-task models when trained tasks are homogeneous. We evaluate the performance of our models on the biomedical entity linking benchmarks using MedMentions and BC5CDR datasets. We achieve state-of-the-art results on the challenging MedMentions dataset, and comparable results on BC5CDR. + 2020.louhi-1.2 + + + Medical Concept Normalization in User-Generated Texts by Learning Target Concept Embeddings + Katikapalli SubramanyamKalyan + SivanesanSangeetha + 18–23 + Medical concept normalization helps in discovering standard concepts in free-form text i.e., maps health-related mentions to standard concepts in a clinical knowledge base. It is much beyond simple string matching and requires a deep semantic understanding of concept mentions. Recent research approach concept normalization as either text classification or text similarity. The main drawback in existing a) text classification approach is ignoring valuable target concepts information in learning input concept mention representation b) text similarity approach is the need to separately generate target concept embeddings which is time and resource consuming. Our proposed model overcomes these drawbacks by jointly learning the representations of input concept mention and target concepts. 
First, we learn input concept mention representation using RoBERTa. Second, we find cosine similarity between embeddings of input concept mention and all the target concepts. Here, embeddings of target concepts are randomly initialized and then updated during training. Finally, the target concept with maximum cosine similarity is assigned to the input concept mention. Our model surpasses all the existing methods across three standard datasets by improving accuracy up to 2.31%. + 2020.louhi-1.3 + + + Not a cute stroke: Analysis of Rule- and Neural Network-based Information Extraction Systems for Brain Radiology Reports + AndreasGrivas + BeatriceAlex + ClaireGrover + RichardTobin + WilliamWhiteley + 24–37 + We present an in-depth comparison of three clinical information extraction (IE) systems designed to perform entity recognition and negation detection on brain imaging reports: EdIE-R, a bespoke rule-based system, and two neural network models, EdIE-BiLSTM and EdIE-BERT, both multi-task learning models with a BiLSTM and BERT encoder respectively. We compare our models both on an in-sample and an out-of-sample dataset containing mentions of stroke findings and draw on our error analysis to suggest improvements for effective annotation when building clinical NLP models for a new domain. Our analysis finds that our rule-based system outperforms the neural models on both datasets and seems to generalise to the out-of-sample dataset. On the other hand, the neural models do not generalise negation to the out-of-sample dataset, despite metrics on the in-sample dataset suggesting otherwise. 
 + 2020.louhi-1.4 + + + <fixed-case>GGPONC</fixed-case>: A Corpus of <fixed-case>G</fixed-case>erman Medical Text with Rich Metadata Based on Clinical Practice Guidelines + FlorianBorchert + ChristinaLohr + LuiseModersohn + ThomasLanger + MarkusFollmann + Jan PhilippSachs + UdoHahn + Matthieu-P.Schapranow + 38–48 + The lack of publicly accessible text corpora is a major obstacle for progress in natural language processing. For medical applications, unfortunately, all language communities other than English are low-resourced. In this work, we present GGPONC (German Guideline Program in Oncology NLP Corpus), a freely distributable German language corpus based on clinical practice guidelines for oncology. This corpus is one of the largest ever built from German medical documents. Unlike clinical documents, clinical guidelines do not contain any patient-related information and can therefore be used without data protection restrictions. Moreover, GGPONC is the first corpus for the German language covering diverse conditions in a large medical subfield and provides a variety of metadata, such as literature references and evidence levels. By applying and evaluating existing medical information extraction pipelines for German text, we are able to draw comparisons for the use of medical language to other corpora, medical and non-medical ones. + 2020.louhi-1.5 + + + Normalization of Long-tail Adverse Drug Reactions in Social Media + EmmanouilManousogiannis + SepidehMesbah + AlessandroBozzon + Robert-JanSips + ZoltanSzlanik + SeleneBaez + 49–58 + The automatic mapping of Adverse Drug Reaction (ADR) reports from user-generated content to concepts in a controlled medical vocabulary provides valuable insights for monitoring public health. 
 While state-of-the-art deep learning-based sequence classification techniques achieve impressive performance for medical concepts with large amounts of training data, they show their limit with long-tail concepts that have a low number of training samples. The above hinders their adaptability to the changes of layman’s terminology and the constant emergence of new informal medical terms. Our objective in this paper is to tackle the problem of normalizing long-tail ADR mentions in user-generated content. In this paper, we exploit the implicit semantics of rare ADRs for which we have few training samples, in order to detect the most similar class for the given ADR. The evaluation results demonstrate that our proposed approach addresses the limitations of the existing techniques when the amount of training data is limited. + 2020.louhi-1.6 + + + Evaluation of Machine Translation Methods applied to Medical Terminologies + KonstantinosSkianis + YannBriand + FlorentDesgrippes + 59–69 + Medical terminologies resources and standards play vital roles in clinical data exchanges, enabling significantly the services’ interoperability within healthcare national information networks. Health and medical science are constantly evolving causing requirements to advance the terminologies editions. In this paper, we present our evaluation work of the latest machine translation techniques addressing medical terminologies. Experiments have been conducted leveraging selected statistical and neural machine translation methods. The devised procedure is tested on a validated sample of ICD-11 and ICF terminologies from English to French with promising results. + 2020.louhi-1.7 + + + Information retrieval for animal disease surveillance: a pattern-based approach. + SarahValentin + MathieuRoche + RenaudLancelot + 70–78 + Animal diseases-related news articles are rich in information useful for risk assessment. 
In this paper, we explore a method to automatically retrieve sentence-level epidemiological information. Our method is an incremental approach to create and expand patterns at both lexical and syntactic levels. Expert knowledge input are used at different steps of the approach. Distributed vector representations (word embedding) were used to expand the patterns at the lexical level, thus alleviating manual curation. We showed that expert validation was crucial to improve the precision of automatically generated patterns. + 2020.louhi-1.8 + + + Multitask Learning of Negation and Speculation using Transformers + AdityaKhandelwal + Benita KathleenBritto + 79–87 + Detecting negation and speculation in language has been a task of considerable interest to the biomedical community, as it is a key component of Information Extraction systems from Biomedical documents. Prior work has individually addressed Negation Detection and Speculation Detection, and both have been addressed in the same way, using 2 stage pipelined approach: Cue Detection followed by Scope Resolution. In this paper, we propose Multitask learning approaches over 2 sets of tasks: Negation Cue Detection & Speculation Cue Detection, and Negation Scope Resolution & Speculation Scope Resolution. We utilise transformer-based architectures like BERT, XLNet and RoBERTa as our core model architecture, and finetune these using the Multitask learning approaches. We show that this Multitask Learning approach outperforms the single task learning approach, and report new state-of-the-art results on Negation and Speculation Scope Resolution on the BioScope Corpus and the SFU Review Corpus. 
+ 2020.louhi-1.9 + + + Biomedical Event Extraction as Multi-turn Question Answering + Xing DavidWang + LeonWeber + UlfLeser + 88–96 + Biomedical event extraction from natural text is a challenging task as it searches for complex and often nested structures describing specific relationships between multiple molecular entities, such as genes, proteins, or cellular components. It usually is implemented by a complex pipeline of individual tools to solve the different relation extraction subtasks. We present an alternative approach where the detection of relationships between entities is described uniformly as questions, which are iteratively answered by a question answering (QA) system based on the domain-specific language model SciBERT. This model outperforms two strong baselines in two biomedical event extraction corpora in a Knowledge Base Population setting, and also achieves competitive performance in BioNLP challenge evaluation settings. + 2020.louhi-1.10 + + + An efficient representation of chronological events in medical texts + AndreyKormilitzin + NemanjaVaci + QiangLiu + HaoNi + GoranNenadic + AlejoNevado-Holgado + 97–103 + In this work we addressed the problem of capturing sequential information contained in longitudinal electronic health records (EHRs). Clinical notes, which is a particular type of EHR data, are a rich source of information and practitioners often develop clever solutions how to maximise the sequential information contained in free-texts. We proposed a systematic methodology for learning from chronological events available in clinical notes. The proposed methodological path signature framework creates a non-parametric hierarchical representation of sequential events of any type and can be used as features for downstream statistical learning tasks. 
The methodology was developed and externally validated using the largest in the UK secondary care mental health EHR data on a specific task of predicting survival risk of patients diagnosed with Alzheimer’s disease. The signature-based model was compared to a common survival random forest model. Our results showed a 15.4% increase of risk prediction AUC at the time point of 20 months after the first admission to a specialist memory clinic and the signature method outperformed the baseline mixed-effects model by 13.2 %. + 2020.louhi-1.11 + + + Defining and Learning Refined Temporal Relations in the Clinical Narrative + KristinWright-Bettner + ChenLin + TimothyMiller + StevenBethard + DmitriyDligach + MarthaPalmer + James H.Martin + GuerganaSavova + 104–114 + We present refinements over existing temporal relation annotations in the Electronic Medical Record clinical narrative. We refined the THYME corpus annotations to more faithfully represent nuanced temporality and nuanced temporal-coreferential relations. The main contributions are in re-defining CONTAINS and OVERLAP relations into CONTAINS, CONTAINS-SUBEVENT, OVERLAP and NOTED-ON. We demonstrate that these refinements lead to substantial gains in learnability for state-of-the-art transformer models as compared to previously reported results on the original THYME corpus. We thus establish a baseline for the automatic extraction of these refined temporal relations. Although our study is done on clinical narrative, we believe it addresses far-reaching challenges that are corpus- and domain- agnostic. 
+ 2020.louhi-1.12 + + + Context-Aware Automatic Text Simplification of Health Materials in Low-Resource Domains + TarekSakakini + Jong YoonLee + AdityaDuri + Renato F.L.Azevedo + VictorSadauskas + KuangxiaoGu + SumaBhat + DanMorrow + JamesGraumlich + SaqibWalayat + MarkHasegawa-Johnson + ThomasHuang + AnnWillemsen-Dunlap + DonaldHalpin + 115–126 + Healthcare systems have increased patients’ exposure to their own health materials to enhance patients’ health levels, but this has been impeded by patients’ lack of understanding of their health material. We address potential barriers to their comprehension by developing a context-aware text simplification system for health material. Given the scarcity of annotated parallel corpora in healthcare domains, we design our system to be independent of a parallel corpus, complementing the availability of data-driven neural methods when such corpora are available. Our system compensates for the lack of direct supervision using a biomedical lexical database: Unified Medical Language System (UMLS). Compared to a competitive prior approach that uses a tool for identifying biomedical concepts and a consumer-directed vocabulary list, we empirically show the enhanced accuracy of our system due to improved handling of ambiguous terms. We also show the enhanced accuracy of our system over directly-supervised neural methods in this low-resource setting. Finally, we show the direct impact of our system on laypeople’s comprehension of health material via a human subjects’ study (n=160). + 2020.louhi-1.13 + + + Identifying Personal Experience Tweets of Medication Effects Using Pre-trained <fixed-case>R</fixed-case>o<fixed-case>BERT</fixed-case>a Language Model and Its Updating + MinghaoZhu + YouzheSong + GeJin + KeyuanJiang + 127–137 + Post-market surveillance, the practice of monitoring the safe use of pharmaceutical drugs is an important part of pharmacovigilance. 
Being able to collect personal experience related to pharmaceutical product use could help us gain insight into how the human body reacts to different medications. Twitter, a popular social media service, is being considered as an important alternative data source for collecting personal experience information with medications. Identifying personal experience tweets is a challenging classification task in natural language processing. In this study, we utilized three methods based on Facebook’s Robustly Optimized BERT Pretraining Approach (RoBERTa) to predict personal experience tweets related to medication use: the first one combines the pre-trained RoBERTa model with a classifier, the second combines the updated pre-trained RoBERTa model using a corpus of unlabeled tweets with a classifier, and the third combines the RoBERTa model that was trained with our unlabeled tweets from scratch with the classifier too. Our results show that all of these approaches outperform the published methods (Word Embedding + LSTM) in classification performance (p < 0.05), and updating the pre-trained language model with tweets related to medications could even improve the performance further. + 2020.louhi-1.14 + + + Detecting Foodborne Illness Complaints in Multiple Languages Using <fixed-case>E</fixed-case>nglish Annotations Only + ZiyiLiu + GiannisKaramanolakis + DanielHsu + LuisGravano + 138–146 + Health departments have been deploying text classification systems for the early detection of foodborne illness complaints in social media documents such as Yelp restaurant reviews. Current systems have been successfully applied for documents in English and, as a result, a promising direction is to increase coverage and recall by considering documents in additional languages, such as Spanish or Chinese. Training previous systems for more languages, however, would be expensive, as it would require the manual annotation of many documents for each new target language. 
To address this challenge, we consider cross-lingual learning and train multilingual classifiers using only the annotations for English-language reviews. Recent zero-shot approaches based on pre-trained multi-lingual BERT (mBERT) have been shown to effectively align languages for aspects such as sentiment. Interestingly, we show that those approaches are less effective for capturing the nuances of foodborne illness, our public health application of interest. To improve performance without extra annotations, we create artificial training documents in the target language through machine translation and train mBERT jointly for the source (English) and target language. Furthermore, we show that translating labeled documents to multiple languages leads to additional performance improvements for some target languages. We demonstrate the benefits of our approach through extensive experiments with Yelp restaurant reviews in seven languages. Our classifiers identify foodborne illness complaints in multilingual reviews from the Yelp Challenge dataset, which highlights the potential of our general approach for deployment in health departments. + 2020.louhi-1.15 + + + Detection of Mental Health from <fixed-case>R</fixed-case>eddit via Deep Contextualized Representations + ZhengpingJiang + Sarah ItaLevitan + JonathanZomick + JuliaHirschberg + 147–156 + We address the problem of automatic detection of psychiatric disorders from the linguistic content of social media posts. We build a large scale dataset of Reddit posts from users with eight disorders and a control user group. We extract and analyze linguistic characteristics of posts and identify differences between diagnostic groups. We build strong classification models based on deep contextualized word representations and show that they outperform previously applied statistical models with simple linguistic features by large margins. 
We compare user-level and post-level classification performance, as well as an ensembled multiclass model. + 2020.louhi-1.16 + +
+
diff --git a/data/xml/2020.lrec.xml b/data/xml/2020.lrec.xml index c89397b1f9..d0e0e50a30 100644 --- a/data/xml/2020.lrec.xml +++ b/data/xml/2020.lrec.xml @@ -1287,7 +1287,7 @@ “Voices of the Great War”: A Richly Annotated Corpus of <fixed-case>I</fixed-case>talian Texts on the First World War FedericoBoschetti - irenede felice + IreneDe Felice StefanoDei Rossi FeliceDell’Orletta MicheleDi Giorgio @@ -2869,7 +2869,7 @@ Event Extraction from Unstructured <fixed-case>A</fixed-case>mharic Text - ephremtadesse + EphremTadesse RosaTsegaye KuulaaQaqqabaa 2103–2109 @@ -3840,7 +3840,7 @@ Seyed AradAshrafi Asli ZahraMajdabadi OmidMomenzadeh - rezafahmi + RezaFahmi 2839–2845 Irony is a linguistic device used to intend an idea while articulating an opposing expression. Many text analytic algorithms used for emotion extraction or sentiment analysis, produce invalid results due to the use of irony. Persian speakers use this device more often due to the language’s nature and some cultural reasons. This phenomenon also appears in social media platforms such as Twitter where users express their opinions using ironic or sarcastic posts. In the current research, which is the first attempt at irony detection in Persian language, emoji prediction is used to build a pretrained model. The model is finetuned utilizing a set of hand labeled tweets with irony tags. A bidirectional LSTM (BiLSTM) network is employed as the basis of our model which is improved by attention mechanism. Additionally, a Persian corpus for irony detection containing 4339 manually-labeled tweets is introduced. Experiments show the proposed approach outperforms the adapted state-of-the-art method tested on Persian dataset with an accuracy of 83.1%, and offers a strong baseline for further research in Persian language. 
2020.lrec-1.346 @@ -3862,7 +3862,7 @@ BehnamSabeti ZahraMajdabadi PreniGolazizian - rezafahmi + RezaFahmi OmidMomenzadeh 2855–2861 Deep learning models are the current State-of-the-art methodologies towards many real-world problems. However, they need a substantial amount of labeled data to be trained appropriately. Acquiring labeled data can be challenging in some particular domains or less-resourced languages. There are some practical solutions regarding these issues, such as Active Learning and Transfer Learning. Active learning’s idea is simple: let the model choose the samples for annotation instead of labeling the whole dataset. This method leads to a more efficient annotation process. Active Learning models can achieve the baseline performance (the accuracy of the model trained on the whole dataset), with a considerably lower amount of labeled data. Several active learning approaches are tested in this work, and their compatibility with Persian is examined using a brand-new sentiment analysis dataset that is also introduced in this work. MirasOpinion, which to our knowledge is the largest Persian sentiment analysis dataset, is crawled from a Persian e-commerce website and annotated using a crowd-sourcing policy. LDA sampling, which is an efficient Active Learning strategy using Topic Modeling, is proposed in this research. Active Learning Strategies have shown promising results in the Persian language, and LDA sampling showed a competitive performance compared to other approaches. 
@@ -5974,7 +5974,7 @@ A Multimodal Educational Corpus of Oral Courses: Annotation, Analysis and Case Study - salimamdhaffar + SalimaMdhaffar YannickEstève AntoineLaurent NicolasHernandez @@ -7570,7 +7570,7 @@ DelphineCharlet GeraldineDamnati FredericBechet - gabrielmarzinotto + GabrielMarzinotto JohannesHeinecke 5491–5497 Machine Reading received recently a lot of attention thanks to both the availability of very large corpora such as SQuAD or MS MARCO containing triplets (document, question, answer), and the introduction of Transformer Language Models such as BERT which obtain excellent results, even matching human performance according to the SQuAD leaderboard. One of the key features of Transformer Models is their ability to be jointly trained across multiple languages, using a shared subword vocabulary, leading to the construction of cross-lingual lexical representations. This feature has been used recently to perform zero-shot cross-lingual experiments where a multilingual BERT model fine-tuned on a machine reading comprehension task exclusively for English was directly applied to Chinese and French documents with interesting performance. In this paper we study the cross-language and cross-domain capabilities of BERT on a Machine Reading Comprehension task on two corpora: SQuAD and a new French Machine Reading dataset, called CALOR-QUEST. The semantic annotation available on CALOR-QUEST allows us to give a detailed analysis on the kinds of questions that are properly handled through the cross-language process. We will try to answer this question: which factor between language mismatch and domain mismatch has the strongest influence on the performances of a Machine Reading Comprehension task? @@ -8524,7 +8524,7 @@ PreniGolazizian Seyed AradAshrafi Asli OmidMomenzadeh - rezafahmi + RezaFahmi 6213–6219 Twitter has become a major platform for users to express their opinions on any topic and engage in debates. 
User debates and interactions usually lead to massive content regarding a specific topic which is called a Trend. Twitter trend extraction aims at finding these relevant groups of content that are generated in a short period. The most straightforward approach for this problem is using Hashtags, however, tweets without hashtags are not considered this way. In order to overcome this issue and extract trends using all tweets, we propose a graph-based approach where graph nodes represent tweets as well as words and hashtags. More specifically, we propose a modified version of RankClus algorithm to extract trends from the constructed tweets graph. The proposed approach is also capable of ranking tweets, words and hashtags in each trend with respect to their importance and relevance to the topic. The proposed algorithm is used to extract trends from several twitter datasets, where it produced consistent and coherent results. 2020.lrec-1.762 @@ -8739,7 +8739,7 @@ Augmented Prompt Selection for Evaluation of Spontaneous Speech Synthesis EvaSzekely JensEdlund - joakimgustafson + JoakimGustafson 6368–6374 By definition, spontaneous speech is unscripted and created on the fly by the speaker. It is dramatically different from read speech, where the words are authored as text before they are spoken. Spontaneous speech is emergent and transient, whereas text read out loud is pre-planned. For this reason, it is unsuitable to evaluate the usability and appropriateness of spontaneous speech synthesis by having it read out written texts sampled from for example newspapers or books. Instead, we need to use transcriptions of speech as the target - something that is much less readily available. In this paper, we introduce Starmap, a tool allowing developers to select a varied, representative set of utterances from a spoken genre, to be used for evaluation of TTS for a given domain. The selection can be done from any speech recording, without the need for transcription. 
The tool uses interactive visualisation of prosodic features with t-SNE, along with a tree-based algorithm to guide the user through thousands of utterances and ensure coverage of a variety of prompts. A listening test has shown that with a selection of genre-specific utterances, it is possible to show significant differences across genres between two synthetic voices built from spontaneous speech. 2020.lrec-1.782 diff --git a/data/xml/2020.nlp4convai.xml b/data/xml/2020.nlp4convai.xml index e7ca110347..cc66327c19 100644 --- a/data/xml/2020.nlp4convai.xml +++ b/data/xml/2020.nlp4convai.xml @@ -60,7 +60,7 @@ AdamSummerville JordanHashemi JamesRyan - williamferguson + WilliamFerguson 32–37 Dialog State Tracking (DST) is a problem space in which the effective vocabulary is practically limitless. For example, the domain of possible movie titles or restaurant names is bound only by the limits of language. As such, DST systems often encounter out-of-vocabulary words at inference time that were never encountered during training. To combat this issue, we present a targeted data augmentation process, by which a practitioner observes the types of errors made on held-out evaluation data, and then modifies the training data with additional corpora to increase the vocabulary size at training time. Using this with a RoBERTa-based Transformer architecture, we achieve state-of-the-art results in comparison to systems that only mask trouble slots with special tokens. Additionally, we present a data-representation scheme for seamlessly retargeting DST architectures to new domains. 
2020.nlp4convai-1.4 diff --git a/data/xml/2020.nlpbt.xml b/data/xml/2020.nlpbt.xml new file mode 100644 index 0000000000..e280f8d588 --- /dev/null +++ b/data/xml/2020.nlpbt.xml @@ -0,0 +1,110 @@ + + + + + Proceedings of the First International Workshop on Natural Language Processing Beyond Text + GiuseppeCastellucci + SimoneFilice + SoujanyaPoria + ErikCambria + LuciaSpecia + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.nlpbt-1.0 + + + Modulated Fusion using Transformer for Linguistic-Acoustic Emotion Recognition + Jean-BenoitDelbrouck + NoéTits + StéphaneDupont + 1–10 + This paper aims to bring a new lightweight yet powerful solution for the task of Emotion Recognition and Sentiment Analysis. Our motivation is to propose two architectures based on Transformers and modulation that combine the linguistic and acoustic inputs from a wide range of datasets to challenge, and sometimes surpass, the state-of-the-art in the field. To demonstrate the efficiency of our models, we carefully evaluate their performances on the IEMOCAP, MOSI, MOSEI and MELD dataset. The experiments can be directly replicated and the code is fully open for future researches. + 2020.nlpbt-1.1 + + + Multimodal Speech Recognition with Unstructured Audio Masking + TejasSrinivasan + RamonSanabria + FlorianMetze + DesmondElliott + 11–18 + Visual context has been shown to be useful for automatic speech recognition (ASR) systems when the speech signal is noisy or corrupted. Previous work, however, has only demonstrated the utility of visual context in an unrealistic setting, where a fixed set of words are systematically masked in the audio. In this paper, we simulate a more realistic masking scenario during model training, called RandWordMask, where the masking can occur for any word segment. Our experiments on the Flickr 8K Audio Captions Corpus show that multimodal ASR can generalize to recover different types of masked words in this unstructured masking setting. Moreover, our analysis shows that our models are capable of attending to the visual signal when the audio signal is corrupted. These results show that multimodal ASR systems can leverage the visual signal in more generalized noisy scenarios. 
+ 2020.nlpbt-1.2 + + + Building a Bridge: A Method for Image-Text Sarcasm Detection Without Pretraining on Image-Text Data + XinyuWang + XiaowenSun + TanYang + HongboWang + 19–29 + Sarcasm detection in social media with text and image is becoming more challenging. Previous works of image-text sarcasm detection were mainly to fuse the summaries of text and image: different sub-models read the text and image respectively to get the summaries, and fuses the summaries. Recently, some multi-modal models based on the architecture of BERT are proposed such as ViLBERT. However, they can only be pretrained on the image-text data. In this paper, we propose an image-text model for sarcasm detection using the pretrained BERT and ResNet without any further pretraining. BERT and ResNet have been pretrained on much larger text or image data than image-text data. We connect the vector spaces of BERT and ResNet to utilize more data. We use the pretrained Multi-Head Attention of BERT to model the text and image. Besides, we propose a 2D-Intra-Attention to extract the relationships between words and images. In experiments, our model outperforms the state-of-the-art model. + 2020.nlpbt-1.3 + + + A Benchmark for Structured Procedural Knowledge Extraction from Cooking Videos + Frank F.Xu + LeiJi + BotianShi + JunyiDu + GrahamNeubig + YonatanBisk + NanDuan + 30–40 + Watching instructional videos are often used to learn about procedures. Video captioning is one way of automatically collecting such knowledge. However, it provides only an indirect, overall evaluation of multimodal models with no finer-grained quantitative measure of what they have learned. We propose instead, a benchmark of structured procedural knowledge extracted from cooking videos. This work is complementary to existing tasks, but requires models to produce interpretable structured knowledge in the form of verb-argument tuples. 
Our manually annotated open-vocabulary resource includes 356 instructional cooking videos and 15,523 video clip/sentence-level annotations. Our analysis shows that the proposed task is challenging and standard modeling approaches like unsupervised segmentation, semantic role labeling, and visual action detection perform poorly when forced to predict every action of a procedure in a structured form. + 2020.nlpbt-1.4 + 2020.nlpbt-1.4.OptionalSupplementaryMaterial.pdf + + + A Multi-Modal <fixed-case>E</fixed-case>nglish-<fixed-case>I</fixed-case>talian Parallel Corpus for End-to-End Speech-to-Text Machine Translation + GiuseppeDella Corte + SaraStymne + 41–50 + We discuss a set of methods for the creation of IESTAC: an English-Italian speech and text parallel corpus designed for the training of end-to-end speech-to-text machine translation models and publicly released as part of this work. We first mapped English LibriVox audiobooks and their corresponding English Gutenberg Project e-books to Italian e-books with a set of three complementary methods. Then we aligned the English and the Italian texts using both traditional Gale-Church based alignment methods and a recently proposed tool to perform bilingual sentences alignment computing the cosine similarity of multilingual sentence embeddings. Finally, we forced the alignment between the English audiobooks and the English side of our textual parallel corpus with a text-to-speech and dynamic time warping based forced alignment tool. For each step, we provide the reader with a critical discussion based on detailed evaluation and comparison of the results of the different methods. + 2020.nlpbt-1.5 + + + Unsupervised Keyword Extraction for Full-Sentence <fixed-case>VQA</fixed-case> + KoheiUehara + TatsuyaHarada + 51–59 + In the majority of the existing Visual Question Answering (VQA) research, the answers consist of short, often single words, as per instructions given to the annotators during dataset construction. 
This study envisions a VQA task for natural situations, where the answers are more likely to be sentences rather than single words. To bridge the gap between this natural VQA and existing VQA approaches, a novel unsupervised keyword extraction method is proposed. The method is based on the principle that the full-sentence answers can be decomposed into two parts: one that contains new information answering the question (i.e. keywords), and one that contains information already included in the question. Discriminative decoders were designed to achieve such decomposition, and the method was experimentally implemented on VQA datasets containing full-sentence answers. The results show that the proposed model can accurately extract the keywords without being given explicit annotations describing them. + 2020.nlpbt-1.6 + 2020.nlpbt-1.6.OptionalSupplementaryMaterial.pdf + + + <fixed-case>MAST</fixed-case>: Multimodal Abstractive Summarization with Trimodal Hierarchical Attention + AmanKhullar + UditArora + 60–69 + This paper presents MAST, a new model for Multimodal Abstractive Text Summarization that utilizes information from all three modalities – text, audio and video – in a multimodal video. Prior work on multimodal abstractive text summarization only utilized information from the text and video modalities. We examine the usefulness and challenges of deriving information from the audio modality and present a sequence-to-sequence trimodal hierarchical attention-based model that overcomes these challenges by letting the model pay more attention to the text modality. MAST outperforms the current state of the art model (video-text) by 2.51 points in terms of Content F1 score and 1.00 points in terms of Rouge-L score on the How2 dataset for multimodal language understanding. 
+ 2020.nlpbt-1.7 + + + Towards End-to-End In-Image Neural Machine Translation + ElmanMansimov + MitchellStern + MiaChen + OrhanFirat + JakobUszkoreit + PuneetJain + 70–74 + In this paper, we offer a preliminary investigation into the task of in-image machine translation: transforming an image containing text in one language into an image containing the same text in another language. We propose an end-to-end neural model for this task inspired by recent approaches to neural machine translation, and demonstrate promising initial results based purely on pixel-level supervision. We then offer a quantitative and qualitative evaluation of our system outputs and discuss some common failure modes. Finally, we conclude with directions for future work. + 2020.nlpbt-1.8 + 2020.nlpbt-1.8.OptionalSupplementaryMaterial.pdf + + + Reasoning Over History: Context Aware Visual Dialog + MuhammadShah + ShikibMehri + TejasSrinivasan + 75–83 + While neural models have been shown to exhibit strong performance on single-turn visual question answering (VQA) tasks, extending VQA to a multi-turn, conversational setting remains a challenge. One way to address this challenge is to augment existing strong neural VQA models with the mechanisms that allow them to retain information from previous dialog turns. One strong VQA model is the MAC network, which decomposes a task into a series of attention-based reasoning steps. However, since the MAC network is designed for single-turn question answering, it is not capable of referring to past dialog turns. More specifically, it struggles with tasks that require reasoning over the dialog history, particularly coreference resolution. We extend the MAC network architecture with Context-aware Attention and Memory (CAM), which attends over control states in past dialog turns to determine the necessary reasoning operations for the current question. 
MAC nets with CAM achieve up to 98.25% accuracy on the CLEVR-Dialog dataset, beating the existing state-of-the-art by 30% (absolute). Our error analysis indicates that with CAM, the model’s performance particularly improved on questions that required coreference resolution. + 2020.nlpbt-1.9 + +
+
diff --git a/data/xml/2020.nlpcovid19.xml b/data/xml/2020.nlpcovid19.xml index 5fb65f5ebb..a7ad75bfa1 100644 --- a/data/xml/2020.nlpcovid19.xml +++ b/data/xml/2020.nlpcovid19.xml @@ -199,4 +199,427 @@ 2020.nlpcovid19-acl.17
+ + + Proceedings of the 1st Workshop on NLP for COVID-19 (Part 2) at EMNLP 2020 + KarinVerspoor + Kevin BretonnelCohen + MichaelConway + Berryde Bruijn + MarkDredze + RadaMihalcea + ByronWallace + Association for Computational Linguistics +
Online
+ December + 2020 + + + 2020.nlpcovid19-2.0 + + + Answering Questions on <fixed-case>COVID</fixed-case>-19 in Real-Time + JinhyukLee + Sean S.Yi + MinbyulJeong + MujeenSung + WonJinYoon + YonghwaChoi + MiyoungKo + JaewooKang + The recent outbreak of the novel coronavirus is wreaking havoc on the world and researchers are struggling to effectively combat it. One reason why the fight is difficult is due to the lack of information and knowledge. In this work, we outline our effort to contribute to shrinking this knowledge vacuum by creating covidAsk, a question answering (QA) system that combines biomedical text mining and QA techniques to provide answers to questions in real-time. Our system also leverages information retrieval (IR) approaches to provide entity-level answers that are complementary to QA models. Evaluation of covidAsk is carried out by using a manually created dataset called COVID-19 Questions which is based on information from various sources, including the CDC and the WHO. We hope our system will be able to aid researchers in their search for knowledge and information not only for COVID-19, but for future pandemics as well. + 2020.nlpcovid19-2.1 + + + <fixed-case>CORA</fixed-case>: A Deep Active Learning Covid-19 Relevancy Algorithm to Identify Core Scientific Articles + ZubairAfzal + VikrantYadav + OlgaFedorova + VaishnaviKandala + Jannekevan de Loo + Saber A.Akhondi + PascalCoupet + GeorgeTsatsaronis + Ever since the COVID-19 pandemic broke out, the academic and scientific research community, as well as industry and governments around the world have joined forces in an unprecedented manner to fight the threat. Clinicians, biologists, chemists, bioinformaticians, nurses, data scientists, and all of the affiliated relevant disciplines have been mobilized to help discover efficient treatments for the infected population, as well as a vaccine solution to prevent further the virus spread. 
In this combat against the virus responsible for the pandemic, key for any advancements is the timely, accurate, peer-reviewed, and efficient communication of any novel research findings. In this paper we present a novel framework to address the information need of filtering efficiently the scientific bibliography for relevant literature around COVID-19. The contributions of the paper are summarized in the following: we define and describe the information need that encompasses the major requirements for COVID-19 articles relevancy, we present and release an expert-curated benchmark set for the task, and we analyze the performance of several state-of-the-art machine learning classifiers that may distinguish the relevant from the non-relevant COVID-19 literature. + 2020.nlpcovid19-2.2 + + + Frugal neural reranking: evaluation on the Covid-19 literature + TiagoAlmeida + SérgioMatos + The Covid-19 pandemic urged the scientific community to join efforts at an unprecedented scale, leading to faster than ever dissemination of data and results, which in turn motivated more research works. This paper presents and discusses information retrieval models aimed at addressing the challenge of searching the large number of publications that stem from these studies. The model presented, based on classical baselines followed by an interaction based neural ranking model, was evaluated and evolved within the TREC Covid challenge setting. Results on this dataset show that, when starting with a strong baseline, our light neural ranking model can achieve results that are comparable to other model architectures that use very large number of parameters. 
+ 2020.nlpcovid19-2.3 + + + <fixed-case>COVID</fixed-case>-19 Literature Topic-Based Search via Hierarchical <fixed-case>NMF</fixed-case> + RachelGrotheer + LongxiuHuang + YihuanHuang + AlonaKryshchenko + OleksandrKryshchenko + PengyuLi + XiaLi + ElizavetaRebrova + KyungHa + DeannaNeedell + A dataset of COVID-19-related scientific literature is compiled, combining the articles from several online libraries and selecting those with open access and full text available. Then, hierarchical nonnegative matrix factorization is used to organize literature related to the novel coronavirus into a tree structure that allows researchers to search for relevant literature based on detected topics. We discover eight major latent topics and 52 granular subtopics in the body of literature, related to vaccines, genetic structure and modeling of the disease and patient studies, as well as related diseases and virology. In order that our tool may help current researchers, an interactive website is created that organizes available literature using this hierarchical structure. + 2020.nlpcovid19-2.4 + + + <fixed-case>TICO</fixed-case>-19: the Translation Initiative for <fixed-case>CO</fixed-case>vid-19 + AntoniosAnastasopoulos + AlessandroCattelan + Zi-YiDou + MarcelloFederico + ChristianFedermann + DmitriyGenzel + FransciscoGuzmán + JunjieHu + MacduffHughes + PhilippKoehn + RosieLazar + WillLewis + GrahamNeubig + MengmengNiu + AlpÖktem + EricPaquin + GraceTang + SylwiaTur + The COVID-19 pandemic is the worst pandemic to strike the world in over a century. Crucial to stemming the tide of the SARS-CoV-2 virus is communicating to vulnerable populations the means by which they can protect themselves. 
To this end, the collaborators forming the Translation Initiative for COvid-19 (TICO-19) have made test and development data available to AI and MT researchers in 35 different languages in order to foster the development of tools and resources for improving access to information about COVID-19 in these languages. In addition to 9 high-resourced, “pivot” languages, the team is targeting 26 lesser resourced languages, in particular languages of Africa, South Asia and South-East Asia, whose populations may be the most vulnerable to the spread of the virus. The same data is translated into all of the languages represented, meaning that testing or development can be done for any pairing of languages in the set. Further, the team is converting the test and development data into translation memories (TMXs) that can be used by localizers from and to any of the languages. + 2020.nlpcovid19-2.5 + + + Expressive Interviewing: A Conversational System for Coping with <fixed-case>COVID</fixed-case>-19 + CharlesWelch + AllisonLahnala + VeronicaPerez-Rosas + SiqiShen + SarahSeraj + LarryAn + KennethResnicow + JamesPennebaker + RadaMihalcea + The ongoing COVID-19 pandemic has raised concerns for many regarding personal and public health implications, financial security and economic stability. Alongside many other unprecedented challenges, there are increasing concerns over social isolation and mental health. We introduce Expressive Interviewing – an interview-style conversational system that draws on ideas from motivational interviewing and expressive writing. Expressive Interviewing seeks to encourage users to express their thoughts and feelings through writing by asking them questions about how COVID-19 has impacted their lives. We present relevant aspects of the system’s design and implementation as well as quantitative and qualitative analyses of user interactions with the system. 
In addition, we conduct a comparative evaluation with a general purpose dialogue system for mental health that shows our system potential in helping users to cope with COVID-19 issues. + 2020.nlpcovid19-2.6 + + + Temporal Mental Health Dynamics on Social Media + TomTabak + MatthewPurver + We describe a set of experiments for building a temporal mental health dynamics system. We utilise a pre-existing methodology for distant-supervision of mental health data mining from social media platforms and deploy the system during the global COVID-19 pandemic as a case study. Despite the challenging nature of the task, we produce encouraging results, both explicit to the global pandemic and implicit to a global phenomenon, Christmas Depression, supported by the literature. We propose a methodology for providing insight into temporal mental health dynamics to be utilised for strategic decision-making. + 2020.nlpcovid19-2.7 + + + Quantifying the Effects of <fixed-case>COVID</fixed-case>-19 on Mental Health Support Forums + LauraBiester + KatieMatton + JanarthananRajendran + Emily MowerProvost + RadaMihalcea + The COVID-19 pandemic, like many of the disease outbreaks that have preceded it, is likely to have a profound effect on mental health. Understanding its impact can inform strategies for mitigating negative consequences. In this work, we seek to better understand the effects of COVID-19 on mental health by examining discussions within mental health support communities on Reddit. First, we quantify the rate at which COVID-19 is discussed in each community, or subreddit, in order to understand levels of pandemic-related discussion. Next, we examine the volume of activity in order to determine whether the number of people discussing mental health has risen. Finally, we analyze how COVID-19 has influenced language use and topics of discussion within each subreddit. 
+ 2020.nlpcovid19-2.8 + + + <fixed-case>COVID</fixed-case>-19 Surveillance through <fixed-case>T</fixed-case>witter using Self-Supervised and Few Shot Learning + BrandonLwowski + PeymanNajafirad + Public health surveillance and tracking virus via social media can be a useful digital tool for contact tracing and preventing the spread of the virus. Nowadays, large volumes of COVID-19 tweets can quickly be processed in real-time to offer information to researchers. Nonetheless, due to the absence of labeled data for COVID-19, the preliminary supervised classifier or semi-supervised self-labeled methods will not handle non-spherical data with adequate accuracy. With the seasonal influenza and novel Coronavirus having many similar symptoms, we propose using few shot learning to fine-tune a semi-supervised model built on unlabeled COVID-19 and previously labeled influenza dataset that can provide insights into COVID-19 that have not been investigated. The experimental results show the efficacy of the proposed model with an accuracy of 86%, identification of Covid-19 related discussion using recently collected tweets. + 2020.nlpcovid19-2.9 + + + Explaining the Trump Gap in Social Distancing Using <fixed-case>COVID</fixed-case> Discourse + Austin VanLoon + SheridanStewart + BrandonWaldon + Shrinidhi KLakshmikanth + IshanShah + Sharath ChandraGuntuku + GarrickSherman + JamesZou + JohannesEichstaedt + Our ability to limit the future spread of COVID-19 will in part depend on our understanding of the psychological and sociological processes that lead people to follow or reject coronavirus health behaviors. We argue that the virus has taken on heterogeneous meanings in communities across the United States and that these disparate meanings shaped communities’ response to the virus during the early, vital stages of the outbreak in the U.S. 
Using word embeddings, we demonstrate that counties where residents socially distanced less on average (as measured by residential mobility) more semantically associated the virus in their COVID discourse with concepts of fraud, the political left, and more benign illnesses like the flu. We also show that the different meanings the virus took on in different communities explains a substantial fraction of what we call the “Trump Gap”, or the empirical tendency for more Trump-supporting counties to socially distance less. This work demonstrates that community-level processes of meaning-making in part determined behavioral responses to the COVID-19 pandemic and that these processes can be measured unobtrusively using Twitter. + 2020.nlpcovid19-2.10 + + + <fixed-case>COVIDL</fixed-case>ies: Detecting <fixed-case>COVID</fixed-case>-19 Misinformation on Social Media + TamannaHossain + Robert L.Logan IV + ArjunaUgarte + YoshitomoMatsubara + SeanYoung + SameerSingh + The ongoing pandemic has heightened the need for developing tools to flag COVID-19-related misinformation on the internet, specifically on social media such as Twitter. However, due to novel language and the rapid change of information, existing misinformation detection datasets are not effective for evaluating systems designed to detect misinformation on this topic. Misinformation detection can be divided into two sub-tasks: (i) retrieval of misconceptions relevant to posts being checked for veracity, and (ii) stance detection to identify whether the posts Agree, Disagree, or express No Stance towards the retrieved misconceptions. To facilitate research on this task, we release COVIDLies (https://ucinlp.github.io/covid19), a dataset of 6761 expert-annotated tweets to evaluate the performance of misinformation detection systems on 86 different pieces of COVID-19 related misinformation. 
We evaluate existing NLP systems on this dataset, providing initial benchmarks and identifying key challenges for future models to improve upon. + 2020.nlpcovid19-2.11 + + + Improved Topic Representations of Medical Documents to Assist <fixed-case>COVID</fixed-case>-19 Literature Exploration + YuliaOtmakhova + KarinVerspoor + TimothyBaldwin + SimonŠuster + Efficient discovery and exploration of biomedical literature has grown in importance in the context of the COVID-19 pandemic, and topic-based methods such as latent Dirichlet allocation (LDA) are a useful tool for this purpose. In this study we compare traditional topic models based on word tokens with topic models based on medical concepts, and propose several ways to improve topic coherence and specificity. + 2020.nlpcovid19-2.12 + + + A System for Worldwide <fixed-case>COVID</fixed-case>-19 Information Aggregation + AkikoAizawa + FredericBergeron + JunjieChen + FeiCheng + KatsuhikoHayashi + KentaroInui + HiroyoshiIto + DaisukeKawahara + MasaruKitsuregawa + HirokazuKiyomaru + MasakiKobayashi + TakashiKodama + SadaoKurohashi + QianyingLiu + MasakiMatsubara + YusukeMiyao + AtsuyukiMorishima + YugoMurawaki + KazumasaOmura + HaiyueSong + EiichiroSumita + ShinjiSuzuki + RibekaTanaka + YuTanaka + MasashiToyoda + NobuhiroUeda + HonaiUeoka + MasaoUtiyama + YingZhong + The global pandemic of COVID-19 has made the public pay close attention to related news, covering various domains, such as sanitation, treatment, and effects on education. Meanwhile, the COVID-19 condition is very different among the countries (e.g., policies and development of the epidemic), and thus citizens would be interested in news in foreign countries. We build a system for worldwide COVID-19 information aggregation containing reliable articles from 10 regions in 7 languages sorted by topics. Our reliable COVID-19 related website dataset collected through crowdsourcing ensures the quality of the articles. 
A neural machine translation module translates articles in other languages into Japanese and English. A BERT-based topic-classifier trained on our article-topic pair dataset helps users find their interested information efficiently by putting articles into different categories. + 2020.nlpcovid19-2.13 + + + <fixed-case>CA</fixed-case>i<fixed-case>RE</fixed-case>-<fixed-case>COVID</fixed-case>: A Question Answering and Query-focused Multi-Document Summarization System for <fixed-case>COVID</fixed-case>-19 Scholarly Information Management + DanSu + YanXu + TiezhengYu + Farhad BinSiddique + ElhamBarezi + PascaleFung + We present CAiRE-COVID, a real-time question answering (QA) and multi-document summarization system, which won one of the 10 tasks in the Kaggle COVID-19 Open Research Dataset Challenge, judged by medical experts. Our system aims to tackle the recent challenge of mining the numerous scientific articles being published on COVID-19 by answering high priority questions from the community and summarizing salient question-related information. It combines information extraction with state-of-the-art QA and query-focused multi-document summarization techniques, selecting and highlighting evidence snippets from existing literature given a query. We also propose query-focused abstractive and extractive multi-document summarization methods, to provide more relevant information related to the question. We further conduct quantitative experiments that show consistent improvements on various metrics for each module. We have launched our website CAiRE-COVID for broader use by the medical community, and have open-sourced the code for our system, to bootstrap further study by other researches. + 2020.nlpcovid19-2.14 + + + Automatic Evaluation vs. 
User Preference in Neural Textual <fixed-case>Q</fixed-case>uestion<fixed-case>A</fixed-case>nswering over <fixed-case>COVID</fixed-case>-19 Scientific Literature + ArantxaOtegi + Jon AnderCampos + GorkaAzkune + AitorSoroa + EnekoAgirre + We present a Question Answering (QA) system that won one of the tasks of the Kaggle CORD-19 Challenge, according to the qualitative evaluation of experts. The system is a combination of an Information Retrieval module and a reading comprehension module that finds the answers in the retrieved passages. In this paper we present a quantitative and qualitative analysis of the system. The quantitative evaluation using manually annotated datasets contradicted some of our design choices, e.g. the fact that using QuAC for fine-tuning provided better answers over just using SQuAD. We analyzed this mismatch with an additional A/B test which showed that the system using QuAC was indeed preferred by users, confirming our intuition. Our analysis puts in question the suitability of automatic metrics and its correlation to user preferences. We also show that automatic metrics are highly dependent on the characteristics of the gold standard, such as the average length of the answers. + 2020.nlpcovid19-2.15 + + + A Multilingual Neural Machine Translation Model for Biomedical Data + AlexandreBérard + Zae MyungKim + VassilinaNikoulina + Eunjeong LucyPark + MatthiasGallé + We release a multilingual neural machine translation model, which can be used to translate text in the biomedical domain. The model can translate from 5 languages (French, German, Italian, Korean and Spanish) into English. It is trained with large amounts of generic and biomedical data, using domain tags. Our benchmarks show that it performs near state-of-the-art both on news (generic domain) and biomedical test sets, and that it outperforms the existing publicly released models. 
We believe that this release will help the large-scale multilingual analysis of the digital content of the COVID-19 crisis and of its effects on society, economy, and healthcare policies. We also release a test set of biomedical text for Korean-English. It consists of 758 sentences from official guidelines and recent papers, all about COVID-19. + 2020.nlpcovid19-2.16 + + + Public Sentiment on Governmental <fixed-case>COVID</fixed-case>-19 Measures in <fixed-case>D</fixed-case>utch Social Media + ShihanWang + MarijnSchraagen + ErikTjong Kim Sang + MehdiDastani + Public sentiment (the opinion, attitude or feeling that the public expresses) is a factor of interest for government, as it directly influences the implementation of policies. Given the unprecedented nature of the COVID-19 crisis, having an up-to-date representation of public sentiment on governmental measures and announcements is crucial. In this paper, we analyse Dutch public sentiment on governmental COVID-19 measures from text data collected across three online media sources (Twitter, Reddit and Nu.nl) from February to September 2020. We apply sentiment analysis methods to analyse polarity over time, as well as to identify stance towards two specific pandemic policies regarding social distancing and wearing face masks. The presented preliminary results provide valuable insights into the narratives shown in vast social media text data, which help understand the influence of COVID-19 measures on the general public. + 2020.nlpcovid19-2.17 + + + Exploratory Analysis of <fixed-case>COVID</fixed-case>-19 Related Tweets in <fixed-case>N</fixed-case>orth <fixed-case>A</fixed-case>merica to Inform Public Health Institutes + HyejuJang + EmilyRempel + GiuseppeCarenini + NaveedJanjua + Social media is a rich source where we can learn about people’s reactions to social issues. 
As COVID-19 has significantly impacted on people’s lives, it is essential to capture how people react to public health interventions and understand their concerns. In this paper, we aim to investigate people’s reactions and concerns about COVID-19 in North America, especially focusing on Canada. We analyze COVID-19 related tweets using topic modeling and aspect-based sentiment analysis, and interpret the results with public health experts. We compare timeline of topics discussed with timing of implementation of public health interventions for COVID-19. We also examine people’s sentiment about COVID-19 related issues. We discuss how the results can be helpful for public health agencies when designing a policy for new interventions. Our work shows how Natural Language Processing (NLP) techniques could be applied to public health questions with domain expert involvement. + 2020.nlpcovid19-2.18 + + + <fixed-case>T</fixed-case>witter Data Augmentation for Monitoring Public Opinion on <fixed-case>COVID</fixed-case>-19 Intervention Measures + LinMiao + MarkLast + MarinaLitvak + The COVID-19 outbreak is an ongoing worldwide pandemic that was announced as a global health crisis in March 2020. Due to the enormous challenges and high stakes of this pandemic, governments have implemented a wide range of policies aimed at containing the spread of the virus and its negative effect on multiple aspects of our life. Public responses to various intervention measures imposed over time can be explored by analyzing the social media. Due to the shortage of available labeled data for this new and evolving domain, we apply data distillation methodology to labeled datasets from related tasks and a very small manually labeled dataset. Our experimental results show that data distillation outperforms other data augmentation methods on our task. 
+ 2020.nlpcovid19-2.19 + + + <fixed-case>COVID</fixed-case>-19: A Semantic-Based Pipeline for Recommending Biomedical Entities + Marcia AfonsoBarros + AndreLamurias + DianaSousa + PedroRuas + Francisco M.Couto + With the increasing number of publications about COVID-19, it is a challenge to extract personalized knowledge suitable for each researcher. This work aims to build a new semantic-based pipeline for recommending biomedical entities to scientific researchers. To this end, we developed a pipeline that creates an implicit feedback matrix based on Named Entity Recognition (NER) on a corpus of documents, using multidisciplinary ontologies for recognizing and linking the entities. Our hypothesis is that by using ontologies from different fields in the NER phase, we can improve the results for state-of-the-art collaborative-filtering recommender systems applied to the dataset created. The tests performed using the COVID-19 Open Research Dataset (CORD-19) dataset show that when using four ontologies, the results for precision@k, for example, reach the 80%, whereas when using only one ontology, the results for precision@k drops to 20%, for the same users. Furthermore, the use of multi-fields entities may help in the discovery of new items, even if the researchers do not have items from that field in their set of preferences. + 2020.nlpcovid19-2.20 + + + Vapur: A Search Engine to Find Related Protein - Compound Pairs in <fixed-case>COVID</fixed-case>-19 Literature + AbdullatifKöksal + HilalDönmez + RızaÖzçelik + ElifOzkirimli + ArzucanÖzgür + Coronavirus Disease of 2019 (COVID-19) created dire consequences globally and triggered an intense scientific effort from different domains. The resulting publications created a huge text collection in which finding the studies related to a biomolecule of interest is challenging for general purpose search engines because the publications are rich in domain specific terminology. 
Here, we present Vapur: an online COVID-19 search engine specifically designed to find related protein - chemical pairs. Vapur is empowered with a relation-oriented inverted index that is able to retrieve and group studies for a query biomolecule with respect to its related entities. The inverted index of Vapur is automatically created with a BioNLP pipeline and integrated with an online user interface. The online interface is designed for the smooth traversal of the current literature by domain researchers and is publicly available at https://tabilab.cmpe.boun.edu.tr/vapur/. + 2020.nlpcovid19-2.21 + + + Knowledge Discovery in <fixed-case>COVID</fixed-case>-19 Research Literature + AlejandroPiad-Morffis + SuilanEstevez-Velarde + Ernesto LuisEstevanell-Valladares + YoanGutiérrez + AndrésMontoyo + RafaelMuñoz + YudiviánAlmeida-Cruz + This paper presents the preliminary results of an ongoing project that analyzes the growing body of scientific research published around the COVID-19 pandemic. In this research, a general-purpose semantic model is used to double annotate a batch of $500$ sentences that were manually selected by the researchers from the CORD-19 corpus. Afterwards, a baseline text-mining pipeline is designed and evaluated via a large batch of $100,959$ sentences. We present a qualitative analysis of the most interesting facts automatically extracted and highlight possible future lines of development. The preliminary results show that general-purpose semantic models are a useful tool for discovering fine-grained knowledge in large corpora of scientific documents. + 2020.nlpcovid19-2.22 + + + Identifying pandemic-related stress factors from social-media posts – <fixed-case>E</fixed-case>ffects on students and young-adults + SachinThukral + SuyashSangwan + ArnabChatterjee + LipikaDey + The COVID-19 pandemic has thrown natural life out of gear across the globe. 
Strict measures are deployed to curb the spread of the virus that is causing it, and the most effective of them have been social isolation. This has led to wide-spread gloom and depression across society but more so among the young and the elderly. There are currently more than 200 million college students in 186 countries worldwide, affected due to the pandemic. The mode of education has changed suddenly, with the rapid adaptation of e-learning, whereby teaching is undertaken remotely and on digital platforms. This study presents insights gathered from social media posts that were posted by students and young adults during the COVID times. Using statistical and NLP techniques, we analyzed the behavioural issues reported by users themselves in their posts in depression related communities on Reddit. We present methodologies to systematically analyze content using linguistic techniques to find out the stress-inducing factors. Online education, losing jobs, isolation from friends and abusive families emerge as key stress factors + 2020.nlpcovid19-2.23 + + + Tracking And Understanding Public Reaction During <fixed-case>COVID</fixed-case>-19: <fixed-case>S</fixed-case>audi <fixed-case>A</fixed-case>rabia As A Use Case + AseelAddawood + AlhanoufAlsuwailem + AliAlohali + DalalAlajaji + MashailAlturki + JaidaAlsuhaibani + FawziahAljabli + The coronavirus disease of 2019 (COVID-19) has a huge impact on economies and societies around the world. While governments are taking extreme measures to reduce the spread of the virus, people are getting affected by these new measures. With restrictions like lockdown and social distancing, it became important to understand the emotional response of the public towards the pandemic. In this paper, we study the reaction of Saudi Arabia citizens towards the pandemic. We utilize a collection of Arabic tweets that were sent during 2020, primarily through hashtags that were originated from Saudi Arabia. 
Our results showed that people had kept a positive reaction towards the pandemic. This positive reaction was at its highest at the beginning of the COVID-19 crisis and started to decline as time passes. Overall, the results showed that people were so supportive of each other through this pandemic. This research can help researchers and policymakers in understanding the emotional effect of a pandemic on societies. + 2020.nlpcovid19-2.24 + + + Characterizing drug mentions in <fixed-case>COVID</fixed-case>-19 <fixed-case>T</fixed-case>witter Chatter + RamyaTekumalla + Juan MBanda + Since the classification of COVID-19 as a global pandemic, there have been many attempts to treat and contain the virus. Although there is no specific antiviral treatment recommended for COVID-19, there are several drugs that can potentially help with symptoms. In this work, we mined a large twitter dataset of 424 million tweets of COVID-19 chatter to identify discourse around drug mentions. While seemingly a straightforward task, due to the informal nature of language use in Twitter, we demonstrate the need of machine learning alongside traditional automated methods to aid in this task. By applying these complementary methods, we are able to recover almost 15% additional data, making misspelling handling a needed task as a pre-processing step when dealing with social media data. + 2020.nlpcovid19-2.25 + + + Content analysis of <fixed-case>P</fixed-case>ersian/<fixed-case>F</fixed-case>arsi Tweets during <fixed-case>COVID</fixed-case>-19 pandemic in <fixed-case>I</fixed-case>ran using <fixed-case>NLP</fixed-case> + PedramHosseini + PooryaHosseini + DavidBroniatowski + Iran, along with China, South Korea, and Italy was among the countries that were hit hard in the first wave of the COVID-19 spread. Twitter is one of the widely-used online platforms by Iranians inside and abroad for sharing their opinion, thoughts, and feelings about a wide range of issues. 
In this study, using more than 530,000 original tweets in Persian/Farsi on COVID-19, we analyzed the topics discussed among users, who are mainly Iranians, to gauge and track the response to the pandemic and how it evolved over time. We applied a combination of manual annotation of a random sample of tweets and topic modeling tools to classify the contents and frequency of each category of topics. We identified the top 25 topics among which living experience under home quarantine emerged as a major talking point. We additionally categorized the broader content of tweets that shows satire, followed by news, is the dominant tweet type among Iranian users. While this framework and methodology can be used to track public response to ongoing developments related to COVID-19, a generalization of this framework can become a useful framework to gauge Iranian public reaction to ongoing policy measures or events locally and internationally. + 2020.nlpcovid19-2.26 + + + Annotating the Pandemic: Named Entity Recognition and Normalisation in <fixed-case>COVID</fixed-case>-19 Literature + NicoColic + LenzFurrer + FabioRinaldi + The COVID-19 pandemic has been accompanied by such an explosive increase in media coverage and scientific publications that researchers find it difficult to keep up. We are presenting a publicly available pipeline to perform named entity recognition and normalisation in parallel to help find relevant publications and to aid in downstream NLP tasks such as text summarisation. In our approach, we are using a dictionary-based system for its high recall in conjunction with two models based on BioBERT for their accuracy. Their outputs are combined according to different strategies depending on the entity type. In addition, we are using a manually crafted dictionary to increase performance for new concepts related to COVID-19. We have previously evaluated our work on the CRAFT corpus, and make the output of our pipeline available on two visualisation platforms. 
+ 2020.nlpcovid19-2.27 + + + <fixed-case>A</fixed-case>sk<fixed-case>M</fixed-case>e: A <fixed-case>LAPPS</fixed-case> <fixed-case>G</fixed-case>rid-based <fixed-case>NLP</fixed-case> Query and Retrieval System for Covid-19 Literature + KeithSuderman + NancyIde + VerhagenMarc + BrentCochran + JamesPustejovsky + In a recent project, the Language Application Grid was augmented to support the mining of scientific publications. The results of that effort have now been repurposed to focus on Covid-19 literature, including modification of the LAPPS Grid “AskMe” query and retrieval engine. We describe the AskMe system and discuss its functionality as compared to other query engines available to search covid-related publications. + 2020.nlpcovid19-2.28 + + + Concept Wikification for <fixed-case>COVID</fixed-case>-19 + PanagiotisLymperopoulos + HaolingQiu + BonanMin + Understanding scientific articles related to COVID-19 requires broad knowledge about concepts such as symptoms, diseases and medicine. Given the very large and ever-growing scientific articles related to COVID-19, it is a daunting task even for experts to recognize the large set of concepts mentioned in these articles. In this paper, we address the problem of concept wikification for COVID-19, which is to automatically recognize mentions of concepts related to COVID-19 in text and resolve them into Wikipedia titles. We develop an approach to curate a COVID-19 concept wikification dataset by mining Wikipedia text and the associated intra-Wikipedia links. We also develop an end-to-end system for concept wikification for COVID-19. Preliminary experiments show very encouraging results. Our dataset, code and pre-trained model are available at github.com/panlybero/Covid19_wikification. 
+ 2020.nlpcovid19-2.29 + + + Developing a Curated Topic Model for <fixed-case>COVID</fixed-case>-19 Medical Research Literature + PhilipResnik + Katherine E.Goodman + MikeMoran + Topic models can facilitate search, navigation, and knowledge discovery in large document collections. However, automatic generation of topic models can produce results that fail to meet the needs of users. We advocate for a set of user-focused desiderata in topic modeling for the COVID-19 literature, and describe an effort in progress to develop a curated topic model for COVID-19 articles informed by subject matter expertise and the way medical researchers engage with medical literature. + 2020.nlpcovid19-2.30 + + + Collecting Verified <fixed-case>COVID</fixed-case>-19 Question Answer Pairs + AdamPoliak + MaxFleming + CashCostello + Kenton WMurray + MahsaYarmohammadi + ShivaniPandya + DariusIrani + MilindAgarwal + UditSharma + ShuoSun + NicolaIvanov + LingxiShang + KaushikSrinivasan + SeolhwaLee + XuHan + SmishaAgarwal + JoãoSedoc + We release a dataset of over 2,100 COVID19 related Frequently asked Question-Answer pairs scraped from over 40 trusted websites. We include an additional 24,000 questions pulled from online sources that have been aligned by experts with existing answered questions from our dataset. This paper describes our efforts in collecting the dataset and summarizes the resulting data. Our dataset is automatically updated daily and available at https://github.com/JHU-COVID-QA/scraping-qas. So far, this data has been used to develop a chatbot providing users information about COVID-19. We encourage others to build analytics and tools upon this dataset as well. 
+ 2020.nlpcovid19-2.31 + + + A Comprehensive Dictionary and Term Variation Analysis for <fixed-case>COVID</fixed-case>-19 and <fixed-case>SARS</fixed-case>-<fixed-case>C</fixed-case>o<fixed-case>V</fixed-case>-2 + RobertLeaman + ZhiyongLu + The number of unique terms in the scientific literature used to refer to either SARS-CoV-2 or COVID-19 is remarkably large and has continued to increase rapidly despite well-established standardized terms. This high degree of term variation makes high recall identification of these important entities difficult. In this manuscript we present an extensive dictionary of terms used in the literature to refer to SARS-CoV-2 and COVID-19. We use a rule-based approach to iteratively generate new term variants, then locate these variants in a large text corpus. We compare our dictionary to an extensive collection of terminological resources, demonstrating that our resource provides a substantial number of additional terms. We use our dictionary to analyze the usage of SARS-CoV-2 and COVID-19 terms over time and show that the number of unique terms continues to grow rapidly. Our dictionary is freely available at https://github.com/ncbi-nlp/CovidTermVar. + 2020.nlpcovid19-2.32 + + + Using the Poly-encoder for a <fixed-case>COVID</fixed-case>-19 Question Answering System + SeolhwaLee + JoãoSedoc + To combat misinformation regarding COVID-19 during this unprecedented pandemic, we propose a conversational agent that answers questions related to COVID-19. We adapt the Poly-encoder (Humeau et al., 2020) model for informational retrieval from FAQs. We show that after fine-tuning, the Poly-encoder can achieve a higher F1 score. We make our code publicly available for other researchers to use. 
+ 2020.nlpcovid19-2.33 + + + <fixed-case>W</fixed-case>eibo-<fixed-case>COV</fixed-case>: A Large-Scale <fixed-case>COVID</fixed-case>-19 Social Media Dataset from <fixed-case>W</fixed-case>eibo + YongHu + HeyanHuang + AnfanChen + Xian-LingMao + With the rapid development of COVID-19 around the world, people are requested to maintain “social distance” and “stay at home”. In this scenario, extensive social interactions transfer to cyberspace, especially on social media platforms like Twitter and Sina Weibo. People generate posts to share information, express opinions and seek help during the pandemic outbreak, and these kinds of data on social media are valuable for studies to prevent COVID-19 transmissions, such as early warning and outbreaks detection. Therefore, in this paper, we release a novel and fine-grained large-scale COVID-19 social media dataset collected from Sina Weibo, named Weibo-COV, contains more than 40 million posts ranging from December 1, 2019 to April 30, 2020. Moreover, this dataset includes comprehensive information nuggets like post-level information, interactive information, location information, and repost network. We hope this dataset can promote studies of COVID-19 from multiple perspectives and enable better and rapid researches to suppress the spread of this pandemic. + 2020.nlpcovid19-2.34 + + + Detecting Emerging Symptoms of <fixed-case>COVID</fixed-case>-19 using Context-based <fixed-case>T</fixed-case>witter Embeddings + RoshanSantosh + H.Schwartz + JohannesEichstaedt + LyleUngar + Sharath ChandraGuntuku + In this paper, we present an iterative graph-based approach for the detection of symptoms of COVID-19, the pathology of which seems to be evolving. More generally, the method can be applied to finding context-specific words and texts (e.g. symptom mentions) in large imbalanced corpora (e.g. all tweets mentioning #COVID-19). 
Given the novelty of COVID-19, we also test if the proposed approach generalizes to the problem of detecting Adverse Drug Reaction (ADR). We find that the approach applied to Twitter data can detect symptom mentions substantially before to their being reported by the Centers for Disease Control (CDC). + 2020.nlpcovid19-2.35 + + + Hate and Toxic Speech Detection in the Context of Covid-19 Pandemic using <fixed-case>XAI</fixed-case>: Ongoing Applied Research + DavidHardage + PeymanNajafirad + As social distancing, self-quarantines, and travel restrictions have shifted a lot of pandemic conversations to social media so does the spread of hate speech. While recent machine learning solutions for automated hate and offensive speech identification are available on Twitter, there are issues with their interpretability. We propose a novel use of learned feature importance which improves upon the performance of prior state-of-the-art text classification techniques, while producing more easily interpretable decisions. We also discuss both technical and practical challenges that remain for this task. + 2020.nlpcovid19-2.36 + + + Real-time Classification, Geolocation and Interactive Visualization of <fixed-case>COVID</fixed-case>-19 Information Shared on Social Media to Better Understand Global Developments + AndreiMircea + As people communicate on social media during COVID-19, it can be an invaluable source of useful and up-to-date information. However, the large volume and noise-to-signal ratio of social media can make this impractical. We present a prototype dashboard for the real-time classification, geolocation and interactive visualization of COVID-19 tweets that addresses these issues. We also describe a novel L2 classification layer that outperforms linear layers on a dataset of respiratory virus tweets. + 2020.nlpcovid19-2.37 + +
diff --git a/data/xml/2020.nlpcss.xml b/data/xml/2020.nlpcss.xml new file mode 100644 index 0000000000..2587733342 --- /dev/null +++ b/data/xml/2020.nlpcss.xml @@ -0,0 +1,246 @@ + + + + + Proceedings of the Fourth Workshop on Natural Language Processing and Computational Social Science + DavidBamman + DirkHovy + DavidJurgens + BrendanO'Connor + SvitlanaVolkova + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.nlpcss-1.0 + + + Measuring Linguistic Diversity During <fixed-case>COVID</fixed-case>-19 + JonathanDunn + TomCoupe + BenjaminAdams + 1–10 + Computational measures of linguistic diversity help us understand the linguistic landscape using digital language data. The contribution of this paper is to calibrate measures of linguistic diversity using restrictions on international travel resulting from the COVID-19 pandemic. Previous work has mapped the distribution of languages using geo-referenced social media and web data. The goal, however, has been to describe these corpora themselves rather than to make inferences about underlying populations. This paper shows that a difference-in-differences method based on the Herfindahl-Hirschman Index can identify the bias in digital corpora that is introduced by non-local populations. These methods tell us where significant changes have taken place and whether this leads to increased or decreased diversity. This is an important step in aligning digital corpora like social media with the real-world populations that have produced them. + 2020.nlpcss-1.1 + 2020.nlpcss-1.1.OptionalSupplementaryMaterial.zip + + + Using <fixed-case>BERT</fixed-case> for Qualitative Content Analysis in Psychosocial Online Counseling + PhilippGrandeit + CarolynHaberkern + MaximilianeLang + JensAlbrecht + RobertLehmann + 11–23 + Qualitative content analysis is a systematic method commonly used in the social sciences to analyze textual data from interviews or online discussions. However, this method usually requires high expertise and manual effort because human coders need to read, interpret, and manually annotate text passages. This is especially true if the system of categories used for annotation is complex and semantically rich. Therefore, qualitative content analysis could benefit greatly from automated coding. 
In this work, we investigate the usage of machine learning-based text classification models for automatic coding in the area of psycho-social online counseling. We developed a system of over 50 categories to analyze counseling conversations, labeled over 10.000 text passages manually, and evaluated the performance of different machine learning-based classifiers against human coders. + 2020.nlpcss-1.2 + + + Swimming with the Tide? Positional Claim Detection across Political Text Types + NicoBlokker + ErenayDayanik + GabriellaLapesa + SebastianPadó + 24–34 + Manifestos are official documents of political parties, providing a comprehensive topical overview of the electoral programs. Voters, however, seldom read them and often prefer other channels, such as newspaper articles, to understand the party positions on various policy issues. The natural question to ask is how compatible these two formats (manifesto and newspaper reports) are in their representation of party positioning. We address this question with an approach that combines political science (manual annotation and analysis) and natural language processing (supervised claim identification) in a cross-text type setting: we train a classifier on annotated newspaper data and test its performance on manifestos. Our findings show a) strong performance for supervised classification even across text types and b) a substantive overlap between the two formats in terms of party positioning, with differences regarding the salience of specific issues. + 2020.nlpcss-1.3 + + + Does Social Support (Expressed in Post Titles) Elicit Comments in Online Substance Use Recovery Forums? + AnietieAndy + Sharath ChandraGuntuku + 35–40 + Individuals recovering from substance use often seek social support (emotional and informational) on online recovery forums, where they can both write and comment on posts, expressing their struggles and successes. 
A common challenge in these forums is that certain posts (some of which may be support seeking) receive no comments. In this work, we use data from two Reddit substance recovery forums: /r/Leaves and /r/OpiatesRecovery, to determine the relationship between the social supports expressed in the titles of posts and the number of comments they receive. We show that the types of social support expressed in post titles that elicit comments vary from one substance use recovery forum to the other. + 2020.nlpcss-1.4 + + + <fixed-case>I</fixed-case> miss you babe: Analyzing Emotion Dynamics During <fixed-case>COVID</fixed-case>-19 Pandemic + Hui Xian LynnetteNg + Roy Ka-WeiLee + Md RabiulAwal + 41–49 + With the world on a lockdown due to the COVID-19 pandemic, this paper studies emotions expressed on Twitter. Using a combined strategy of time series analysis of emotions augmented by tweet topics, this study provides an insight into emotion transitions during the pandemic. After tweets are annotated with dominant emotions and topics, a time-series emotion analysis is used to identify disgust and anger as the most commonly identified emotions. Through longitudinal analysis of each user, we construct emotion transition graphs, observing key transitions between disgust and anger, and self-transitions within anger and disgust emotional states. Observing user patterns through clustering of user longitudinal analyses reveals emotional transitions fall into four main clusters: (1) erratic motion over short period of time, (2) disgust -> anger, (3) optimism -> joy. (4) erratic motion over a prolonged period. Finally, we propose a method for predicting users subsequent topic, and by consequence their emotions, through constructing an Emotion Topic Hidden Markov Model, augmenting emotion transition states with topic information. Results suggests that the predictions fare better than baselines, spurring directions of predicting emotional states based on Twitter posts. 
+ 2020.nlpcss-1.5 + + + Assessing population-level symptoms of anxiety, depression, and suicide risk in real time using <fixed-case>NLP</fixed-case> applied to social media data + AlexFine + PatrickCrutchley + JennyBlase + JoshuaCarroll + GlenCoppersmith + 50–54 + Prevailing methods for assessing population-level mental health require costly collection of large samples of data through instruments such as surveys, and are thus slow to reflect current, rapidly changing social conditions. This constrains how easily population-level mental health data can be integrated into health and policy decision-making. Here, we demonstrate that natural language processing applied to publicly-available social media data can provide real-time estimates of psychological distress in the population (specifically, English-speaking Twitter users in the US). We examine population-level changes in linguistic correlates of mental health symptoms in response to the COVID-19 pandemic and to the killing of George Floyd. As a case study, we focus on social media data from healthcare providers, compared to a control sample. Our results provide a concrete demonstration of how the tools of computational social science can be applied to provide real-time or near-real-time insight into the impact of public events on mental health. + 2020.nlpcss-1.6 + + + Viable Threat on News Reading: Generating Biased News Using Natural Language Models + SaurabhGupta + Hong HuyNguyen + JunichiYamagishi + IsaoEchizen + 55–65 + Recent advancements in natural language generation has raised serious concerns. High-performance language models are widely used for language generation tasks because they are able to produce fluent and meaningful sentences. These models are already being used to create fake news. They can also be exploited to generate biased news, which can then be used to attack news aggregators to change their reader’s behavior and influence their bias. 
In this paper, we use a threat model to demonstrate that the publicly available language models can reliably generate biased news content based on an input original news. We also show that a large number of high-quality biased news articles can be generated using controllable text generation. A subjective evaluation with 80 participants demonstrated that the generated biased news is generally fluent, and a bias evaluation with 24 participants demonstrated that the bias (left or right) is usually evident in the generated articles and can be easily identified. + 2020.nlpcss-1.7 + + + Unsupervised Anomaly Detection in Parole Hearings using Language Models + GrahamTodd + CatalinVoss + JennyHong + 66–71 + Each year, thousands of roughly 150-page parole hearing transcripts in California go unread because legal experts lack the time to review them. Yet, reviewing transcripts is the only means of public oversight in the parole process. To assist reviewers, we present a simple unsupervised technique for using language models (LMs) to identify procedural anomalies in long-form legal text. Our technique highlights unusual passages that suggest further review could be necessary. We utilize a contrastive perplexity score to identify passages, defined as the scaled difference between its perplexities from two LMs, one fine-tuned on the target (parole) domain, and another pre-trained on out-of-domain text to normalize for grammatical or syntactic anomalies. We present quantitative analysis of the results and note that our method has identified some important cases for review. We are also excited about potential applications in unsupervised anomaly detection, and present a brief analysis of results for detecting fake TripAdvisor reviews. 
+ 2020.nlpcss-1.8 + 2020.nlpcss-1.8.OptionalSupplementaryMaterial.zip + + + Identifying Worry in <fixed-case>T</fixed-case>witter: Beyond Emotion Analysis + ReyhaVerma + Christianvon der Weth + JithinVachery + MohanKankanhalli + 72–82 + Identifying the worries of individuals and societies plays a crucial role in providing social support and enhancing policy decision-making. Due to the popularity of social media platforms such as Twitter, users share worries about personal issues (e.g., health, finances, relationships) and broader issues (e.g., changes in society, environmental concerns, terrorism) freely. In this paper, we explore and evaluate a wide range of machine learning models to predict worry on Twitter. While this task has been closely associated with emotion prediction, we argue and show that identifying worry needs to be addressed as a separate task given the unique challenges associated with it. We conduct a user study to provide evidence that social media posts express two basic kinds of worry – normative and pathological – as stated in psychology literature. In addition, we show that existing emotion detection techniques underperform, especially while capturing normative worry. Finally, we discuss the current limitations of our approach and propose future applications of the worry identification system. + 2020.nlpcss-1.9 + + + Text Zoning and Classification for Job Advertisements in <fixed-case>G</fixed-case>erman, <fixed-case>F</fixed-case>rench and <fixed-case>E</fixed-case>nglish + Ann-SophieGnehm + SimonClematide + 83–93 + We present experiments to structure job ads into text zones and classify them into professions, industries and management functions, thereby facilitating social science analyses on labor market demand. Our main contributions are empirical findings on the benefits of contextualized embeddings and the potential of multi-task models for this purpose. 
With contextualized in-domain embeddings in BiLSTM-CRF models, we reach an accuracy of 91% for token-level text zoning and outperform previous approaches. A multi-tasking BERT model performs well for our classification tasks. We further compare transfer approaches for our multilingual data. + 2020.nlpcss-1.10 + 2020.nlpcss-1.10.OptionalSupplementaryMaterial.zip + + + Is <fixed-case>W</fixed-case>ikipedia succeeding in reducing gender bias? Assessing changes in gender bias in <fixed-case>W</fixed-case>ikipedia using word embeddings + Katja GeertruidaSchmahl + Tom JulianViering + StavrosMakrodimitris + ArmanNaseri Jahfari + DavidTax + MarcoLoog + 94–103 + Large text corpora used for creating word embeddings (vectors which represent word meanings) often contain stereotypical gender biases. As a result, such unwanted biases will typically also be present in word embeddings derived from such corpora and downstream applications in the field of natural language processing (NLP). To minimize the effect of gender bias in these settings, more insight is needed when it comes to where and how biases manifest themselves in the text corpora employed. This paper contributes by showing how gender bias in word embeddings from Wikipedia has developed over time. Quantifying the gender bias over time shows that art related words have become more female biased. Family and science words have stereotypical biases towards respectively female and male words. These biases seem to have decreased since 2006, but these changes are not more extreme than those seen in random sets of words. Career related words are more strongly associated with male than with female, this difference has only become smaller in recently written articles. These developments provide additional understanding of what can be done to make Wikipedia more gender neutral and how important time of writing can be when considering biases in word embeddings trained from Wikipedia or from other text corpora. 
+ 2020.nlpcss-1.11 + + + Effects of Anonymity on Comment Persuasiveness in <fixed-case>W</fixed-case>ikipedia Articles for Deletion Discussions + YiminXiao + LuXiao + 104–115 + It has been shown that anonymity affects various aspects of online communications such as message credibility, the trust among communicators, and the participants’ accountability and reputation. Anonymity influences social interactions in online communities in these many ways, which can lead to influences on opinion change and the persuasiveness of a message. Prior studies also suggest that the effect of anonymity can vary in different online communication contexts and online communities. In this study, we focus on Wikipedia Articles for Deletion (AfD) discussions as an example of online collaborative communities to study the relationship between anonymity and persuasiveness in this context. We find that in Wikipedia AfD discussions, more identifiable users tend to be more persuasive. The higher persuasiveness can be related to multiple aspects, including linguistic features of the comments, the user’s motivation to participate, persuasive skills the user learns over time, and the user’s identity and credibility established in the community through participation. + 2020.nlpcss-1.12 + + + Uncertainty over Uncertainty: Investigating the Assumptions, Annotations, and Text Measurements of Economic Policy Uncertainty + KatherineKeith + ChristophTeichmann + BrendanO’Connor + EdgarMeij + 116–131 + Methods and applications are inextricably linked in science, and in particular in the domain of text-as-data. In this paper, we examine one such text-as-data application, an established economic index that measures economic policy uncertainty from keyword occurrences in news. This index, which is shown to correlate with firm investment, employment, and excess market returns, has had substantive impact in both the private sector and academia. 
Yet, as we revisit and extend the original authors’ annotations and text measurements we find interesting text-as-data methodological research questions: (1) Are annotator disagreements a reflection of ambiguity in language? (2) Do alternative text measurements correlate with one another and with measures of external predictive validity? We find for this application (1) some annotator disagreements of economic policy uncertainty can be attributed to ambiguity in language, and (2) switching measurements from keyword-matching to supervised machine learning classifiers results in low correlation, a concerning implication for the validity of the index. + 2020.nlpcss-1.13 + + + Recalibrating classifiers for interpretable abusive content detection + BertieVidgen + ScottHale + SamStaton + TomMelham + HelenMargetts + OhadKammar + MarcinSzymczak + 132–138 + We investigate the use of machine learning classifiers for detecting online abuse in empirical research. We show that uncalibrated classifiers (i.e. where the ‘raw’ scores are used) align poorly with human evaluations. This limits their use for understanding the dynamics, patterns and prevalence of online abuse. We examine two widely used classifiers (created by Perspective and Davidson et al.) on a dataset of tweets directed against candidates in the UK’s 2017 general election. A Bayesian approach is presented to recalibrate the raw scores from the classifiers, using probabilistic programming and newly annotated data. We argue that interpretability evaluation and recalibration is integral to the application of abusive content classifiers. + 2020.nlpcss-1.14 + 2020.nlpcss-1.14.OptionalSupplementaryMaterial.zip + + + Predicting independent living outcomes from written reports of social workers + AngelikaMaier + PhilippCimiano + 139–148 + In social care environments, the main goal of social workers is to foster independent living by their clients. 
An important task is thus to monitor progress towards reaching independence in different areas of their patients’ life. To support this task, we present an approach that extracts indications of independence on different life aspects from the day-to-day documentation that social workers create. We describe the process of collecting and annotating a corresponding corpus created from data records of two social work institutions with a focus on disability care. We show that the agreement on the task of annotating the observations of social workers with respect to discrete independent levels yields a high agreement of .74 as measured by Fleiss’ Kappa. We present a classification approach towards automatically classifying an observation into the discrete independence levels and present results for different types of classifiers. Against our original expectation, we show that we reach F-Measures (macro) of 95% averaged across topics, showing that this task can be automatically solved. + 2020.nlpcss-1.15 + + + Analyzing Political Bias and Unfairness in News Articles at Different Levels of Granularity + Wei-FanChen + KhalidAl Khatib + HenningWachsmuth + BennoStein + 149–154 + Media is an indispensable source of information and opinion, shaping the beliefs and attitudes of our society. Obviously, media portals can also provide overly biased content, e.g., by reporting on political events in a selective or incomplete manner. A relevant question hence is whether and how such a form of unfair news coverage can be exposed. This paper addresses the automatic detection of bias, but it goes one step further in that it explores how political bias and unfairness are manifested linguistically. We utilize a new corpus of 6964 news articles with labels derived from adfontesmedia.com to develop a neural model for bias assessment. 
Analyzing the model on article excerpts, we find insightful bias patterns at different levels of text granularity, from single words to the whole article discourse. + 2020.nlpcss-1.16 + + + Mapping Local News Coverage: Precise location extraction in textual news content using fine-tuned <fixed-case>BERT</fixed-case> based language model + SarangGupta + KumariNishu + 155–162 + Mapping local news coverage from textual content is a challenging problem that requires extracting precise location mentions from news articles. While traditional named entity taggers are able to extract geo-political entities and certain non geo-political entities, they cannot recognize precise location mentions such as addresses, streets and intersections that are required to accurately map the news article. We fine-tune a BERT-based language model for achieving high level of granularity in location extraction. We incorporate the model into an end-to-end tool that further geocodes the extracted locations for the broader objective of mapping news coverage. + 2020.nlpcss-1.17 + + + Foreigner-directed speech is simpler than native-directed: Evidence from social media + AleksandrsBerdicevskis + 163–172 + I test two hypotheses that play an important role in modern sociolinguistics and language evolution studies: first, that non-native production is simpler than native; second, that production addressed to non-native speakers is simpler than that addressed to natives. The second hypothesis is particularly important for theories about contact-induced simplification, since the accommodation to non-natives may explain how the simplification can spread from adult learners to the whole community. To test the hypotheses, I create a very large corpus of native and non-native written speech in four languages (English, French, Italian, Spanish), extracting data from an internet forum where native languages of the participants are known and the structure of the interactions can be inferred. 
The corpus data yield inconsistent evidence with respect to the first hypothesis, but largely support the second one, suggesting that foreigner-directed speech is indeed simpler than native-directed. Importantly, when testing the first hypothesis, I contrast production of different speakers, which can introduce confounds and is a likely reason for the inconsistencies. When testing the second hypothesis, the comparison is always within the production of the same speaker (but with different addressees), which makes it more reliable. + 2020.nlpcss-1.18 + + + Diachronic Embeddings for People in the News + FelixHennig + StevenWilson + 173–183 + Previous English-language diachronic change models based on word embeddings have typically used single tokens to represent entities, including names of people. This leads to issues with both ambiguity (resulting in one embedding representing several distinct and unrelated people) and unlinked references (leading to several distinct embeddings which represent the same person). In this paper, we show that using named entity recognition and heuristic name linking steps before training a diachronic embedding model leads to more accurate representations of references to people, as compared to the token-only baseline. In large news corpus of articles from The Guardian, we provide examples of several types of analysis that can be performed using these new embeddings. Further, we show that real world events and context changes can be detected using our proposed model. + 2020.nlpcss-1.19 + + + Social media data as a lens onto care-seeking behavior among women veterans of the <fixed-case>US</fixed-case> armed forces + KacieKelly + AlexFine + GlenCoppersmith + 184–192 + In this article, we examine social media data as a lens onto support-seeking among women veterans of the US armed forces. 
Social media data hold a great deal of promise as a source of information on needs and support-seeking among individuals who are excluded from or systematically prevented from accessing clinical or other institutions ostensibly designed to support them. We apply natural language processing (NLP) techniques to more than 3 million Tweets collected from 20,000 Twitter users. We find evidence that women veterans are more likely to use social media to seek social and community engagement and to discuss mental health and veterans’ issues significantly more frequently than their male counterparts. By contrast, male veterans tend to use social media to amplify political ideologies or to engage in partisan debate. Our results have implications for how organizations can provide outreach and services to this uniquely vulnerable population, and illustrate the utility of non-traditional observational data sources such as social media to understand the needs of marginalized groups. + 2020.nlpcss-1.20 + + + Understanding Weekly <fixed-case>COVID</fixed-case>-19 Concerns through Dynamic Content-Specific <fixed-case>LDA</fixed-case> Topic Modeling + MohammadzamanZamani + H. AndrewSchwartz + JohannesEichstaedt + Sharath ChandraGuntuku + AdithyaVirinchipuram Ganesan + SeanClouston + SalvatoreGiorgi + 193–198 + The novelty and global scale of the COVID-19 pandemic has led to rapid societal changes in a short span of time. As government policy and health measures shift, public perceptions and concerns also change, an evolution documented within discourse on social media. We propose a dynamic content-specific LDA topic modeling technique that can help to identify different domains of COVID-specific discourse that can be used to track societal shifts in concerns or views. 
Our experiments show that these model-derived topics are more coherent than standard LDA topics, and also provide new features that are more helpful in prediction of COVID-19 related outcomes including social mobility and unemployment rate. + 2020.nlpcss-1.21 + + + Emoji and Self-Identity in <fixed-case>T</fixed-case>witter Bios + JinhangLi + GiorgosLonginos + StevenWilson + WalidMagdy + 199–211 + Emoji are widely used to express emotions and concepts on social media, and prior work has shown that users’ choice of emoji reflects the way that they wish to present themselves to the world. Emoji usage is typically studied in the context of posts made by users, and this view has provided important insights into phenomena such as emotional expression and self-representation. In addition to making posts, however, social media platforms like Twitter allow for users to provide a short bio, which is an opportunity to briefly describe their account as a whole. In this work, we focus on the use of emoji in these bio statements. We explore the ways in which users include emoji in these self-descriptions, finding different patterns than those observed around emoji usage in tweets. We examine the relationships between emoji used in bios and the content of users’ tweets, showing that the topics and even the average sentiment of tweets varies for users with different emoji in their bios. Lastly, we confirm that homophily effects exist with respect to the types of emoji that are included in bios of users and their followers. + 2020.nlpcss-1.22 + + + Analyzing Gender Bias within Narrative Tropes + DhruvilGala + Mohammad OmarKhursheed + HannahLerner + BrendanO’Connor + MohitIyyer + 212–217 + Popular media reflects and reinforces societal biases through the use of tropes, which are narrative elements, such as archetypal characters and plot arcs, that occur frequently across media. In this paper, we specifically investigate gender bias within a large collection of tropes. 
To enable our study, we crawl tvtropes.org, an online user-created repository that contains 30K tropes associated with 1.9M examples of their occurrences across film, television, and literature. We automatically score the “genderedness” of each trope in our TVTROPES dataset, which enables an analysis of (1) highly-gendered topics within tropes, (2) the relationship between gender bias and popular reception, and (3) how the gender of a work’s creator correlates with the types of tropes that they use. + 2020.nlpcss-1.23 + +
+
diff --git a/data/xml/2020.nlposs.xml b/data/xml/2020.nlposs.xml new file mode 100644 index 0000000000..a86032de48 --- /dev/null +++ b/data/xml/2020.nlposs.xml @@ -0,0 +1,203 @@ + + + + + Proceedings of Second Workshop for NLP Open Source Software (NLP-OSS) + Eunjeong L.Park + MasatoHagiwara + DmitrijsMilajevs + NelsonLiu + GeetickaChauhan + LilingTan + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.nlposs-1.0 + + + A Framework to Assist Chat Operators of Mental Healthcare Services + ThiagoMadeira + HederBernardino + JairoFrancisco De Souza + HenriqueGomide + NatháliaMunck Machado + BrunoMarcos Pinheiro da Silva + AlexandreVieira Pereira Pacelli + 1–7 + Conversational agents can be used to make diagnoses, classify mental states, promote health education, and provide emotional support. The benefits of adopting conversational agents include widespread access, increased treatment engagement, and improved patient relationships with the intervention. We propose here a framework to assist chat operators of mental healthcare services, instead of a fully automated conversational agent. This design eases to avoid the adverse effects of applying chatbots in mental healthcare. The proposed framework is capable of improving the quality and reducing the time of interactions via chat between a user and a chat operator. We also present a case study in the context of health promotion on reducing tobacco use. The proposed framework uses artificial intelligence, specifically natural language processing (NLP) techniques, to classify messages from chat users. A list of suggestions is offered to the chat operator, with topics to be discussed in the session. These suggestions were created based on service protocols and the classification of previous chat sessions. The operator can also edit the suggested messages. Data collected can be used in the future to improve the quality of the suggestions offered. + 2020.nlposs-1.1 + + + <fixed-case>ARBML</fixed-case>: Democritizing <fixed-case>A</fixed-case>rabic Natural Language Processing Tools + ZaidAlyafeai + MagedAl-Shaibani + 8–13 + Automating natural language understanding is a lifelong quest addressed for decades. With the help of advances in machine learning and particularly, deep learning, we are able to produce state of the art models that can imitate human interactions with languages. 
Unfortunately, these advances are controlled by the availability of language resources. Arabic advances in this field , although it has a great potential, are still limited. This is apparent in both research and development. In this paper, we showcase some NLP models we trained for Arabic. We also present our methodology and pipeline to build such models from data collection, data preprocessing, tokenization and model deployment. These tools help in the advancement of the field and provide a systematic approach for extending NLP tools to many languages. + 2020.nlposs-1.2 + + + <fixed-case>CLEVR</fixed-case> Parser: A Graph Parser Library for Geometric Learning on Language Grounded Image Scenes + RaeidSaqur + AmeetDeshpande + 14–19 + The CLEVR dataset has been used extensively in language grounded visual reasoning in Machine Learning (ML) and Natural Language Processing (NLP). We present a graph parser library for CLEVR, that provides functionalities for object-centric attributes and relationships extraction, and construction of structural graph representations for dual modalities. Structural order-invariant representations enable geometric learning and can aid in downstream tasks like language grounding to vision, robotics, compositionality, interpretability, and computational grammar construction. We provide three extensible main components – parser, embedder, and visualizer that can be tailored to suit specific learning setups. We also provide out-of-the-box functionality for seamless integration with popular deep graph neural network (GNN) libraries. Additionally, we discuss downstream usage and applications of the library, and how it can accelerate research for the NLP community. 
+ 2020.nlposs-1.3 + 2020.nlposs-1.3.OptionalSupplementaryMaterial.zip + + + End-to-end <fixed-case>NLP</fixed-case> Pipelines in Rust + GuillaumeBecquin + 20–25 + The recent progress in natural language processing research has been supported by the development of a rich open source ecosystem in Python. Libraries allowing NLP practitioners but also non-specialists to leverage state-of-the-art models have been instrumental in the democratization of this technology. The maturity of the open-source NLP ecosystem however varies between languages. This work proposes a new open-source library aimed at bringing state-of-the-art NLP to Rust. Rust is a systems programming language for which the foundations required to build machine learning applications are available but still lacks ready-to-use, end-to-end NLP libraries. The proposed library, rust-bert, implements modern language models and ready-to-use pipelines (for example translation or summarization). This allows further development by the Rust community from both NLP experts and non-specialists. It is hoped that this library will accelerate the development of the NLP ecosystem in Rust. The library is under active development and available at https://github.com/guillaume-be/rust-bert. + 2020.nlposs-1.4 + + + Fair Embedding Engine: A Library for Analyzing and Mitigating Gender Bias in Word Embeddings + VaibhavKumar + TenzinBhotia + VaibhavKumar + 26–31 + Non-contextual word embedding models have been shown to inherit human-like stereotypical biases of gender, race and religion from the training corpora. To counter this issue, a large body of research has emerged which aims to mitigate these biases while keeping the syntactic and semantic utility of embeddings intact. This paper describes Fair Embedding Engine (FEE), a library for analysing and mitigating gender bias in word embeddings. 
FEE combines various state of the art techniques for quantifying, visualising and mitigating gender bias in word embeddings under a standard abstraction. FEE will aid practitioners in fast track analysis of existing debiasing methods on their embedding models. Further, it will allow rapid prototyping of new methods by evaluating their performance on a suite of standard metrics. + 2020.nlposs-1.5 + + + Flexible retrieval with <fixed-case>NMSLIB</fixed-case> and <fixed-case>F</fixed-case>lex<fixed-case>N</fixed-case>eu<fixed-case>ART</fixed-case> + LeonidBoytsov + EricNyberg + 32–43 + Our objective is to introduce to the NLP community NMSLIB, describe a new retrieval toolkit FlexNeuART, as well as their integration capabilities. NMSLIB, while being one the fastest k-NN search libraries, is quite generic and supports a variety of distance/similarity functions. Because the library relies on the distance-based structure-agnostic algorithms, it can be further extended by adding new distances. FlexNeuART is a modular, extendible and flexible toolkit for candidate generation in IR and QA applications, which supports mixing of classic and neural ranking signals. FlexNeuART can efficiently retrieve mixed dense and sparse representations (with weights learned from training data), which is achieved by extending NMSLIB. In that, other retrieval systems work with purely sparse representations (e.g., Lucene), purely dense representations (e.g., FAISS and Annoy), or only perform mixing at the re-ranking stage. + 2020.nlposs-1.6 + + + fugashi, a Tool for Tokenizing <fixed-case>J</fixed-case>apanese in Python + PaulMcCann + 44–51 + Recent years have seen an increase in the number of large-scale multilingual NLP projects. However, even in such projects, languages with special processing requirements are often excluded. One such language is Japanese. 
Japanese is written without spaces, tokenization is non-trivial, and while high quality open source tokenizers exist they can be hard to use and lack English documentation. This paper introduces fugashi, a MeCab wrapper for Python, and gives an introduction to tokenizing Japanese. + 2020.nlposs-1.7 + + + Going Beyond <fixed-case>T</fixed-case>-<fixed-case>SNE</fixed-case>: Exposing whatlies in Text Embeddings + VincentWarmerdam + ThomasKober + RachaelTatman + 52–60 + We introduce whatlies, an open source toolkit for visually inspecting word and sentence embeddings. The project offers a unified and extensible API with current support for a range of popular embedding backends including spaCy, tfhub, huggingface transformers, gensim, fastText and BytePair embeddings. The package combines a domain specific language for vector arithmetic with visualisation tools that make exploring word embeddings more intuitive and concise. It offers support for many popular dimensionality reduction techniques as well as many interactive visualisations that can either be statically exported or shared via Jupyter notebooks. The project documentation is available from https://rasahq.github.io/whatlies/. + 2020.nlposs-1.8 + + + Howl: A Deployed, Open-Source Wake Word Detection System + RaphaelTang + JaejunLee + AfsanehRazi + JuliaCambre + IanBicking + JofishKaye + JimmyLin + 61–65 + We describe Howl, an open-source wake word detection toolkit with native support for open speech datasets such as Mozilla Common Voice (MCV) and Google Speech Commands (GSC). We report benchmark results of various models supported by our toolkit on GSC and our own freely available wake word detection dataset, built from MCV. One of our models is deployed in Firefox Voice, a plugin enabling speech interactivity for the Firefox web browser. Howl represents, to the best of our knowledge, the first fully productionized, open-source wake word detection toolkit with a web browser deployment target. 
Our codebase is at howl.ai. + 2020.nlposs-1.9 + + + i<fixed-case>NLTK</fixed-case>: Natural Language Toolkit for Indic Languages + GauravArora + 66–71 + We present iNLTK, an open-source NLP library consisting of pre-trained language models and out-of-the-box support for Data Augmentation, Textual Similarity, Sentence Embeddings, Word Embeddings, Tokenization and Text Generation in 13 Indic Languages. By using pre-trained models from iNLTK for text classification on publicly available datasets, we significantly outperform previously reported results. On these datasets, we also show that by using pre-trained models and data augmentation from iNLTK, we can achieve more than 95% of the previous best performance by using less than 10% of the training data. iNLTK is already being widely used by the community and has 40,000+ downloads, 600+ stars and 100+ forks on GitHub. The library is available at https://github.com/goru001/inltk. + 2020.nlposs-1.10 + + + <fixed-case>KLPT</fixed-case> – <fixed-case>K</fixed-case>urdish Language Processing Toolkit + SinaAhmadi + 72–84 + Despite the recent advances in applying language-independent approaches to various natural language processing tasks thanks to artificial intelligence, some language-specific tools are still essential to process a language in a viable manner. Kurdish language is a less-resourced language with a remarkable diversity in dialects and scripts and lacks basic language processing tools. To address this issue, we introduce a language processing toolkit to handle such a diversity in an efficient way. Our toolkit is composed of fundamental components such as text preprocessing, stemming, tokenization, lemmatization and transliteration and is able to get further extended by future developers. The project is publicly available. 
+ 2020.nlposs-1.11 + + + Open <fixed-case>K</fixed-case>orean Corpora: A Practical Report + Won IkCho + SangwhanMoon + YoungsookSong + 85–93 + Korean is often referred to as a low-resource language in the research community. While this claim is partially true, it is also because the availability of resources is inadequately advertised and curated. This work curates and reviews a list of Korean corpora, first describing institution-level resource development, then further iterate through a list of current open datasets for different types of tasks. We then propose a direction on how open-source dataset construction and releases should be done for less-resourced languages to promote research. + 2020.nlposs-1.12 + + + Open-Source Morphology for Endangered Mordvinic Languages + JackRueter + MikaHämäläinen + NikoPartanen + 94–100 + This document describes shared development of finite-state description of two closely related but endangered minority languages, Erzya and Moksha. It touches upon morpholexical unity and diversity of the two languages and how this provides a motivation for shared open-source FST development. We describe how we have designed the transducers so that they can benefit from existing open-source infrastructures and are as reusable as possible. + 2020.nlposs-1.13 + + + Pimlico: A toolkit for corpus-processing pipelines and reproducible experiments + MarkGranroth-Wilding + 101–109 + We present Pimlico, an open source toolkit for building pipelines for processing large corpora. It is especially focused on processing linguistic corpora and provides wrappers around existing, widely used NLP tools. A particular goal is to ease distribution of reproducible and extensible experiments by making it easy to document and re-run all steps involved, including data loading, pre-processing, model training and evaluation. Once a pipeline is released, it is easy to adapt, for example, to run on a new dataset, or to re-run an experiment with different parameters. 
The toolkit takes care of many common challenges in writing and distributing corpus-processing code, such as managing data between the steps of a pipeline, installing required software and combining existing toolkits with new, task-specific code. + 2020.nlposs-1.14 + + + <fixed-case>P</fixed-case>y<fixed-case>SBD</fixed-case>: Pragmatic Sentence Boundary Disambiguation + NipunSadvilkar + MarkNeumann + 110–114 + We present a rule-based sentence boundary disambiguation Python package that works out-of-the-box for 22 languages. We aim to provide a realistic segmenter which can provide logical sentences even when the format and domain of the input text is unknown. In our work, we adapt the Golden Rules Set (a language specific set of sentence boundary exemplars) originally implemented as a ruby gem pragmatic segmenter which we ported to Python with additional improvements and functionality. PySBD passes 97.92% of the Golden Rule Set examplars for English, an improvement of 25% over the next best open source Python tool. + 2020.nlposs-1.15 + 2020.nlposs-1.15.OptionalSupplementaryMaterial.zip + + + iobes: Library for Span Level Processing + BrianLester + 115–119 + Many tasks in natural language processing, such as named entity recognition and slot-filling, involve identifying and labeling specific spans of text. In order to leverage common models, these tasks are often recast as sequence labeling tasks. Each token is given a label and these labels are prefixed with special tokens such as B- or I-. After a model assigns labels to each token, these prefixes are used to group the tokens into spans. Properly parsing these annotations is critical for producing fair and comparable metrics; however, despite its importance, there is not an easy-to-use, standardized, programmatically integratable library to help work with span labeling. To remedy this, we introduce our open-source library, iobes. 
iobes is used for parsing, converting, and processing spans represented as token level decisions. + 2020.nlposs-1.16 + + + <fixed-case>S</fixed-case>acre<fixed-case>ROUGE</fixed-case>: An Open-Source Library for Using and Developing Summarization Evaluation Metrics + DanielDeutsch + DanRoth + 120–125 + We present SacreROUGE, an open-source library for using and developing summarization evaluation metrics. SacreROUGE removes many obstacles that researchers face when using or developing metrics: (1) The library provides Python wrappers around the official implementations of existing evaluation metrics so they share a common, easy-to-use interface; (2) it provides functionality to evaluate how well any metric implemented in the library correlates to human-annotated judgments, so no additional code needs to be written for a new evaluation metric; and (3) it includes scripts for loading datasets that contain human judgments so they can easily be used for evaluation. This work describes the design of the library, including the core Metric interface, the command-line API for evaluating summarization models and metrics, and the scripts to load and reformat publicly available datasets. The development of SacreROUGE is ongoing and open to contributions from the community. + 2020.nlposs-1.17 + 2020.nlposs-1.17.OptionalSupplementaryMaterial.pdf + + + <fixed-case>T</fixed-case>ext<fixed-case>A</fixed-case>ttack: Lessons learned in designing Python frameworks for <fixed-case>NLP</fixed-case> + JohnMorris + Jin YongYoo + YanjunQi + 126–131 + TextAttack is an open-source Python toolkit for adversarial attacks, adversarial training, and data augmentation in NLP. TextAttack unites 15+ papers from the NLP adversarial attack literature into a single framework, with many components reused across attacks. This framework allows both researchers and developers to test and study the weaknesses of their NLP models. 
To build such an open-source NLP toolkit requires solving some common problems: How do we enable users to supply models from different deep learning frameworks? How can we build tools to support as many different datasets as possible? We share our insights into developing a well-written, well-documented NLP Python framework in hope that they can aid future development of similar packages. + 2020.nlposs-1.18 + + + <fixed-case>TOMODAPI</fixed-case>: A Topic Modeling <fixed-case>API</fixed-case> to Train, Use and Compare Topic Models + PasqualeLisena + IsmailHarrando + OussamaKandakji + RaphaelTroncy + 132–140 + From LDA to neural models, different topic modeling approaches have been proposed in the literature. However, their suitability and performance is not easy to compare, particularly when the algorithms are being used in the wild on heterogeneous datasets. In this paper, we introduce ToModAPI (TOpic MOdeling API), a wrapper library to easily train, evaluate and infer using different topic modeling algorithms through a unified interface. The library is extensible and can be used in Python environments or through a Web API. + 2020.nlposs-1.19 + + + User-centered & Robust <fixed-case>NLP</fixed-case> <fixed-case>OSS</fixed-case>: Lessons Learned from Developing & Maintaining <fixed-case>RSMT</fixed-case>ool + NitinMadnani + AnastassiaLoukina + 141–146 + For the last 5 years, we have developed and maintained RSMTool – an open-source tool for evaluating NLP systems that automatically score written and spoken responses. RSMTool is designed to be cross-disciplinary, borrowing heavily from NLP, machine learning, and educational measurement. Its cross-disciplinary nature has required us to learn a user-centered development approach in terms of both design and implementation. We share some of these lessons in this paper. 
+ 2020.nlposs-1.20 + + + <fixed-case>WAFFLE</fixed-case>: A Graph for <fixed-case>W</fixed-case>ord<fixed-case>N</fixed-case>et Applied to <fixed-case>F</fixed-case>ree<fixed-case>F</fixed-case>orm Linguistic Exploration + BerkEkmekci + BlakeHowald + 147–157 + The WordNet database of English (Fellbaum, 1998) is a key source of semantic information for research and development of natural language processing applications. As the sophistication of these applications increases with the use of large datasets, deep learning, and graph-based methods, so should the use of WordNet. To this end, we introduce WAFFLE: WordNet Applied to FreeForm Linguistic Exploration which makes WordNet available in an open source graph data structure. The WAFFLE graph relies on platform agnostic formats for robust interrogation and flexibility. Where existing implementations of WordNet offer dictionary-like lookup, single degree neighborhood operations, and path based similarity-scoring, the WAFFLE graph makes all nodes (semantic relation sets) and relationships queryable at scale, enabling local and global analysis of all relationships without the need for custom code. We demonstrate WAFFLE’s ease of use, visualization capabilities, and scalable efficiency with common queries, operations, and interactions. WAFFLE is available at github.com/TRSS-NLP/WAFFLE. + 2020.nlposs-1.21 + +
+
diff --git a/data/xml/2020.privatenlp.xml b/data/xml/2020.privatenlp.xml new file mode 100644 index 0000000000..77f95ab523 --- /dev/null +++ b/data/xml/2020.privatenlp.xml @@ -0,0 +1,68 @@ + + + + + Proceedings of the Second Workshop on Privacy in NLP + OluwaseyiFeyisetan + SepidehGhanavati + ShervinMalmasi + PatriciaThaine + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.privatenlp-1.0 + + + On Log-Loss Scores and (No) Privacy + AbhinavAggarwal + ZekunXu + OluwaseyiFeyisetan + NathanaelTeissier + 1–6 + A common metric for assessing the performance of binary classifiers is the Log-Loss score, which is a real number indicating the cross entropy distance between the predicted distribution over the labels and the true distribution (a point distribution defined by the ground truth labels). In this paper, we show that a malicious modeler, upon obtaining access to the Log-Loss scores on its predictions, can exploit this information to infer all the ground truth labels of arbitrary test datasets with full accuracy. We provide an efficient algorithm to perform this inference. A particularly interesting application where this attack can be exploited is to breach privacy in the setting of Membership Inference Attacks. These attacks exploit the vulnerabilities of exposing models trained on customer data to queries made by an adversary. Privacy auditing tools for measuring leakage from sensitive datasets assess the total privacy leakage based on the adversary’s predictions for datapoint membership. An instance of the proposed attack can hence, cause complete membership privacy breach, obviating any attack model training or access to side knowledge with the adversary. Moreover, our algorithm is agnostic to the model under attack and hence, enables perfect membership inference even for models that do not memorize or overfit. In particular, our observations provide insight into the extent of information leakage from statistical aggregates and how they can be exploited. + 2020.privatenlp-1.1 + + + A Differentially Private Text Perturbation Method Using Regularized Mahalanobis Metric + ZekunXu + AbhinavAggarwal + OluwaseyiFeyisetan + NathanaelTeissier + 7–17 + Balancing the privacy-utility tradeoff is a crucial requirement of many practical machine learning systems that deal with sensitive customer data. 
A popular approach for privacy- preserving text analysis is noise injection, in which text data is first mapped into a continuous embedding space, perturbed by sampling a spherical noise from an appropriate distribution, and then projected back to the discrete vocabulary space. While this allows the perturbation to admit the required metric differential privacy, often the utility of downstream tasks modeled on this perturbed data is low because the spherical noise does not account for the variability in the density around different words in the embedding space. In particular, words in a sparse region are likely unchanged even when the noise scale is large. In this paper, we propose a text perturbation mechanism based on a carefully designed regularized variant of the Mahalanobis metric to overcome this problem. For any given noise scale, this metric adds an elliptical noise to account for the covariance structure in the embedding space. This heterogeneity in the noise scale along different directions helps ensure that the words in the sparse region have sufficient likelihood of replacement without sacrificing the overall utility. We provide a text-perturbation algorithm based on this metric and formally prove its privacy guarantees. Additionally, we empirically show that our mechanism improves the privacy statistics to achieve the same level of utility as compared to the state-of-the-art Laplace mechanism. + 2020.privatenlp-1.2 + + + Identifying and Classifying Third-party Entities in Natural Language Privacy Policies + MitraBokaie Hosseini + PragyanK C + IrwinReyes + SergeEgelman + 18–27 + App developers often raise revenue by contracting with third party ad networks, which serve targeted ads to end-users. To this end, a free app may collect data about its users and share it with advertising companies for targeting purposes. 
Regulations such as General Data Protection Regulation (GDPR) require transparency with respect to the recipients (or categories of recipients) of user data. These regulations call for app developers to have privacy policies that disclose those third party recipients of user data. Privacy policies provide users transparency into what data an app will access, collect, shared, and retain. Given the size of app marketplaces, verifying compliance with such regulations is a tedious task. This paper aims to develop an automated approach to extract and categorize third party data recipients (i.e., entities) declared in privacy policies. We analyze 100 privacy policies associated with most downloaded apps in the Google Play Store. We crowdsource the collection and annotation of app privacy policies to establish the ground truth with respect to third party entities. From this, we train various models to extract third party entities automatically. Our best model achieves average F1 score of 66% when compared to crowdsourced annotations. + 2020.privatenlp-1.3 + + + Surfacing Privacy Settings Using Semantic Matching + RishabhKhandelwal + AsmitNayak + YaoYao + KassemFawaz + 28–38 + Online services utilize privacy settings to provide users with control over their data. However, these privacy settings are often hard to locate, causing the user to rely on provider-chosen default values. In this work, we train privacy-settings-centric encoders and leverage them to create an interface that allows users to search for privacy settings using free-form queries. In order to achieve this goal, we create a custom Semantic Similarity dataset, which consists of real user queries covering various privacy settings. We then use this dataset to fine-tune a state of the art encoder. Using this fine-tuned encoder, we perform semantic matching between the user queries and the privacy settings to retrieve the most relevant setting. 
Finally, we also use the encoder to generate embeddings of privacy settings from the top 100 websites and perform unsupervised clustering to learn about the online privacy settings types. We find that the most common type of privacy settings are ‘Personalization’ and ‘Notifications’, with coverage of 35.8% and 34.4%, respectively, in our dataset. + 2020.privatenlp-1.4 + + + Differentially Private Language Models Benefit from Public Pre-training + GavinKerrigan + DylanSlack + JensTuyls + 39–45 + Language modeling is a keystone task in natural language processing. When training a language model on sensitive information, differential privacy (DP) allows us to quantify the degree to which our private data is protected. However, training algorithms which enforce differential privacy often lead to degradation in model quality. We study the feasibility of learning a language model which is simultaneously high-quality and privacy preserving by tuning a public base model on a private corpus. We find that DP fine-tuning boosts the performance of language models in the private domain, making the training of such models possible. + 2020.privatenlp-1.5 + +
+
diff --git a/data/xml/2020.scai.xml b/data/xml/2020.scai.xml new file mode 100644 index 0000000000..8e8bbfe6a7 --- /dev/null +++ b/data/xml/2020.scai.xml @@ -0,0 +1,46 @@ + + + + + Proceedings of the 5th International Workshop on Search-Oriented Conversational AI (SCAI) + JeffDalton + AleksandrChuklin + JuliaKiseleva + MikhailBurtsev + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.scai-1.0 + + + Slice-Aware Neural Ranking + GustavoPenha + ClaudiaHauff + 1–6 + Understanding when and why neural ranking models fail for an IR task via error analysis is an important part of the research cycle. Here we focus on the challenges of (i) identifying categories of difficult instances (a pair of question and response candidates) for which a neural ranker is ineffective and (ii) improving neural ranking for such instances. To address both challenges we resort to slice-based learning for which the goal is to improve effectiveness of neural models for slices (subsets) of data. We address challenge (i) by proposing different slicing functions (SFs) that select slices of the dataset—based on prior work we heuristically capture different failures of neural rankers. Then, for challenge (ii) we adapt a neural ranking model to learn slice-aware representations, i.e. the adapted model learns to represent the question and responses differently based on the model’s prediction of which slices they belong to. Our experimental results (the source code and data are available at https://github.com/Guzpenha/slice_based_learning) across three different ranking tasks and four corpora show that slice-based learning improves the effectiveness by an average of 2% over a neural ranker that is not slice-aware. + 2020.scai-1.1 + + + A Wrong Answer or a Wrong Question? An Intricate Relationship between Question Reformulation and Answer Selection in Conversational Question Answering + SvitlanaVakulenko + ShayneLongpre + ZhuchengTu + RavitejaAnantha + 7–16 + The dependency between an adequate question formulation and correct answer selection is a very intriguing but still underexplored area. In this paper, we show that question rewriting (QR) of the conversational context allows to shed more light on this phenomenon and also use it to evaluate robustness of different answer selection approaches. 
We introduce a simple framework that enables an automated analysis of the conversational question answering (QA) performance using question rewrites, and present the results of this analysis on the TREC CAsT and QuAC (CANARD) datasets. Our experiments uncover sensitivity to question formulation of the popular state-of-the-art question answering approaches. Our results demonstrate that the reading comprehension model is insensitive to question formulation, while the passage ranking changes dramatically with a little variation in the input question. The benefit of QR is that it allows us to pinpoint and group such cases automatically. We show how to use this methodology to verify whether QA models are really learning the task or just finding shortcuts in the dataset, and better understand the frequent types of error they make. + 2020.scai-1.2 + + + Multi-Task Learning using Dynamic Task Weighting for Conversational Question Answering + SarawootKongyoung + CraigMacdonald + IadhOunis + 17–26 + Conversational Question Answering (ConvQA) is a Conversational Search task in a simplified setting, where an answer must be extracted from a given passage. Neural language models, such as BERT, fine-tuned on large-scale ConvQA datasets such as CoQA and QuAC have been used to address this task. Recently, Multi-Task Learning (MTL) has emerged as a particularly interesting approach for developing ConvQA models, where the objective is to enhance the performance of a primary task by sharing the learned structure across several related auxiliary tasks. However, existing ConvQA models that leverage MTL have not investigated the dynamic adjustment of the relative importance of the different tasks during learning, nor the resulting impact on the performance of the learned models. 
In this paper, we first study the effectiveness and efficiency of dynamic MTL methods including Evolving Weighting, Uncertainty Weighting, and Loss-Balanced Task Weighting, compared to static MTL methods such as the uniform weighting of tasks. Furthermore, we propose a novel hybrid dynamic method combining Abridged Linear for the main task with a Loss-Balanced Task Weighting (LBTW) for the auxiliary tasks, so as to automatically fine-tune task weighting during learning, ensuring that each of the task’s weights is adjusted by the relative importance of the different tasks. We conduct experiments using QuAC, a large-scale ConvQA dataset. Our results demonstrate the effectiveness of our proposed method, which significantly outperforms both the single-task learning and static task weighting methods with improvements ranging from +2.72% to +3.20% in F1 scores. Finally, our findings show that the performance of using MTL in developing ConvQA model is sensitive to the correct selection of the auxiliary tasks as well as to an adequate balancing of the loss rates of these tasks during training by using LBTW. + 2020.scai-1.3 + +
+
diff --git a/data/xml/2020.sdp.xml b/data/xml/2020.sdp.xml new file mode 100644 index 0000000000..99219b77a1 --- /dev/null +++ b/data/xml/2020.sdp.xml @@ -0,0 +1,458 @@ + + + + + Proceedings of the First Workshop on Scholarly Document Processing + Muthu KumarChandrasekaran + Anitade Waard + GuyFeigenblat + DayneFreitag + TirthankarGhosal + EduardHovy + PetrKnoth + DavidKonopnicki + PhilippMayr + Robert M.Patton + MichalShmueli-Scheuer + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.sdp-1.0 + + + Overview of the First Workshop on Scholarly Document Processing (<fixed-case>SDP</fixed-case>) + Muthu KumarChandrasekaran + GuyFeigenblat + DayneFreitag + TirthankarGhosal + EduardHovy + PhilippMayr + MichalShmueli-Scheuer + Anitade Waard + 1–6 + Next to keeping up with the growing literature in their own and related fields, scholars increasingly also need to rebut pseudo-science and disinformation. To address these challenges, computational work on enhancing search, summarization, and analysis of scholarly documents has flourished. However, the various strands of research on scholarly document processing remain fragmented. To reach to the broader NLP and AI/ML community, pool distributed efforts and enable shared access to published research, we held the 1st Workshop on Scholarly Document Processing at EMNLP 2020 as a virtual event. The SDP workshop consisted of a research track (including a poster session), two invited talks and three Shared Tasks (CL-SciSumm, Lay-Summ and LongSumm), geared towards easier access to scientific methods and results. Website: https://ornlcda.github.io/SDProc + + 2020.sdp-1.1 + + + The future of ar<fixed-case>X</fixed-case>iv and knowledge discovery in open science + SteinnSigurdsson + 7–9 + arXiv, the preprint server for the physical and mathematical sciences, is in its third decade of operation. As the flow of new, open access research increases inexorably, the challenges to keep up with and discover research content also become greater. I will discuss the status and future of arXiv, and possibilities and plans to make more effective use of the research database to enhance ongoing research efforts. + 2020.sdp-1.2 + + + Acknowledgement Entity Recognition in <fixed-case>CORD</fixed-case>-19 Papers + JianWu + PeiWang + XinWei + SarahRajtmajer + C. LeeGiles + ChristopherGriffin + 10–19 + Acknowledgements are ubiquitous in scholarly papers. 
Existing acknowledgement entity recognition methods assume all named entities are acknowledged. Here, we examine the nuances between acknowledged and named entities by analyzing sentence structure. We develop an acknowledgement extraction system, AckExtract based on open-source text mining software and evaluate our method using manually labeled data. AckExtract uses the PDF of a scholarly paper as input and outputs acknowledgement entities. Results show an overall performance of F_1=0.92. We built a supplementary database by linking CORD-19 papers with acknowledgement entities extracted by AckExtract including persons and organizations and find that only up to 50–60% of named entities are actually acknowledged. We further analyze chronological trends of acknowledgement entities in CORD-19 papers. All codes and labeled data are publicly available at https://github.com/lamps-lab/ackextract. + 2020.sdp-1.3 + + + A Smart System to Generate and Validate Question Answer Pairs for <fixed-case>COVID</fixed-case>-19 Literature + RohanBhambhoria + LunaFeng + DawnSepehr + JohnChen + ConnerCowling + SedefKocak + ElhamDolatabadi + 20–30 + Automatically generating question answer (QA) pairs from the rapidly growing coronavirus-related literature is of great value to the medical community. Creating high quality QA pairs would allow researchers to build models to address scientific queries for answers which are not readily available in support of the ongoing fight against the pandemic. QA pair generation is, however, a very tedious and time consuming task requiring domain expertise for annotation and evaluation. In this paper we present our contribution in addressing some of the challenges of building a QA system without gold data. We first present a method to create QA pairs from a large semi-structured dataset through the use of transformer and rule-based models. 
Next, we propose a means of engaging subject matter experts (SMEs) for annotating the QA pairs through the usage of a web application. Finally, we demonstrate some experiments showcasing the effectiveness of leveraging active learning in designing a high performing model with a substantially lower annotation effort from the domain experts. + 2020.sdp-1.4 + + + Covidex: Neural Ranking Models and Keyword Search Infrastructure for the <fixed-case>COVID</fixed-case>-19 Open Research Dataset + EdwinZhang + NikhilGupta + RaphaelTang + XiaoHan + RonakPradeep + KuangLu + YueZhang + RodrigoNogueira + KyunghyunCho + HuiFang + JimmyLin + 31–41 + We present Covidex, a search engine that exploits the latest neural ranking models to provide information access to the COVID-19 Open Research Dataset curated by the Allen Institute for AI. Our system has been online and serving users since late March 2020. The Covidex is the user application component of our three-pronged strategy to develop technologies for helping domain experts tackle the ongoing global pandemic. In addition, we provide robust and easy-to-use keyword search infrastructure that exploits mature fusion-based methods as well as standalone neural ranking models that can be incorporated into other applications. These techniques have been evaluated in the multi-round TREC-COVID challenge: Our infrastructure and baselines have been adopted by many participants, including some of the best systems. In round 3, we submitted the highest-scoring run that took advantage of previous training data and the second-highest fully automatic run. In rounds 4 and 5, we submitted the highest-scoring fully automatic runs. + 2020.sdp-1.5 + + + The impact of preprint servers in the formation of novel ideas + SwarupSatish + ZonghaiYao + AndrewDrozdov + BorisVeytsman + 42–55 + We study whether novel ideas in biomedical literature appear first in preprints or traditional journals. 
We develop a Bayesian method to estimate the time of appearance for a phrase in the literature, and apply it to a number of phrases, both automatically extracted and suggested by experts. We see that presently most phrases appear first in the traditional journals, but there is a number of phrases with the first appearance on preprint servers. A comparison of the general composition of texts from bioRxiv and traditional journals shows a growing trend of bioRxiv being predictive of traditional journals. We discuss the application of the method for related problems. + 2020.sdp-1.6 + + + Effective distributed representations for academic expert search + MarkBerger + JakubZavrel + PaulGroth + 56–71 + Expert search aims to find and rank experts based on a user’s query. In academia, retrieving experts is an efficient way to navigate through a large amount of academic knowledge. Here, we study how different distributed representations of academic papers (i.e. embeddings) impact academic expert retrieval. We use the Microsoft Academic Graph dataset and experiment with different configurations of a document-centric voting model for retrieval. In particular, we explore the impact of the use of contextualized embeddings on search performance. We also present results for paper embeddings that incorporate citation information through retrofitting. Additionally, experiments are conducted using different techniques for assigning author weights based on author order. We observe that using contextual embeddings produced by a transformer model trained for sentence similarity tasks produces the most effective paper representations for document-centric expert retrieval. However, retrofitting the paper embeddings and using elaborate author contribution weighting strategies did not improve retrieval performance. + 2020.sdp-1.7 + + + Learning <fixed-case>CNF</fixed-case> Blocking for Large-scale Author Name Disambiguation + KunhoKim + AtharSefid + C. 
LeeGiles + 72–80 + Author name disambiguation (AND) algorithms identify a unique author entity record from all similar or same publication records in scholarly or similar databases. Typically, a clustering method is used that requires calculation of similarities between each possible record pair. However, the total number of pairs grows quadratically with the size of the author database making such clustering difficult for millions of records. One remedy is a blocking function that reduces the number of pairwise similarity calculations. Here, we introduce a new way of learning blocking schemes by using a conjunctive normal form (CNF) in contrast to the disjunctive normal form (DNF). We demonstrate on PubMed author records that CNF blocking reduces more pairs while preserving high pairs completeness compared to the previous methods that use a DNF and that the computation time is significantly reduced. In addition, we also show how to ensure that the method produces disjoint blocks so that much of the AND algorithm can be efficiently paralleled. Our CNF blocking method is tested on the entire PubMed database of 80 million author mentions and efficiently removes 82.17% of all author record pairs in 10 minutes. + 2020.sdp-1.8 + + + Reconstructing Manual Information Extraction with <fixed-case>DB</fixed-case>-to-Document Backprojection: Experiments in the Life Science Domain + Mark-ChristophMüller + SuchetaGhosh + MajaRey + UlrikeWittig + WolfgangMüller + MichaelStrube + 81–90 + We introduce a novel scientific document processing task for making previously inaccessible information in printed paper documents available to automatic processing. We describe our data set of scanned documents and data records from the biological database SABIO-RK, provide a definition of the task, and report findings from preliminary experiments. Rigorous evaluation proved challenging due to lack of gold-standard data and a difficult notion of correctness. 
Qualitative inspection of results, however, showed the feasibility and usefulness of the task. + 2020.sdp-1.9 + + + <fixed-case>D</fixed-case>eep<fixed-case>P</fixed-case>aper<fixed-case>C</fixed-case>omposer: A Simple Solution for Training Data Preparation for Parsing Research Papers + MengLing + JianChen + 91–96 + We present DeepPaperComposer, a simple solution for preparing highly accurate (100%) training data without manual labeling to extract content from scholarly articles using convolutional neural networks (CNNs). We used our approach to generate data and trained CNNs to extract eight categories of both textual (titles, abstracts, headers, figure and table captions, and other texts) and non-textual content (figures and tables) from 30 years of IEEE VIS conference papers, of which a third were scanned bitmap PDFs. We curated this dataset and named it VISpaper-3K. We then showed our initial benchmark performance using VISpaper-3K over itself and CS-150 using YOLOv3 and Faster-RCNN. We open-source DeepPaperComposer of our training data generation and released the resulting annotation data VISpaper-3K to promote reproducible research. + 2020.sdp-1.10 + + + Improved Local Citation Recommendation Based on Context Enhanced with Global Information + ZoranMedić + JanSnajder + 97–103 + Local citation recommendation aims at finding articles relevant for given citation context. While most previous approaches represent context using solely text surrounding the citation, we propose enhancing context representation with global information. Specifically, we include citing article’s title and abstract into context representation. We evaluate our model on datasets with different citation context sizes and demonstrate improvements with globally-enhanced context representations when citation contexts are smaller. 
+ 2020.sdp-1.11 + + + On the effectiveness of small, discriminatively pre-trained language representation models for biomedical text mining + Ibrahim BurakOzyurt + 104–112 + Neural language representation models such as BERT have recently shown state of the art performance in downstream NLP tasks and bio-medical domain adaptation of BERT (Bio-BERT) has shown same behavior on biomedical text mining tasks. However, due to their large model size and resulting increased computational need, practical application of models such as BERT is challenging making smaller models with comparable performance desirable for real-world applications. Recently, a new language transformers based language representation model named ELECTRA is introduced, that makes efficient usage of training data in a generative-discriminative neural model setting that shows performance gains over BERT. These gains are especially impressive for smaller models. Here, we introduce two small ELECTRA based models named Bio-ELECTRA and Bio-ELECTRA++ that are eight times smaller than BERT Base and Bio-BERT and achieve comparable or better performance on biomedical question answering, yes/no question answer classification, question answer candidate ranking and relation extraction tasks. Bio-ELECTRA is pre-trained from scratch on PubMed abstracts using a consumer grade GPU with only 8GB memory. Bio-ELECTRA++ is the further pre-trained version of Bio-ELECTRA trained on a corpus of open access full papers from PubMed Central. While, for biomedical named entity recognition, large BERT Base model outperforms Bio-ELECTRA++, Bio-ELECTRA and ELECTRA-Small++, with hyperparameter tuning Bio-ELECTRA++ achieves results comparable to BERT. 
+ 2020.sdp-1.12 + + + <fixed-case>S</fixed-case>ci<fixed-case>WING</fixed-case>– A Software Toolkit for Scientific Document Processing + AbhinavRamesh Kashyap + Min-YenKan + 113–120 + We introduce SciWING, an open-source software toolkit which provides access to state-of-the-art pre-trained models for scientific document processing (SDP) tasks, such as citation string parsing, logical structure recovery and citation intent classification. Compared to other toolkits, SciWING follows a full neural pipeline and provides a Python interface for SDP. When needed, SciWING provides fine-grained control for rapid experimentation with different models by swapping and stacking different modules. Transfer learning from general and scientific documents specific pre-trained transformers (i.e., BERT, SciBERT, etc.) can be performed. SciWING incorporates ready-to-use web and terminal-based applications and demonstrations to aid adoption and development. The toolkit is available from http://sciwing.io and the demos are available at http://rebrand.ly/sciwing-demo. + 2020.sdp-1.13 + + + Multi-task Peer-Review Score Prediction + JiyiLi + AyakaSato + KazuyaShimura + FumiyoFukumoto + 121–126 + Automatic prediction on the peer-review aspect scores of academic papers can be a useful assistant tool for both reviewers and authors. To handle the small size of published datasets on the target aspect of scores, we propose a multi-task approach to leverage additional information from other aspects of scores for improving the performance of the target. Because one of the problems of building multi-task models is how to select the proper resources of auxiliary tasks and how to select the proper shared structures, we propose a multi-task shared structure encoding approach which automatically selects good shared network structures as well as good auxiliary resources. 
The experiments based on peer-review datasets show that our approach is effective and has better performance on the target scores than the single-task method and naive multi-task methods. + 2020.sdp-1.14 + + + <fixed-case>ERLKG</fixed-case>: Entity Representation Learning and Knowledge Graph based association analysis of <fixed-case>COVID</fixed-case>-19 through mining of unstructured biomedical corpora + SayantanBasu + SinchaniChakraborty + AtifHassan + SanaSiddique + AshishAnand + 127–137 + We introduce a generic, human-out-of-the-loop pipeline, ERLKG, to perform rapid association analysis of any biomedical entity with other existing entities from a corpus of the same domain. Our pipeline consists of a Knowledge Graph (KG) created from the Open Source CORD-19 dataset by fully automating the procedure of information extraction using SciBERT. The best latent entity representations are then found by benchmarking different KG embedding techniques on the task of link prediction using a Graph Convolution Network Auto Encoder (GCN-AE). We demonstrate the utility of ERLKG with respect to COVID-19 through multiple qualitative evaluations. Due to the lack of a gold standard, we propose a relatively large intrinsic evaluation dataset for COVID-19 and use it for validating the top two performing KG embedding techniques. We find TransD to be the best performing KG embedding technique with Pearson and Spearman correlation scores of 0.4348 and 0.4570 respectively. We demonstrate that a considerable number of ERLKG’s top protein, chemical and disease predictions are currently in consideration for COVID-19 related research. + 2020.sdp-1.15 + + + Towards Grounding of Formulae + TakutoAsakura + AndréGreiner-Petter + AkikoAizawa + YusukeMiyao + 138–147 + A large amount of scientific knowledge is represented within mixed forms of natural language texts and mathematical formulae. 
Therefore, a collaboration of natural language processing and formula analyses, so-called mathematical language processing, is necessary to enable computers to understand and retrieve information from the documents. However, as we will show in this project, a mathematical notation can change its meaning even within the scope of a single paragraph. This flexibility makes it difficult to extract the exact meaning of a mathematical formula. In this project, we will propose a new task direction for grounding mathematical formulae. Particularly, we are addressing the widespread misconception of various research projects in mathematical information retrieval, which presume that mathematical notations have a fixed meaning within a single document. We manually annotated a long scientific paper to illustrate the task concept. Our high inter-annotator agreement shows that the task is well understood for humans. Our results indicate that it is worthwhile to grow the techniques for the proposed task to contribute to the further progress of mathematical language processing. + 2020.sdp-1.16 + + + <fixed-case>SC</fixed-case>hu<fixed-case>BERT</fixed-case>: Scholarly Document Chunks with <fixed-case>BERT</fixed-case>-encoding boost Citation Count Prediction. + Thomasvan Dongen + GideonMaillette de Buy Wenniger + LambertSchomaker + 148–157 + Predicting the number of citations of scholarly documents is an upcoming task in scholarly document processing. Besides the intrinsic merit of this information, it also has a wider use as an imperfect proxy for quality which has the advantage of being cheaply available for large volumes of scholarly documents. Previous work has dealt with number of citations prediction with relatively small training data sets, or larger datasets but with short, incomplete input text. 
In this work we leverage the open access ACL Anthology collection in combination with the Semantic Scholar bibliometric database to create a large corpus of scholarly documents with associated citation information and we propose a new citation prediction model called SChuBERT. In our experiments we compare SChuBERT with several state-of-the-art citation prediction models and show that it outperforms previous methods by a large margin. We also show the merit of using more training data and longer input for number of citations prediction. + 2020.sdp-1.17 + + + Structure-Tags Improve Text Classification for Scholarly Document Quality Prediction + GideonMaillette de Buy Wenniger + Thomasvan Dongen + EleriAedmaa + Herbert TeunKruitbosch + Edwin A.Valentijn + LambertSchomaker + 158–167 + Training recurrent neural networks on long texts, in particular scholarly documents, causes problems for learning. While hierarchical attention networks (HANs) are effective in solving these problems, they still lose important information about the structure of the text. To tackle these problems, we propose the use of HANs combined with structure-tags which mark the role of sentences in the document. Adding tags to sentences, marking them as corresponding to title, abstract or main body text, yields improvements over the state-of-the-art for scholarly document quality prediction. The proposed system is applied to the task of accept/reject prediction on the PeerRead dataset and compared against a recent BiLSTM-based model and joint textual+visual model as well as against plain HANs. Compared to plain HANs, accuracy increases on all three domains. On the computation and language domain our new model works best overall, and increases accuracy 4.7% over the best literature result. We also obtain improvements when introducing the tags for prediction of the number of citations for 88k scientific publications that we compiled from the Allen AI S2ORC dataset. 
For our HAN-system with structure-tags we reach 28.5% explained variance, an improvement of 1.8% over our reimplementation of the BiLSTM-based model as well as 1.0% improvement over plain HANs. + 2020.sdp-1.18 + 2020.sdp-1.18.OptionalSupplementaryMaterial.zip + + + Cydex: Neural Search Infrastructure for the Scholarly Literature + ShaneDing + EdwinZhang + JimmyLin + 168–173 + Cydex is a platform that provides neural search infrastructure for domain-specific scholarly literature. The platform represents an abstraction of Covidex, our recently developed full-stack open-source search engine for the COVID-19 Open Research Dataset (CORD-19) from AI2. While Covidex takes advantage of the latest best practices for keyword search using the popular Lucene search library as well as state-of-the-art neural ranking models using T5, parts of the system were hard coded to only work with CORD-19. This paper describes our efforts to generalize Covidex into Cydex, which can be applied to scholarly literature in different domains. By decoupling corpus-specific configurations from the frontend implementation, we are able to demonstrate the generality of Cydex on two very different corpora: the ACL Anthology and a collection of hydrology abstracts. Our platform is entirely open source and available at cydex.ai. + 2020.sdp-1.19 + + + On the Use of Web Search to Improve Scientific Collections + KrutarthPatel + CorneliaCaragea + Sujatha DasGollapalli + 174–183 + Despite the advancements in search engine features, ranking methods, technologies, and the availability of programmable APIs, current-day open-access digital libraries still rely on crawl-based approaches for acquiring their underlying document collections. In this paper, we propose a novel search-driven framework for acquiring documents for such scientific portals. Within our framework, publicly-available research paper titles and author names are used as queries to a Web search engine. 
We were able to obtain ~267,000 unique research papers through our fully-automated framework using ~76,000 queries, resulting in almost 200,000 more papers than the number of queries. Moreover, through a combination of title and author name search, we were able to recover 78% of the original searched titles. + 2020.sdp-1.20 + + + Scaling Systematic Literature Reviews with Machine Learning Pipelines + SeraphinaGoldfarb-Tarrant + AlexanderRobertson + JasminaLazic + TheodoraTsouloufi + LouiseDonnison + KarenSmyth + 184–195 + Systematic reviews, which entail the extraction of data from large numbers of scientific documents, are an ideal avenue for the application of machine learning. They are vital to many fields of science and philanthropy, but are very time-consuming and require experts. Yet the three main stages of a systematic review are easily done automatically: searching for documents can be done via APIs and scrapers, selection of relevant documents can be done via binary classification, and extraction of data can be done via sequence-labelling classification. Despite the promise of automation for this field, little research exists that examines the various ways to automate each of these tasks. We construct a pipeline that automates each of these aspects, and experiment with many human-time vs. system quality trade-offs. We test the ability of classifiers to work well on small amounts of data and to generalise to data from countries not represented in the training data. We test different types of data extraction with varying difficulty in annotation, and five different neural architectures to do the extraction. We find that we can get surprising accuracy and generalisability of the whole pipeline system with only 2 weeks of human-expert annotation, which is only 15% of the time it takes to do the whole review manually and can be repeated and extended to new data with no additional effort. 
+ 2020.sdp-1.21 + + + Document-Level Definition Detection in Scholarly Documents: Existing Models, Error Analyses, and Future Directions + DongyeopKang + AndrewHead + RishamSidhu + KyleLo + DanielWeld + Marti A.Hearst + 196–206 + The task of definition detection is important for scholarly papers, because papers often make use of technical terminology that may be unfamiliar to readers. Despite prior work on definition detection, current approaches are far from being accurate enough to use in real-world applications. In this paper, we first perform in-depth error analysis of the current best performing definition detection system and discover major causes of errors. Based on this analysis, we develop a new definition detection system, HEDDEx, that utilizes syntactic features, transformer encoders, and heuristic filters, and evaluate it on a standard sentence-level benchmark. Because current benchmarks evaluate randomly sampled sentences, we propose an alternative evaluation that assesses every sentence within a document. This allows for evaluating recall in addition to precision. HEDDEx outperforms the leading system on both the sentence-level and the document-level tasks, by 12.7 F1 points and 14.4 F1 points, respectively. We note that performance on the high-recall document-level task is much lower than in the standard evaluation approach, due to the necessity of incorporation of document structure as features. We discuss remaining challenges in document-level definition detection, ideas for improvements, and potential issues for the development of reading aid applications. 
+ 2020.sdp-1.22 + 2020.sdp-1.22.OptionalSupplementaryMaterial.zip + + + A New Neural Search and Insights Platform for Navigating and Organizing <fixed-case>AI</fixed-case> Research + MarziehFadaee + OlgaGureenkova + FernandoRejon Barrera + CarstenSchnober + WouterWeerkamp + JakubZavrel + 207–213 + To provide AI researchers with modern tools for dealing with the explosive growth of the research literature in their field, we introduce a new platform, AI Research Navigator, that combines classical keyword search with neural retrieval to discover and organize relevant literature. The system provides search at multiple levels of textual granularity, from sentences to aggregations across documents, both in natural language and through navigation in a domain specific Knowledge Graph. We give an overview of the overall architecture of the system and of the components for document analysis, question answering, search, analytics, expert search, and recommendations. + 2020.sdp-1.23 + + + Overview and Insights from the Shared Tasks at Scholarly Document Processing 2020: <fixed-case>CL</fixed-case>-<fixed-case>S</fixed-case>ci<fixed-case>S</fixed-case>umm, <fixed-case>L</fixed-case>ay<fixed-case>S</fixed-case>umm and <fixed-case>L</fixed-case>ong<fixed-case>S</fixed-case>umm + Muthu KumarChandrasekaran + GuyFeigenblat + EduardHovy + AbhilashaRavichander + MichalShmueli-Scheuer + Anitade Waard + 214–224 + We present the results of three Shared Tasks held at the Scholarly Document Processing Workshop at EMNLP2020: CL-SciSumm, LaySumm and LongSumm. We report on each of the tasks, which received 18 submissions in total, with some submissions addressing two or three of the tasks. In summary, the quality and quantity of the submissions show that there is ample interest in scholarly document summarization, and the state of the art in this domain is at a midway point between being an impossible task and one that is fully resolved. 
+ 2020.sdp-1.24 + + + <fixed-case>CIST</fixed-case>@<fixed-case>CL</fixed-case>-<fixed-case>S</fixed-case>ci<fixed-case>S</fixed-case>umm 2020, <fixed-case>L</fixed-case>ong<fixed-case>S</fixed-case>umm 2020: Automatic Scientific Document Summarization + LeiLi + YangXie + WeiLiu + YinanLiu + YafeiJiang + SiyaQi + XingyuanLi + 225–234 + Our system participates in two shared tasks, CL-SciSumm 2020 and LongSumm 2020. In the CL-SciSumm shared task, based on our previous work, we apply more machine learning methods on position features and content features for facet classification in Task1B. And GCN is introduced in Task2 to perform extractive summarization. In the LongSumm shared task, we integrate both the extractive and abstractive summarization ways. Three methods were tested which are T5 Fine-tuning, DPPs Sampling, and GRU-GCN/GAT. + 2020.sdp-1.25 + + + <fixed-case>NLP</fixed-case>-<fixed-case>PINGAN</fixed-case>-<fixed-case>TECH</fixed-case> @ <fixed-case>CL</fixed-case>-<fixed-case>S</fixed-case>ci<fixed-case>S</fixed-case>umm 2020 + LingChai + GuizhenFu + YuanNi + 235–241 + We focus on systems for TASK1 (TASK 1A and TASK 1B) of CL-SciSumm Shared Task 2020 in this paper. Task 1A is regarded as a binary classification task of sentence pairs. The strategies of domain-specific embedding and special tokens based on language models are proposed. Fusion of contextualized embedding and extra information is further explored in this article. We leverage Sembert to capture the structured semantic information. The joint of BERT-based model and classifiers without neural networks is also exploited. For the Task 1B, a language model with different weights for classes is fine-tuned to accomplish a multi-label classification task. The results show that extra information can improve the identification of cited text spans. The end-to-end trained models outperform models trained with two stages, and the averaged prediction of multi-models is more accurate than an individual one. 
+ 2020.sdp-1.26 + + + <fixed-case>IIITBH</fixed-case>-<fixed-case>IITP</fixed-case>@<fixed-case>CL</fixed-case>-<fixed-case>S</fixed-case>ci<fixed-case>S</fixed-case>umm20, <fixed-case>CL</fixed-case>-<fixed-case>L</fixed-case>ay<fixed-case>S</fixed-case>umm20, <fixed-case>L</fixed-case>ong<fixed-case>S</fixed-case>umm20 + SaichethanReddy + NaveenSaini + SriparnaSaha + PushpakBhattacharyya + 242–250 + In this paper, we present the IIIT Bhagalpur and IIT Patna team’s effort to solve the three shared tasks namely, CL-SciSumm 2020, CL-LaySumm 2020, LongSumm 2020 at SDP 2020. The theme of these tasks is to generate medium-scale, lay and long summaries, respectively, for scientific articles. For the first two tasks, unsupervised systems are developed, while for the third one, we develop a supervised system. The performances of all the systems were evaluated on the associated datasets with the shared tasks in terms of the well-known ROUGE metric. + 2020.sdp-1.27 + + + <fixed-case>AUTH</fixed-case> @ <fixed-case>CLS</fixed-case>ci<fixed-case>S</fixed-case>umm 20, <fixed-case>L</fixed-case>ay<fixed-case>S</fixed-case>umm 20, <fixed-case>L</fixed-case>ong<fixed-case>S</fixed-case>umm 20 + AlexiosGidiotis + StefanosStefanidis + GrigoriosTsoumakas + 251–260 + We present the systems we submitted for the shared tasks of the Workshop on Scholarly Document Processing at EMNLP 2020. Our approaches to the tasks are focused on exploiting large Transformer models pre-trained on huge corpora and adapting them to the different shared tasks. For tasks 1A and 1B of CL-SciSumm we are using different variants of the BERT model to tackle the tasks of “cited text span” and “facet” identification. For the summarization tasks 2 of CL-SciSumm, LaySumm and LongSumm we make use of different variants of the PEGASUS model, with and without fine-tuning, adapted to the nuances of each one of those particular tasks. 
+ 2020.sdp-1.28 + + + <fixed-case>U</fixed-case>ni<fixed-case>HD</fixed-case>@<fixed-case>CL</fixed-case>-<fixed-case>S</fixed-case>ci<fixed-case>S</fixed-case>umm 2020: Citation Extraction as Search + DennisAumiller + SatyaAlmasian + PhilipHausner + MichaelGertz + 261–269 + This work presents the entry by the team from Heidelberg University in the CL-SciSumm 2020 shared task at the Scholarly Document Processing workshop at EMNLP 2020. As in its previous iterations, the task is to highlight relevant parts in a reference paper, depending on a citance text excerpt from a citing paper. We participated in tasks 1A (citation identification) and 1B (citation context classification). Contrary to most previous works, we frame Task 1A as a search relevance problem, and introduce a 2-step re-ranking approach, which consists of a preselection based on BM25 in addition to positional document features, and a top-k re-ranking with BERT. For Task 1B, we follow previous submissions in applying methods that deal well with low resources and imbalanced classes. + 2020.sdp-1.29 + + + <fixed-case>IITP</fixed-case>-<fixed-case>AI</fixed-case>-<fixed-case>NLP</fixed-case>-<fixed-case>ML</fixed-case>@ <fixed-case>CL</fixed-case>-<fixed-case>S</fixed-case>ci<fixed-case>S</fixed-case>umm 2020, <fixed-case>CL</fixed-case>-<fixed-case>L</fixed-case>ay<fixed-case>S</fixed-case>umm 2020, <fixed-case>L</fixed-case>ong<fixed-case>S</fixed-case>umm 2020 + Santosh KumarMishra + HarshavardhanKundarapu + NaveenSaini + SriparnaSaha + PushpakBhattacharyya + 270–276 + The publication rate of scientific literature increases rapidly, which poses a challenge for researchers to keep themselves updated with new state-of-the-art. Scientific document summarization solves this problem by summarizing the essential fact and findings of the document. 
In the current paper, we present the participation of IITP-AI-NLP-ML team in three shared tasks, namely, CL-SciSumm 2020, LaySumm 2020, LongSumm 2020, which aims to generate medium, lay, and long summaries of the scientific articles, respectively. To solve CL-SciSumm 2020 and LongSumm 2020 tasks, three well-known clustering techniques are used, and then various sentence scoring functions, including textual entailment, are used to extract the sentences from each cluster for a summary generation. For LaySumm 2020, an encoder-decoder based deep learning model has been utilized. Performances of our developed systems are evaluated in terms of ROUGE measures on the associated datasets with the shared task. + 2020.sdp-1.30 + + + 1<fixed-case>A</fixed-case>-Team / <fixed-case>M</fixed-case>artin-Luther-Universität Halle-Wittenberg@<fixed-case>CLS</fixed-case>ci<fixed-case>S</fixed-case>umm 20 + ArturJurk + MaikBoltze + GeorgKeller + LornaUlbrich + AnjaFischer + 277–281 + This document demonstrates our group’s approach to the CL-SciSumm shared task 2020. There are three tasks in CL-SciSumm 2020. In Task 1a, we apply a Siamese neural network to identify the spans of text in the reference paper best reflecting a citation. In Task 1b, we use a SVM to classify the facet of a citation. + 2020.sdp-1.31 + + + Team <fixed-case>MLU</fixed-case>@<fixed-case>CL</fixed-case>-<fixed-case>S</fixed-case>ci<fixed-case>S</fixed-case>umm20: Methods for Computational Linguistics Scientific Citation Linkage + RongHuang + KseniiaKrylova + 282–287 + This paper describes our approach to the CL-SciSumm 2020 shared task toward the problem of identifying reference span of the citing article in the referred article. In Task 1a, we apply and compare different methods in combination with similarity scores to identify spans of the reference text for the given citance. In Task 1b, we use a logistic regression to classify the discourse facets. 
+ 2020.sdp-1.32 + + + <fixed-case>IR</fixed-case>&<fixed-case>TM</fixed-case>-<fixed-case>NJUST</fixed-case>@<fixed-case>CLS</fixed-case>ci<fixed-case>S</fixed-case>umm 20 + HengZhang + LifanLiu + RupingWang + ShaohuHu + ShutianMa + ChengzhiZhang + 288–296 + This paper mainly introduces our methods for Task 1A and Task 1B of CL-SciSumm 2020. Task 1A is to identify reference text in reference paper. Traditional machine learning models and MLP model are used. We evaluate the performances of these models and submit the final results from the optimal model. Compared with previous work, we optimize the ratio of positive to negative examples after data sampling. In order to construct features for classification, we calculate similarities between reference text and candidate sentences based on sentence vectors. Accordingly, nine similarities are used, of which eight are chosen from what we used in CL-SciSumm 2019 and a new sentence similarity based on fastText is added. Task 1B is to classify the facets of reference text. Unlike the methods used in CL-SciSumm 2019, we construct inputs of models based on word vectors and add deep learning models for classification this year. + 2020.sdp-1.33 + + + <fixed-case>C</fixed-case>ite<fixed-case>QA</fixed-case>@<fixed-case>CLS</fixed-case>ci<fixed-case>S</fixed-case>umm 2020 + AnjanaUmapathy + KarthikRadhakrishnan + KinjalJain + RahulSingh + 297–302 + In academic publications, citations are used to build context for a concept by highlighting relevant aspects from reference papers. Automatically identifying referenced snippets can help researchers swiftly isolate principal contributions of scientific works. In this paper, we exploit the underlying structure of scientific articles to predict reference paper spans and facets corresponding to a citation. We propose two methods to detect citation spans - keyphrase overlap, BERT along with structural priors. 
We fine-tune FastText embeddings and leverage textual, positional features to predict citation facets. + 2020.sdp-1.34 + + + Dimsum @<fixed-case>L</fixed-case>ay<fixed-case>S</fixed-case>umm 20 + TiezhengYu + DanSu + WenliangDai + PascaleFung + 303–309 + Lay summarization aims to generate lay summaries of scientific papers automatically. It is an essential task that can increase the relevance of science for all of society. In this paper, we build a lay summary generation system based on BART model. We leverage sentence labels as extra supervision signals to improve the performance of lay summarization. In the CL-LaySumm 2020 shared task, our model achieves 46.00 Rouge1-F1 score. + 2020.sdp-1.35 + + + <fixed-case>ARTU</fixed-case> / <fixed-case>TU</fixed-case> <fixed-case>W</fixed-case>ien and Artificial Researcher@ <fixed-case>L</fixed-case>ong<fixed-case>S</fixed-case>umm 20 + AlaaEl-Ebshihy + Annisa MaulidaNingtyas + LindaAndersson + FlorinaPiroi + AndreasRauber + 310–317 + In this paper, we present our approach to solve the LongSumm 2020 Shared Task, at the 1st Workshop on Scholarly Document Processing. The objective of the long summaries task is to generate long summaries that cover salient information in scientific articles. The task is to generate abstractive and extractive summaries of a given scientific article. In the proposed approach, we are inspired by the concept of Argumentative Zoning (AZ) that defines the main rhetorical structure in scientific articles. We define two aspects that should be covered in scientific paper summary, namely Claim/Method and Conclusion/Result aspects. We use Solr index to expand the sentences of the paper abstract. We formulate each abstract sentence in a given publication as query to retrieve similar sentences from the text body of the document itself. We utilize a sentence selection algorithm described in previous literature to select sentences for the final summary that covers the two aforementioned aspects. 
+ 2020.sdp-1.36 + + + Monash-Summ@<fixed-case>L</fixed-case>ong<fixed-case>S</fixed-case>umm 20 <fixed-case>S</fixed-case>ci<fixed-case>S</fixed-case>umm<fixed-case>P</fixed-case>ip: An Unsupervised Scientific Paper Summarization Pipeline + JiaxinJu + MingLiu + LongxiangGao + ShiruiPan + 318–327 + The Scholarly Document Processing (SDP) workshop is to encourage more efforts on natural language understanding of scientific task. It contains three shared tasks and we participate in the LongSumm shared task. In this paper, we describe our text summarization system, SciSummPip, inspired by SummPip (Zhao et al., 2020) that is an unsupervised text summarization system for multi-document in News domain. Our SciSummPip includes a transformer-based language model SciBERT (Beltagy et al., 2019) for contextual sentence representation, content selection with PageRank (Page et al., 1999), sentence graph construction with both deep and linguistic information, sentence graph clustering and within-graph summary generation. Our work differs from previous method in that content selection and a summary length constraint is applied to adapt to the scientific domain. The experiment results on both training dataset and blind test dataset show the effectiveness of our method, and we empirically verify the robustness of modules used in SciSummPip with BERTScore (Zhang et al., 2019a). + 2020.sdp-1.37 + + + Using Pre-Trained Transformer for Better Lay Summarization + SeungwonKim + 328–335 + In this paper, we tackle lay summarization tasks, which aim to automatically produce lay summaries for scientific papers, to participate in the first CL-LaySumm 2020 in SDP workshop at EMNLP 2020. 
We present our approach of using Pre-training with Extracted Gap-sentences for Abstractive Summarization (PEGASUS; Zhang et al., 2019b) to produce the lay summary and combining those with the extractive summarization model using Bidirectional Encoder Representations from Transformers (BERT; Devlin et al., 2018) and readability metrics that measure the readability of the sentence to further improve the quality of the summary. Our model achieves a remarkable performance on ROUGE metrics, demonstrating the produced summary is more readable while it summarizes the main points of the document. + 2020.sdp-1.38 + + + Summaformers @ <fixed-case>L</fixed-case>ay<fixed-case>S</fixed-case>umm 20, <fixed-case>L</fixed-case>ong<fixed-case>S</fixed-case>umm 20 + SayarGhosh Roy + NikhilPinnaparaju + RisubhJain + ManishGupta + VasudevaVarma + 336–343 + Automatic text summarization has been widely studied as an important task in natural language processing. Traditionally, various feature engineering and machine learning based systems have been proposed for extractive as well as abstractive text summarization. Recently, deep learning based, specifically Transformer-based systems have been immensely popular. Summarization is a cognitively challenging task – extracting summary worthy sentences is laborious, and expressing semantics in brief when doing abstractive summarization is complicated. In this paper, we specifically look at the problem of summarizing scientific research papers from multiple domains. We differentiate between two types of summaries, namely, (a) LaySumm: A very short summary that captures the essence of the research paper in layman terms restricting overtly specific technical jargon and (b) LongSumm: A much longer detailed summary aimed at providing specific insights into various ideas touched upon in the paper. 
While leveraging latest Transformer-based models, our systems are simple, intuitive and based on how specific paper sections contribute to human summaries of the two types described above. Evaluations against gold standard summaries using ROUGE metrics prove the effectiveness of our approach. On blind test corpora, our system ranks first and third for the LongSumm and LaySumm tasks respectively. + 2020.sdp-1.39 + + + Divide and Conquer: From Complexity to Simplicity for Lay Summarization + RochanaChaturvedi + Saachi. + Jaspreet SinghDhani + AnuragJoshi + AnkushKhanna + NehaTomar + SwagataDuari + AlkaKhurana + VasudhaBhatnagar + 344–355 + We describe our approach for the 1st Computational Linguistics Lay Summary Shared Task CL-LaySumm20. The task is to produce non-technical summaries of scholarly documents. The summary should be within easy grasp of a layman who may not be well versed with the domain of the research article. We propose a two step divide-and-conquer approach. First, we judiciously select segments of the documents that are not overly pedantic and are likely to be of interest to the laity, and over-extract sentences from each segment using an unsupervised network based method. Next, we perform abstractive summarization on these extractions and systematically merge the abstractions. We run ablation studies to establish that each step in our pipeline is critical for improvement in the quality of lay summary. Our approach leverages state-of-the-art pre-trained deep neural network based models as zero-shot learners to achieve high scores on the task. 
+ 2020.sdp-1.40 + 2020.sdp-1.40.OptionalSupplementaryMaterial.zip + + + <fixed-case>GUIR</fixed-case> @ <fixed-case>L</fixed-case>ong<fixed-case>S</fixed-case>umm 2020: Learning to Generate Long Summaries from Scientific Documents + SajadSotudeh Gharebagh + ArmanCohan + NazliGoharian + 356–361 + This paper presents our methods for the LongSumm 2020: Shared Task on Generating Long Summaries for Scientific Documents, where the task is to generate long summaries given a set of scientific papers provided by the organizers. We explore 3 main approaches for this task: 1. An extractive approach using a BERT-based summarization model; 2. A two stage model that additionally includes an abstraction step using BART; and 3. A new multi-tasking approach on incorporating document structure into the summarizer. We found that our new multi-tasking approach outperforms the two other methods by large margins. Among 9 participants in the shared task, our best model ranks top according to Rouge-1 score (53.11%) while staying competitive in terms of Rouge-2. + 2020.sdp-1.41 + +
+
diff --git a/data/xml/2020.sigtyp.xml b/data/xml/2020.sigtyp.xml new file mode 100644 index 0000000000..153d83c30b --- /dev/null +++ b/data/xml/2020.sigtyp.xml @@ -0,0 +1,84 @@ + + + + + Proceedings of the Second Workshop on Computational Research in Linguistic Typology + EkaterinaVylomova + Edoardo M.Ponti + EitanGrossman + Arya D.McCarthy + YevgeniBerzak + HaimDubossarsky + IvanVulić + RoiReichart + AnnaKorhonen + RyanCotterell + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.sigtyp-1.0 + + + <fixed-case>SIGTYP</fixed-case> 2020 Shared Task: Prediction of Typological Features + JohannesBjerva + ElizabethSalesky + Sabrina J.Mielke + AditiChaudhary + CelanoGiuseppe + Edoardo MariaPonti + EkaterinaVylomova + RyanCotterell + IsabelleAugenstein + 1–11 + Typological knowledge bases (KBs) such as WALS (Dryer and Haspelmath, 2013) contain information about linguistic properties of the world’s languages. They have been shown to be useful for downstream applications, including cross-lingual transfer learning and linguistic probing. A major drawback hampering broader adoption of typological KBs is that they are sparsely populated, in the sense that most languages only have annotations for some features, and skewed, in that few features have wide coverage. As typological features often correlate with one another, it is possible to predict them and thus automatically populate typological KBs, which is also the focus of this shared task. Overall, the task attracted 8 submissions from 5 teams, out of which the most successful methods make use of such feature correlations. However, our error analysis reveals that even the strongest submitted systems struggle with predicting feature values for languages where few features are known. + 2020.sigtyp-1.1 + + + <fixed-case>KMI</fixed-case>-Panlingua-<fixed-case>IITKGP</fixed-case> @<fixed-case>SIGTYP</fixed-case>2020: Exploring rules and hybrid systems for automatic prediction of typological features + RiteshKumar + DeepakAlok + AkankshaBansal + BorniniLahiri + Atul Kr.Ojha + 12–16 + This paper enumerates SigTyP 2020 Shared Task on the prediction of typological features as performed by the KMI-Panlingua-IITKGP team. 
The task entailed the prediction of missing values in a particular language, provided, the name of the language family, its genus, location (in terms of latitude and longitude coordinates and name of the country where it is spoken) and a set of feature-value pair are available. As part of fulfillment of the aforementioned task, the team submitted 3 kinds of system - 2 rule-based and one hybrid system. Of these 3, one rule-based system generated the best performance on the test set. All the systems were ‘constrained’ in the sense that no additional dataset or information, other than those provided by the organisers, was used for developing the systems. + 2020.sigtyp-1.2 + + + <fixed-case>NEMO</fixed-case>: Frequentist Inference Approach to Constrained Linguistic Typology Feature Prediction in <fixed-case>SIGTYP</fixed-case> 2020 Shared Task + AlexanderGutkin + RichardSproat + 17–28 + This paper describes the NEMO submission to SIGTYP 2020 shared task (Bjerva et al., 2020) which deals with prediction of linguistic typological features for multiple languages using the data derived from World Atlas of Language Structures (WALS). We employ frequentist inference to represent correlations between typological features and use this representation to train simple multi-class estimators that predict individual features. We describe two submitted ridge regression-based configurations which ranked second and third overall in the constrained task. Our best configuration achieved the microaveraged accuracy score of 0.66 on 149 test languages. + 2020.sigtyp-1.3 + + + Predicting Typological Features in <fixed-case>WALS</fixed-case> using Language Embeddings and Conditional Probabilities: <fixed-case>ÚFAL</fixed-case> Submission to the <fixed-case>SIGTYP</fixed-case> 2020 Shared Task + MartinVastl + DanielZeman + RudolfRosa + 29–35 + We present our submission to the SIGTYP 2020 Shared Task on the prediction of typological features. 
We submit a constrained system, predicting typological features only based on the WALS database. We investigate two approaches. The simpler of the two is a system based on estimating correlation of feature values within languages by computing conditional probabilities and mutual information. The second approach is to train a neural predictor operating on precomputed language embeddings based on WALS features. Our submitted system combines the two approaches based on their self-estimated confidence scores. We reach the accuracy of 70.7% on the test data and rank first in the shared task. + 2020.sigtyp-1.4 + + + Imputing typological values via phylogenetic inference + GerhardJäger + 36–42 + This paper describes a workflow to impute missing values in a typological database, a subset of the World Atlas of Language Structures (WALS). Using a world-wide phylogeny derived from lexical data, the model assumes a phylogenetic continuous time Markov chain governing the evolution of typological values. Data imputation is performed via a Maximum Likelihood estimation on the basis of this model. As back-off model for languages whose phylogenetic position is unknown, a k-nearest neighbor classification based on geographic distance is performed. + 2020.sigtyp-1.5 + + + <fixed-case>NUIG</fixed-case>: Multitasking Self-attention based approach to <fixed-case>S</fixed-case>ig<fixed-case>T</fixed-case>yp 2020 Shared Task + ChinmayChoudhary + 43–50 + The paper describes the Multitasking Self-attention based approach to constrained sub-task within Sigtyp 2020 Shared task. Our model is simple neural network based architecture inspired by Transformers (CITATION) model. The model uses Multitasking to compute values of all WALS features for a given input language simultaneously. 
+ +Results show that our approach performs at par with the baseline approaches, even though our proposed approach requires only phylogenetic and geographical attributes namely Longitude, Latitude, Genus-index, Family-index and Country-index and do not use any of the known WALS features of the respective input language, to compute its missing WALS features. + 2020.sigtyp-1.6 + +
+
diff --git a/data/xml/2020.sltu.xml b/data/xml/2020.sltu.xml index c55f417708..5d86f14b0a 100644 --- a/data/xml/2020.sltu.xml +++ b/data/xml/2020.sltu.xml @@ -213,7 +213,7 @@
Phoneme Boundary Analysis using Multiway Geometric Properties of Waveform Trajectories - BHAGATHPARABATTINA + BhagathParabattina Pradip K.Das 144–152 Automatic phoneme segmentation is an important problem in speech processing. It helps in improving the recognition quality by providing a proper segmentation information for phonemes or phonetic units. Inappropriate segmentation may lead to recognition falloff. The problem is essential not only for recognition but also for annotation purpose also. In general, segmentation algorithms rely on training large data sets where data is observed to find the patterns among them. But this process is not straight forward for languages that are under resourced because of less availability of datasets. In this paper, we propose a method that uses geometrical properties of waveform trajectory where intra signal variations are studied and used for segmentation. The method does not rely on large datasets for training. The geometric properties are extracted as linear structural changes in a raw waveform. The methods and findings of the study are presented. @@ -242,7 +242,7 @@ Acoustic-Phonetic Approach for <fixed-case>ASR</fixed-case> of Less Resourced Languages Using Monolingual and Cross-Lingual Information - shwetabansal + ShwetaBansal 167–171 The exploration of speech processing for endangered languages has substantially increased in the past epoch of time. In this paper, we present the acoustic-phonetic approach for automatic speech recognition (ASR) using monolingual and cross-lingual information with application to under-resourced Indian languages, Punjabi, Nepali and Hindi. The challenging task while developing the ASR was the collection of the acoustic corpus for under-resourced languages. We have described here, in brief, the strategies used for designing the corpus and also highlighted the issues pertaining while collecting data for these languages. 
The bootstrap GMM-UBM based approach is used, which integrates pronunciation lexicon, language model and acoustic-phonetic model. Mel Frequency Cepstral Coefficients were used for extracting the acoustic signal features for training in monolingual and cross-lingual settings. The experimental result shows the overall performance of ASR for cross-lingual and monolingual. The phone substitution plays a key role in the cross-lingual as well as monolingual recognition. The result obtained by cross-lingual recognition compared with other baseline system and it has been found that the performance of the recognition system is based on phonemic units . The recognition rate of cross-lingual generally declines as compared with the monolingual. 2020.sltu-1.23 @@ -358,7 +358,7 @@ Automatic Extraction of Verb Paradigms in Regional Languages: the case of the Linguistic Crescent varieties - elenaknyazeva + ElenaKnyazeva GillesAdda PhilippeBoula de Mareüil MaximilienGuérin diff --git a/data/xml/2020.splu.xml b/data/xml/2020.splu.xml new file mode 100644 index 0000000000..d842733443 --- /dev/null +++ b/data/xml/2020.splu.xml @@ -0,0 +1,90 @@ + + + + + Proceedings of the Third International Workshop on Spatial Language Understanding + ParisaKordjamshidi + ArchnaBhatia + MaliheAlikhani + JasonBaldridge + MohitBansal + Marie-FrancineMoens + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.splu-1.0 + + + An Element-wise Visual-enhanced <fixed-case>B</fixed-case>i<fixed-case>LSTM</fixed-case>-<fixed-case>CRF</fixed-case> Model for Location Name Recognition + TakuyaKomada + TakashiInui + 1–9 + In recent years, previous studies have used visual information in named entity recognition (NER) for social media posts with attached images. However, these methods can only be applied to documents with attached images. In this paper, we propose a NER method that can use element-wise visual information for any documents by using image data corresponding to each word in the document. The proposed method obtains element-wise image data using an image retrieval engine, to be used as extra features in the neural NER model. Experimental results on the standard Japanese NER dataset show that the proposed method achieves a higher F1 value (89.67%) than a baseline method, demonstrating the effectiveness of using element-wise visual information. + 2020.splu-1.1 + + + <fixed-case>BERT</fixed-case>-based Spatial Information Extraction + Hyeong JinShin + Jeong YeonPark + Dae BumYuk + Jae SungLee + 10–17 + Spatial information extraction is essential to understand geographical information in text. This task is largely divided to two subtasks: spatial element extraction and spatial relation extraction. In this paper, we utilize BERT (Devlin et al., 2018), which is very effective for many natural language processing applications. We propose a BERT-based spatial information extraction model, which uses BERT for spatial element extraction and R-BERT (Wu and He, 2019) for spatial relation extraction. The model was evaluated with the SemEval 2015 dataset. The result showed a 15.4% point increase in spatial element extraction and an 8.2% point increase in spatial relation extraction in comparison to the baseline model (Nichols and Botros, 2015). 
+ 2020.splu-1.2 + + + A Cognitively Motivated Approach to Spatial Information Extraction + ChaoXu + Emmanuelle-AnnaDietz Saldanha + DagmarGromann + BeihaiZhou + 18–28 + Automatic extraction of spatial information from natural language can boost human-centered applications that rely on spatial dynamics. The field of cognitive linguistics has provided theories and cognitive models to address this task. Yet, existing solutions tend to focus on specific word classes, subject areas, or machine learning techniques that cannot provide cognitively plausible explanations for their decisions. We propose an automated spatial semantic analysis (ASSA) framework building on grammar and cognitive linguistic theories to identify spatial entities and relations, bringing together methods of spatial information extraction and cognitive frameworks on spatial language. The proposed rule-based and explainable approach contributes constructions and preposition schemas and outperforms previous solutions on the CLEF-2017 standard dataset. + 2020.splu-1.3 + + + They are not all alike: answering different spatial questions requires different grounding strategies + AlbertoTestoni + ClaudioGreco + TobiasBianchi + MauricioMazuecos + AgataMarcante + LucianaBenotti + RaffaellaBernardi + 29–38 + In this paper, we study the grounding skills required to answer spatial questions asked by humans while playing the GuessWhat?! game. We propose a classification for spatial questions dividing them into absolute, relational, and group questions. We build a new answerer model based on the LXMERT multimodal transformer and we compare a baseline with and without visual features of the scene. We are interested in studying how the attention mechanisms of LXMERT are used to answer spatial questions since they require putting attention on more than one region simultaneously and spotting the relation holding among them. 
We show that our proposed model outperforms the baseline by a large extent (9.70% on spatial questions and 6.27% overall). By analyzing LXMERT errors and its attention mechanisms, we find that our classification helps to gain a better understanding of the skills required to answer different spatial questions. + 2020.splu-1.4 + + + Categorisation, Typicality & Object-Specific Features in Spatial Referring Expressions + AdamRichard-Bollans + AnthonyCohn + LucíaGómez Álvarez + 39–49 + Various accounts of cognition and semantic representations have highlighted that, for some concepts, different factors may influence category and typicality judgements. In particular, some features may be more salient in categorisation tasks while other features are more salient when assessing typicality. In this paper we explore the extent to which this is the case for English spatial prepositions and discuss the implications for pragmatic strategies and semantic models. We hypothesise that object-specific features — related to object properties and affordances — are more salient in categorisation, while geometric and physical relationships between objects are more salient in typicality judgements. In order to test this hypothesis we conducted a study using virtual environments to collect both category and typicality judgements in 3D scenes. Based on the collected data we cannot verify the hypothesis and conclude that object-specific features appear to be salient in both category and typicality judgements, further evidencing the need to include these types of features in semantic models. + 2020.splu-1.5 + + + A Hybrid Deep Learning Approach for Spatial Trigger Extraction from Radiology Reports + SurabhiDatta + KirkRoberts + 50–55 + Radiology reports contain important clinical information about patients which are often tied through spatial expressions. 
Spatial expressions (or triggers) are mainly used to describe the positioning of radiographic findings or medical devices with respect to some anatomical structures. As the expressions result from the mental visualization of the radiologist’s interpretations, they are varied and complex. The focus of this work is to automatically identify the spatial expression terms from three different radiology sub-domains. We propose a hybrid deep learning-based NLP method that includes – 1) generating a set of candidate spatial triggers by exact match with the known trigger terms from the training data, 2) applying domain-specific constraints to filter the candidate triggers, and 3) utilizing a BERT-based classifier to predict whether a candidate trigger is a true spatial trigger or not. The results are promising, with an improvement of 24 points in the average F1 measure compared to a standard BERT-based sequence labeler. + 2020.splu-1.6 + + + Retouchdown: Releasing Touchdown on <fixed-case>S</fixed-case>treet<fixed-case>L</fixed-case>earn as a Public Resource for Language Grounding Tasks in Street View + HarshMehta + YoavArtzi + JasonBaldridge + EugeneIe + PiotrMirowski + 56–62 + The Touchdown dataset (Chen et al., 2019) provides instructions by human annotators for navigation through New York City streets and for resolving spatial descriptions at a given location. To enable the wider research community to work effectively with the Touchdown tasks, we are publicly releasing the 29k raw Street View panoramas needed for Touchdown. We follow the process used for the StreetLearn data release (Mirowski et al., 2019) to check panoramas for personally identifiable information and blur them as necessary. These have been added to the StreetLearn dataset and can be obtained via the same process as used previously for StreetLearn. We also provide a reference implementation for both Touchdown tasks: vision and language navigation (VLN) and spatial description resolution (SDR). 
We compare our model results to those given in (Chen et al., 2019) and show that the panoramas we have added to StreetLearn support both Touchdown tasks and can be used effectively for further research and comparison. + 2020.splu-1.7 + +
+
diff --git a/data/xml/2020.spnlp.xml b/data/xml/2020.spnlp.xml new file mode 100644 index 0000000000..27bebc45c3 --- /dev/null +++ b/data/xml/2020.spnlp.xml @@ -0,0 +1,138 @@ + + + + + Proceedings of the Fourth Workshop on Structured Prediction for NLP + PriyankaAgrawal + ZornitsaKozareva + JuliaKreutzer + GerasimosLampouras + AndréMartins + SujithRavi + AndreasVlachos + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.spnlp-1.0 + + + Syntax-driven Iterative Expansion Language Models for Controllable Text Generation + NoeCasas + José A. R.Fonollosa + Marta R.Costa-jussà + 1–10 + The dominant language modeling paradigm handles text as a sequence of discrete tokens. While that approach can capture the latent structure of the text, it is inherently constrained to sequential dynamics for text generation. We propose a new paradigm for introducing a syntactic inductive bias into neural text generation, where the dependency parse tree is used to drive the Transformer model to generate sentences iteratively. Our experiments show that this paradigm is effective at text generation, with quality between LSTMs and Transformers, and comparable diversity, requiring less than half their decoding steps, and its generation process allows direct control over the syntactic constructions of the generated text, enabling the induction of stylistic variations. + 2020.spnlp-1.1 + 2020.spnlp-1.1.OptionalSupplementaryMaterial.pdf + + + <fixed-case>C</fixed-case>opy<fixed-case>N</fixed-case>ext: Explicit Span Copying and Alignment in Sequence to Sequence Models + AbhinavSingh + PatrickXia + GuanghuiQin + MahsaYarmohammadi + BenjaminVan Durme + 11–16 + Copy mechanisms are employed in sequence to sequence (seq2seq) models to generate reproductions of words from the input to the output. These frameworks, operating at the lexical type level, fail to provide an explicit alignment that records where each token was copied from. Further, they require contiguous token sequences from the input (spans) to be copied individually. We present a model with an explicit token-level copy operation and extend it to copying entire spans. Our model provides hard alignments between spans in the input and output, allowing for nontraditional applications of seq2seq, like information extraction. 
We demonstrate the approach on Nested Named Entity Recognition, achieving near state-of-the-art accuracy with an order of magnitude increase in decoding speed. + 2020.spnlp-1.2 + 2020.spnlp-1.2.OptionalSupplementaryMaterial.pdf + + + Generating Synthetic Data for Task-Oriented Semantic Parsing with Hierarchical Representations + KeTran + MingTan + 17–21 + Modern conversational AI systems support natural language understanding for a wide variety of capabilities. While a majority of these tasks can be accomplished using a simple and flat representation of intents and slots, more sophisticated capabilities require complex hierarchical representations supported by semantic parsing. State-of-the-art semantic parsers are trained using supervised learning with data labeled according to a hierarchical schema which might be costly to obtain or not readily available for a new domain. In this work, we explore the possibility of generating synthetic data for neural semantic parsing using a pretrained denoising sequence-to-sequence model (i.e., BART). Specifically, we first extract masked templates from the existing labeled utterances, and then fine-tune BART to generate synthetic utterances conditioning on the extracted templates. Finally, we use an auxiliary parser (AP) to filter the generated utterances. The AP guarantees the quality of the generated data. We show the potential of our approach when evaluating on the Facebook TOP dataset for navigation domain. + 2020.spnlp-1.3 + + + Structured Prediction for Joint Class Cardinality and Entity Property Inference in Model-Complete Text Comprehension + Hendrikter Horst + PhilippCimiano + 22–32 + Model-complete text comprehension aims at interpreting a natural language text with respect to a semantic domain model describing the classes and their properties relevant for the domain in question. 
Solving this task can be approached as a structured prediction problem, consisting in inferring the most probable instance of the semantic model given the text. In this work, we focus on the challenging sub-problem of cardinality prediction that consists in predicting the number of distinct individuals of each class in the semantic model. We show that cardinality prediction can successfully be approached by modeling the overall task as a joint inference problem, predicting the number of individuals of certain classes while at the same time extracting their properties. We approach this task with probabilistic graphical models computing the maximum-a-posteriori instance of the semantic model. Our main contribution lies on the empirical investigation and analysis of different approximative inference strategies based on Gibbs sampling. We present and evaluate our models on the task of extracting key parameters from scientific full text articles describing pre-clinical studies in the domain of spinal cord injury. + 2020.spnlp-1.4 + + + Energy-based Neural Modelling for Large-Scale Multiple Domain Dialogue State Tracking + Anh DuongTrinh + Robert J.Ross + John D.Kelleher + 33–42 + Scaling up dialogue state tracking to multiple domains is challenging due to the growth in the number of variables being tracked. Furthermore, dialog state tracking models do not yet explicitly make use of relationships between dialogue variables, such as slots across domains. We propose using energy-based structure prediction methods for large-scale dialogue state tracking task in two multiple domain dialogue datasets. Our results indicate that: (i) modelling variable dependencies yields better results; and (ii) the structured prediction output aligns with the dialogue slot-value constraint principles. This leads to promising directions to improve state-of-the-art models by incorporating variable dependencies into their prediction process. 
+ 2020.spnlp-1.5 + + + End-to-End Extraction of Structured Information from Business Documents with Pointer-Generator Networks + ClémentSage + AlexAussem + VéroniqueEglin + HaythamElghazel + JérémyEspinas + 43–52 + The predominant approaches for extracting key information from documents resort to classifiers predicting the information type of each word. However, the word level ground truth used for learning is expensive to obtain since it is not naturally produced by the extraction task. In this paper, we discuss a new method for training extraction models directly from the textual value of information. The extracted information of a document is represented as a sequence of tokens in the XML language. We learn to output this representation with a pointer-generator network that alternately copies the document words carrying information and generates the XML tags delimiting the types of information. The ability of our end-to-end method to retrieve structured information is assessed on a large set of business documents. We show that it performs competitively with a standard word classifier without requiring costly word level supervision. + 2020.spnlp-1.6 + + + Layer-wise Guided Training for <fixed-case>BERT</fixed-case>: Learning Incrementally Refined Document Representations + NikolaosManginas + IliasChalkidis + ProdromosMalakasiotis + 53–61 + Although BERT is widely used by the NLP community, little is known about its inner workings. Several attempts have been made to shed light on certain aspects of BERT, often with contradicting conclusions. A much raised concern focuses on BERT’s over-parameterization and under-utilization issues. To this end, we propose a novel approach to fine-tune BERT in a structured manner. Specifically, we focus on Large Scale Multilabel Text Classification (LMTC) where documents are assigned with one or more labels from a large predefined set of hierarchically organized labels.
Our approach guides specific BERT layers to predict labels from specific hierarchy levels. Experimenting with two LMTC datasets we show that this structured fine-tuning approach not only yields better classification results but also leads to better parameter utilization. + 2020.spnlp-1.7 + 2020.spnlp-1.7.OptionalSupplementaryMaterial.zip + + + Improving Joint Training of Inference Networks and Structured Prediction Energy Networks + LifuTu + Richard YuanzhePang + KevinGimpel + 62–73 + Deep energy-based models are powerful, but pose challenges for learning and inference (Belanger and McCallum, 2016). Tu and Gimpel (2018) developed an efficient framework for energy-based models by training “inference networks” to approximate structured inference instead of using gradient descent. However, their alternating optimization approach suffers from instabilities during training, requiring additional loss terms and careful hyperparameter tuning. In this paper, we contribute several strategies to stabilize and improve this joint training of energy functions and inference networks for structured prediction. We design a compound objective to jointly train both cost-augmented and test-time inference networks along with the energy function. We propose joint parameterizations for the inference networks that encourage them to capture complementary functionality during learning. We empirically validate our strategies on two sequence labeling tasks, showing easier paths to strong performance than prior work, as well as further improvements with global energy terms. + 2020.spnlp-1.8 + + + Reading the Manual: Event Extraction as Definition Comprehension + YunmoChen + TongfeiChen + SethEbner + Aaron StevenWhite + BenjaminVan Durme + 74–83 + We ask whether text understanding has progressed to where we may extract event information through incremental refinement of bleached statements derived from annotation manuals. 
Such a capability would allow for the trivial construction and extension of an extraction framework by intended end-users through declarations such as, “Some person was born in some location at some time.” We introduce an example of a model that employs such statements, with experiments illustrating we can extract events under closed ontologies and generalize to unseen event types simply by reading new definitions. + 2020.spnlp-1.9 + + + On the Discrepancy between Density Estimation and Sequence Generation + JasonLee + DustinTran + OrhanFirat + KyunghyunCho + 84–94 + Many sequence-to-sequence generation tasks, including machine translation and text-to-speech, can be posed as estimating the density of the output y given the input x: p(y|x). Given this interpretation, it is natural to evaluate sequence-to-sequence models using conditional log-likelihood on a test set. However, the goal of sequence-to-sequence generation (or structured prediction) is to find the best output yˆ given an input x, and each task has its own downstream metric R that scores a model output by comparing against a set of references y*: R(yˆ, y* | x). While we hope that a model that excels in density estimation also performs well on the downstream metric, the exact correlation has not been studied for sequence generation tasks. In this paper, by comparing several density estimators on five machine translation tasks, we find that the correlation between rankings of models based on log-likelihood and BLEU varies significantly depending on the range of the model families being compared. First, log-likelihood is highly correlated with BLEU when we consider models within the same family (e.g. autoregressive models, or latent variable models with the same parameterization of the prior). 
However, we observe no correlation between rankings of models across different families: (1) among non-autoregressive latent variable models, a flexible prior distribution is better at density estimation but gives worse generation quality than a simple prior, and (2) autoregressive models offer the best translation performance overall, while latent variable models with a normalizing flow prior give the highest held-out log-likelihood across all datasets. Therefore, we recommend using a simple prior for the latent variable non-autoregressive model when fast generation speed is desired. + 2020.spnlp-1.10 + 2020.spnlp-1.10.OptionalSupplementaryMaterial.tex + + + Log-Linear Reformulation of the Noisy Channel Model for Document-Level Neural Machine Translation + SébastienJean + KyunghyunCho + 95–101 + We seek to maximally use various data sources, such as parallel and monolingual data, to build an effective and efficient document-level translation system. In particular, we start by considering a noisy channel approach (CITATION) that combines a target-to-source translation model and a language model. By applying Bayes’ rule strategically, we reformulate this approach as a log-linear combination of translation, sentence-level and document-level language model probabilities. In addition to using static coefficients for each term, this formulation alternatively allows for the learning of dynamic per-token weights to more finely control the impact of the language models. Using both static or dynamic coefficients leads to improvements over a context-agnostic baseline and a context-aware concatenation model. 
+ 2020.spnlp-1.11 + + + Deeply Embedded Knowledge Representation & Reasoning For Natural Language Question Answering: A Practitioner’s Perspective + ArindamMitra + SanjayNarayana + ChittaBaral + 102–111 + Successful application of Knowledge Representation and Reasoning (KR) in Natural Language Understanding (NLU) is largely limited by the availability of a robust and general purpose natural language parser. Even though several projects have been launched in the pursuit of developing a universal meaning representation language, the existence of an accurate universal parser is far from reality. This has severely limited the application of knowledge representation and reasoning (KR) in the field of NLP and also prevented a proper evaluation of KR based NLU systems. Our goal is to build KR based systems for Natural Language Understanding without relying on a parser. Towards this we propose a method named Deeply Embedded Knowledge Representation & Reasoning (DeepEKR) where we replace the parser by a neural network, soften the symbolic representation so that a deterministic mapping exists between the parser neural network and the interpretable logical form, and finally replace the symbolic solver by an equivalent neural network, so the model can be trained end-to-end. We evaluate our method with respect to the task of Qualitative Word Problem Solving on the two available datasets (QuaRTz and QuaRel). Our system achieves same accuracy as that of the state-of-the-art accuracy on QuaRTz, outperforms the state-of-the-art on QuaRel and severely outperforms a traditional KR based system. The results show that the bias introduced by a KR solution does not prevent it from doing a better job at the end task. Moreover, our method is interpretable due to the bias introduced by the KR approach. + 2020.spnlp-1.12 + +
+
diff --git a/data/xml/2020.sustainlp.xml b/data/xml/2020.sustainlp.xml new file mode 100644 index 0000000000..ba665793a7 --- /dev/null +++ b/data/xml/2020.sustainlp.xml @@ -0,0 +1,246 @@ + + + + + Proceedings of SustaiNLP: Workshop on Simple and Efficient Natural Language Processing + Nafise SadatMoosavi + AngelaFan + VeredSchwartz + GoranGlavaš + ShafiqJoty + AlexWang + ThomasWolf + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.sustainlp-1.0 + + + Knowing Right from Wrong: Should We Use More Complex Models for Automatic Short-Answer Scoring in <fixed-case>B</fixed-case>ahasa <fixed-case>I</fixed-case>ndonesia? + Ali AkbarSeptiandri + Yosef ArdhitoWinatmoko + Ilham FirdausiPutra + 1–7 + We compare three solutions to UKARA 1.0 challenge on automated short-answer scoring: single classical, ensemble classical, and deep learning. The task is to classify given answers to two questions, whether they are right or wrong. While recent development shows increasing model complexity to push the benchmark performances, they tend to be resource-demanding with mundane improvement. For the UKARA task, we found that bag-of-words and classical machine learning approaches can compete with ensemble models and Bi-LSTM model with pre-trained word2vec embedding from 200 million words. In this case, the single classical machine learning achieved less than 2% difference in F1 compared to the deep learning approach with 1/18 time for model training. + 2020.sustainlp-1.1 + + + Rank and run-time aware compression of <fixed-case>NLP</fixed-case> Applications + UrmishThakker + JesseBeu + DibakarGope + GaneshDasika + MatthewMattina + 8–18 + Sequence model based NLP applications can be large. Yet, many applications that benefit from them run on small devices with very limited compute and storage capabilities, while still having run-time constraints. As a result, there is a need for a compression technique that can achieve significant compression without negatively impacting inference run-time and task accuracy. This paper proposes a new compression technique called Hybrid Matrix Factorization (HMF) that achieves this dual objective. HMF improves low-rank matrix factorization (LMF) techniques by doubling the rank of the matrix using an intelligent hybrid-structure leading to better accuracy than LMF.
Further, by preserving dense matrices, it leads to faster inference run-time than pruning or structure matrix based compression technique. We evaluate the impact of this technique on 5 NLP benchmarks across multiple tasks (Translation, Intent Detection, Language Modeling) and show that for similar accuracy values and compression factors, HMF can achieve more than 2.32x faster inference run-time than pruning and 16.77% better accuracy than LMF. + 2020.sustainlp-1.2 + + + Learning Informative Representations of Biomedical Relations with Latent Variable Models + HarshilShah + JulienFauqueur + 19–28 + Extracting biomedical relations from large corpora of scientific documents is a challenging natural language processing task. Existing approaches usually focus on identifying a relation either in a single sentence (mention-level) or across an entire corpus (pair-level). In both cases, recent methods have achieved strong results by learning a point estimate to represent the relation; this is then used as the input to a relation classifier. However, the relation expressed in text between a pair of biomedical entities is often more complex than can be captured by a point estimate. To address this issue, we propose a latent variable model with an arbitrarily flexible distribution to represent the relation between an entity pair. Additionally, our model provides a unified architecture for both mention-level and pair-level relation extraction. We demonstrate that our model achieves results competitive with strong baselines for both tasks while having fewer parameters and being significantly faster to train. We make our code publicly available. + 2020.sustainlp-1.3 + + + End to End Binarized Neural Networks for Text Classification + KumarShridhar + HarshilJain + AkshatAgarwal + DenisKleyko + 29–34 + Deep neural networks have demonstrated their superior performance in almost every Natural Language Processing task, however, their increasing complexity raises concerns.
A particular concern is that these networks pose high requirements for computing hardware and training budgets. The state-of-the-art transformer models are a vivid example. Simplifying the computations performed by a network is one way of addressing the issue of the increasing complexity. In this paper, we propose an end to end binarized neural network for the task of intent and text classification. In order to fully utilize the potential of end to end binarization, both the input representations (vector embeddings of tokens statistics) and the classifier are binarized. We demonstrate the efficiency of such a network on the intent classification of short texts over three datasets and text classification with a larger dataset. On the considered datasets, the proposed network achieves comparable to the state-of-the-art results while utilizing 20-40% lesser memory and training time compared to the benchmarks. + 2020.sustainlp-1.4 + 2020.sustainlp-1.4.OptionalSupplementaryMaterial.zip + + + Exploring the Boundaries of Low-Resource <fixed-case>BERT</fixed-case> Distillation + MosheWasserblat + OrenPereg + PeterIzsak + 35–40 + In recent years, large pre-trained models have demonstrated state-of-the-art performance in many of NLP tasks. However, the deployment of these models on devices with limited resources is challenging due to the models’ large computational consumption and memory requirements. Moreover, the need for a considerable amount of labeled training data also hinders real-world deployment scenarios. Model distillation has shown promising results for reducing model size, computational load and data efficiency. In this paper we test the boundaries of BERT model distillation in terms of model compression, inference efficiency and data scarcity. We show that classification tasks that require the capturing of general lexical semantics can be successfully distilled by very simple and efficient models and require relatively small amount of labeled training data. 
We also show that the distillation of large pre-trained models is more effective in real-life scenarios where limited amounts of labeled training are available. + 2020.sustainlp-1.5 + + + Efficient Estimation of Influence of a Training Instance + SosukeKobayashi + ShoYokoi + JunSuzuki + KentaroInui + 41–47 + Understanding the influence of a training instance on a neural network model leads to improving interpretability. However, it is difficult and inefficient to evaluate the influence, which shows how a model’s prediction would be changed if a training instance were not used. In this paper, we propose an efficient method for estimating the influence. Our method is inspired by dropout, which zero-masks a sub-network and prevents the sub-network from learning each training instance. By switching between dropout masks, we can use sub-networks that learned or did not learn each training instance and estimate its influence. Through experiments with BERT and VGGNet on classification datasets, we demonstrate that the proposed method can capture training influences, enhance the interpretability of error predictions, and cleanse the training dataset for improving generalization. + 2020.sustainlp-1.6 + 2020.sustainlp-1.6.OptionalSupplementaryMaterial.pdf + + + Efficient Inference For Neural Machine Translation + Yi-TeHsu + SarthakGarg + Yi-HsiuLiao + IlyaChatsviorkin + 48–53 + Large Transformer models have achieved state-of-the-art results in neural machine translation and have become standard in the field. In this work, we look for the optimal combination of known techniques to optimize inference speed without sacrificing translation quality. 
We conduct an empirical study that stacks various approaches and demonstrates that combination of replacing decoder self-attention with simplified recurrent units, adopting a deep encoder and a shallow decoder architecture and multi-head attention pruning can achieve up to 109% and 84% speedup on CPU and GPU respectively and reduce the number of parameters by 25% while maintaining the same translation quality in terms of BLEU. + 2020.sustainlp-1.7 + + + Sparse Optimization for Unsupervised Extractive Summarization of Long Documents with the Frank-Wolfe Algorithm + AliciaTsai + LaurentEl Ghaoui + 54–62 + We address the problem of unsupervised extractive document summarization, especially for long documents. We model the unsupervised problem as a sparse auto-regression one and approximate the resulting combinatorial problem via a convex, norm-constrained problem. We solve it using a dedicated Frank-Wolfe algorithm. To generate a summary with k sentences, the algorithm only needs to execute approximately k iterations, making it very efficient for a long document. We evaluate our approach against two other unsupervised methods using both lexical (standard) ROUGE scores, as well as semantic (embedding-based) ones. Our method achieves better results with both datasets and works especially well when combined with embeddings for highly paraphrased summaries. + 2020.sustainlp-1.8 + 2020.sustainlp-1.8.OptionalSupplementaryMaterial.pdf + + + Don’t Read Too Much Into It: Adaptive Computation for Open-Domain Question Answering + YuxiangWu + PasqualeMinervini + PontusStenetorp + SebastianRiedel + 63–72 + Most approaches to Open-Domain Question Answering consist of a light-weight retriever that selects a set of candidate passages, and a computationally expensive reader that examines the passages to identify the correct answer. Previous works have shown that as the number of retrieved passages increases, so does the performance of the reader. 
However, they assume all retrieved passages are of equal importance and allocate the same amount of computation to them, leading to a substantial increase in computational cost. To reduce this cost, we propose the use of adaptive computation to control the computational budget allocated for the passages to be read. We first introduce a technique operating on individual passages in isolation which relies on anytime prediction and a per-layer estimation of an early exit probability. We then introduce SKYLINEBUILDER, an approach for dynamically deciding on which passage to allocate computation at each step, based on a resource allocation policy trained via reinforcement learning. Our results on SQuAD-Open show that adaptive computation with global prioritisation improves over several strong static and adaptive methods, leading to a 4.3x reduction in computation while retaining 95% performance of the full model. + 2020.sustainlp-1.9 + 2020.sustainlp-1.9.OptionalSupplementaryMaterial.pdf + + + A Two-stage Model for Slot Filling in Low-resource Settings: Domain-agnostic Non-slot Reduction and Pretrained Contextual Embeddings + CennetOguz + Ngoc ThangVu + 73–82 + Learning-based slot filling - a key component of spoken language understanding systems - typically requires a large amount of in-domain hand-labeled data for training. In this paper, we propose a novel two-stage model architecture that can be trained with only a few in-domain hand-labeled examples. The first step is designed to remove non-slot tokens (i.e., O labeled tokens), as they introduce noise in the input of slot filling models. This step is domain-agnostic and therefore, can be trained by exploiting out-of-domain data. The second step identifies slot names only for slot tokens by using state-of-the-art pretrained contextual embeddings such as ELMO and BERT. We show that our approach outperforms other state-of-art systems on the SNIPS benchmark dataset. 
+ 2020.sustainlp-1.10 + + + Early Exiting <fixed-case>BERT</fixed-case> for Efficient Document Ranking + JiXin + RodrigoNogueira + YaoliangYu + JimmyLin + 83–88 + Pre-trained language models such as BERT have shown their effectiveness in various tasks. Despite their power, they are known to be computationally intensive, which hinders real-world applications. In this paper, we introduce early exiting BERT for document ranking. With a slight modification, BERT becomes a model with multiple output paths, and each inference sample can exit early from these paths. In this way, computation can be effectively allocated among samples, and overall system latency is significantly reduced while the original quality is maintained. Our experiments on two document ranking datasets demonstrate up to 2.5x inference speedup with minimal quality degradation. The source code of our implementation can be found at https://github.com/castorini/earlyexiting-monobert. + 2020.sustainlp-1.11 + + + Keyphrase Generation with <fixed-case>GAN</fixed-case>s in Low-Resources Scenarios + GiuseppeLancioni + SaidaS.Mohamed + BeatricePortelli + GiuseppeSerra + CarloTasso + 89–96 + Keyphrase Generation is the task of predicting Keyphrases (KPs), short phrases that summarize the semantic meaning of a given document. Several past studies provided diverse approaches to generate Keyphrases for an input document. However, all of these approaches still need to be trained on very large datasets. In this paper, we introduce BeGanKP, a new conditional GAN model to address the problem of Keyphrase Generation in a low-resource scenario. Our main contribution relies in the Discriminator’s architecture: a new BERT-based module which is able to distinguish between the generated and human-curated KPs reliably. Its characteristics allow us to use it in a low-resource scenario, where only a small amount of training data are available, obtaining an efficient Generator.
The resulting architecture achieves, on five public datasets, competitive results with respect to the state-of-the-art approaches, using less than 1% of the training data. + 2020.sustainlp-1.12 + + + Quasi-Multitask Learning: an Efficient Surrogate for Obtaining Model Ensembles + NorbertKis-Szabó + GáborBerend + 97–106 + We propose the technique of quasi-multitask learning (Q-MTL), a simple and easy to implement modification of standard multitask learning, in which the tasks to be modeled are identical. With this easy modification of a standard neural classifier we can get benefits similar to an ensemble of classifiers with a fraction of the resources required. We illustrate it through a series of sequence labeling experiments over a diverse set of languages, that applying Q-MTL consistently increases the generalization ability of the applied models. The proposed architecture can be regarded as a new regularization technique that encourages the model to develop an internal representation of the problem at hand which is beneficial to multiple output units of the classifier at the same time. Our experiments corroborate that by relying on the proposed algorithm, we can approximate the quality of an ensemble of classifiers at a fraction of computational resources required. Additionally, our results suggest that Q-MTL handles the presence of noisy training labels better than ensembles. + 2020.sustainlp-1.13 + + + A Little Bit Is Worse Than None: Ranking with Limited Training Data + XinyuZhang + AndrewYates + JimmyLin + 107–112 + Researchers have proposed simple yet effective techniques for the retrieval problem based on using BERT as a relevance classifier to rerank initial candidates from keyword search. In this work, we tackle the challenge of fine-tuning these models for specific domains in a data and computationally efficient manner. Typically, researchers fine-tune models using corpus-specific labeled data from sources such as TREC.
We first answer the question: How much data of this type do we need? Recognizing that the most computationally efficient training is no training, we explore zero-shot ranking using BERT models that have already been fine-tuned with the large MS MARCO passage retrieval dataset. We arrive at the surprising and novel finding that “some” labeled in-domain data can be worse than none at all. + 2020.sustainlp-1.14 + + + Predictive Model Selection for Transfer Learning in Sequence Labeling Tasks + ParulAwasthy + BishwaranjanBhattacharjee + JohnKender + RaduFlorian + 113–118 + Transfer learning is a popular technique to learn a task using less training data and fewer compute resources. However, selecting the correct source model for transfer learning is a challenging task. We demonstrate a novel predictive method that determines which existing source model would minimize error for transfer learning to a given target. This technique does not require learning for prediction, and avoids computational costs of trial-and-error. We have evaluated this technique on nine datasets across diverse domains, including newswire, user forums, air flight booking, cybersecurity news, etc. We show that it performs better than existing techniques such as fine-tuning over vanilla BERT, or curriculum learning over the largest dataset on top of BERT, resulting in average F1 score gains in excess of 3%. Moreover, our technique consistently selects the best model using fewer tries. + 2020.sustainlp-1.15 + + + Load What You Need: Smaller Versions of Multilingual <fixed-case>BERT</fixed-case> + AmineAbdaoui + CamillePradel + GrégoireSigel + 119–123 + Pre-trained Transformer-based models are achieving state-of-the-art results on a variety of Natural Language Processing data sets. However, the size of these models is often a drawback for their deployment in real production applications. In the case of multilingual models, most of the parameters are located in the embeddings layer.
Therefore, reducing the vocabulary size should have an important impact on the total number of parameters. In this paper, we propose to extract smaller models that handle fewer number of languages according to the targeted corpora. We present an evaluation of smaller versions of multilingual BERT on the XNLI data set, but we believe that this method may be applied to other multilingual transformers. The obtained results confirm that we can generate smaller models that keep comparable results, while reducing up to 45% of the total number of parameters. We compared our models with DistilmBERT (a distilled version of multilingual BERT) and showed that unlike language reduction, distillation induced a 1.7% to 6% drop in the overall accuracy on the XNLI data set. The presented models and code are publicly available. + 2020.sustainlp-1.16 + + + <fixed-case>S</fixed-case>queeze<fixed-case>BERT</fixed-case>: What can computer vision teach <fixed-case>NLP</fixed-case> about efficient neural networks? + ForrestIandola + AlbertShaw + RaviKrishna + KurtKeutzer + 124–135 + Humans read and write hundreds of billions of messages every day. Further, due to the availability of large datasets, large computing systems, and better neural network models, natural language processing (NLP) technology has made significant strides in understanding, proofreading, and organizing these messages. Thus, there is a significant opportunity to deploy NLP in myriad applications to help web users, social networks, and businesses. Toward this end, we consider smartphones and other mobile devices as crucial platforms for deploying NLP models at scale. However, today’s highly-accurate NLP neural network models such as BERT and RoBERTa are extremely computationally expensive, with BERT-base taking 1.7 seconds to classify a text snippet on a Pixel 3 smartphone. 
To begin to address this problem, we draw inspiration from the computer vision community, where work such as MobileNet has demonstrated that grouped convolutions (e.g. depthwise convolutions) can enable speedups without sacrificing accuracy. We demonstrate how to replace several operations in self-attention layers with grouped convolutions, and we use this technique in a novel network architecture called SqueezeBERT, which runs 4.3x faster than BERT-base on the Pixel 3 while achieving competitive accuracy on the GLUE test set. A PyTorch-based implementation of SqueezeBERT is available as part of the Hugging Face Transformers library: https://huggingface.co/squeezebert + 2020.sustainlp-1.17 + 2020.sustainlp-1.17.OptionalSupplementaryMaterial.zip + + + Analysis of Resource-efficient Predictive Models for Natural Language Processing + RajPranesh + AmbeshShekhar + 136–140 + In this paper, we presented an analyses of the resource efficient predictive models, namely Bonsai, Binary Neighbor Compression(BNC), ProtoNN, Random Forest, Naive Bayes and Support vector machine(SVM), in the machine learning field for resource constraint devices. These models try to minimize resource requirements like RAM and storage without hurting the accuracy much. We utilized these models on multiple benchmark natural language processing tasks, which were sentimental analysis, spam message detection, emotion analysis and fake news classification. The experiment results shows that the tree-based algorithm, Bonsai, surpassed the rest of the machine learning algorithms by achieve higher accuracy scores while having significantly lower memory usage. + 2020.sustainlp-1.18 + + + Towards Accurate and Reliable Energy Measurement of <fixed-case>NLP</fixed-case> Models + QingqingCao + ArunaBalasubramanian + NiranjanBalasubramanian + 141–148 + Accurate and reliable measurement of energy consumption is critical for making well-informed design choices when choosing and training large scale NLP models. 
In this work, we show that existing software-based energy estimations are not accurate because they do not take into account hardware differences and how resource utilization affects energy consumption. We conduct energy measurement experiments with four different models for a question answering task. We quantify the error of existing software-based energy estimations by using a hardware power meter that provides highly accurate energy measurements. Our key takeaway is the need for a more accurate energy estimation model that takes into account hardware variabilities and the non-linear relationship between resource utilization and energy consumption. We release the code and data at https://github.com/csarron/sustainlp2020-energy. + 2020.sustainlp-1.19 + + + <fixed-case>F</fixed-case>ast<fixed-case>F</fixed-case>ormers: Highly Efficient Transformer Models for Natural Language Understanding + Young JinKim + HanyHassan + 149–158 + Transformer-based models are the state-of-the-art for Natural Language Understanding (NLU) applications. Models are getting bigger and better on various tasks. However, Transformer models remain computationally challenging since they are not efficient at inference-time compared to traditional approaches. In this paper, we present FastFormers, a set of recipes to achieve efficient inference-time performance for Transformer-based models on various NLU tasks. We show how carefully utilizing knowledge distillation, structured pruning and numerical optimization can lead to drastic improvements on inference efficiency. We provide effective recipes that can guide practitioners to choose the best settings for various NLU tasks and pretrained models. Applying the proposed recipes to the SuperGLUE benchmark, we achieve from 9.8x up to 233.9x speed-up compared to out-of-the-box models on CPU. On GPU, we also achieve up to 12.4x speed-up with the presented methods. 
We show that FastFormers can drastically reduce cost of serving 100 million requests from 4,223 USD to just 18 USD on an Azure F16s_v2 instance. This translates to a sustainable runtime by reducing energy consumption 6.9x - 125.8x according to the metrics used in the SustaiNLP 2020 shared task. + 2020.sustainlp-1.20 + + + A comparison between <fixed-case>CNN</fixed-case>s and <fixed-case>WFA</fixed-case>s for Sequence Classification + AriadnaQuattoni + XavierCarreras + 159–163 + We compare a classical CNN architecture for sequence classification involving several convolutional and max-pooling layers against a simple model based on weighted finite state automata (WFA). Each model has its advantages and disadvantages and it is possible that they could be combined. However, we believe that the first research goal should be to investigate and understand how do these two apparently dissimilar models compare in the context of specific natural language processing tasks. This paper is the first step towards that goal. Our experiments with five sequence classification datasets suggest that, despite the apparent simplicity of WFA models and training algorithms, the performance of WFAs is comparable to that of the CNNs. + 2020.sustainlp-1.21 + + + Counterfactual Augmentation for Training Next Response Selection + SeungtaekChoi + MyeonghoJeong + JinyoungYeo + Seung-wonHwang + 164–168 + This paper studies label augmentation for training dialogue response selection. The existing model is trained by “observational” annotation, where one observed response is annotated as gold. In this paper, we propose “counterfactual augmentation” of pseudo-positive labels. We validate that the effectiveness of augmented labels are comparable to positives, such that ours outperform state-of-the-arts without augmentation. + 2020.sustainlp-1.22 + + + Do We Need to Create Big Datasets to Learn a Task? 
+ SwaroopMishra + Bhavdeep SinghSachdeva + 169–173 + Deep Learning research has been largely accelerated by the development of huge datasets such as Imagenet. The general trend has been to create big datasets to make a deep neural network learn. A huge amount of resources is being spent in creating these big datasets, developing models, training them, and iterating this process to dominate leaderboards. We argue that the trend of creating bigger datasets needs to be revised by better leveraging the power of pre-trained language models. Since the language models have already been pre-trained with huge amount of data and have basic linguistic knowledge, there is no need to create big datasets to learn a task. Instead, we need to create a dataset that is sufficient for the model to learn various task-specific terminologies, such as ‘Entailment’, ‘Neutral’, and ‘Contradiction’ for NLI. As evidence, we show that RoBERTA is able to achieve near-equal performance on 2% data of SNLI. We also observe competitive zero-shot generalization on several OOD datasets. In this paper, we propose a baseline algorithm to find the optimal dataset for learning a task. + 2020.sustainlp-1.23 + + + Overview of the <fixed-case>S</fixed-case>ustai<fixed-case>NLP</fixed-case> 2020 Shared Task + AlexWang + ThomasWolf + 174–178 + We describe the SustaiNLP 2020 shared task: efficient inference on the SuperGLUE benchmark (Wang et al., 2019). Participants are evaluated based on performance on the benchmark as well as energy consumed in making predictions on the test sets. We describe the task, its organization, and the submitted systems. Across the six submissions to the shared task, participants achieved efficiency gains of 20× over a standard BERT (Devlin et al., 2019) baseline, while losing less than an absolute point in performance. + 2020.sustainlp-1.24 + +
+
diff --git a/data/xml/2020.trac.xml b/data/xml/2020.trac.xml index 6dff14d001..25d161785f 100644 --- a/data/xml/2020.trac.xml +++ b/data/xml/2020.trac.xml @@ -282,7 +282,7 @@ AkankshaBansal AkashBhagat YogeshDawer - borninilahiri + BorniniLahiri Atul Kr.Ojha 158–168 In this paper, we discuss the development of a multilingual annotated corpus of misogyny and aggression in Indian English, Hindi, and Indian Bangla as part of a project on studying and automatically identifying misogyny and communalism on social media (the ComMA Project). The dataset is collected from comments on YouTube videos and currently contains a total of over 20,000 comments. The comments are annotated at two levels - aggression (overtly aggressive, covertly aggressive, and non-aggressive) and misogyny (gendered and non-gendered). We describe the process of data collection, the tagset used for annotation, and issues and challenges faced during the process of annotation. Finally, we discuss the results of the baseline experiments conducted to develop a classifier for misogyny in the three languages. diff --git a/data/xml/2020.winlp.xml b/data/xml/2020.winlp.xml index 49bd1e2bcd..f6788188e8 100644 --- a/data/xml/2020.winlp.xml +++ b/data/xml/2020.winlp.xml @@ -118,7 +118,7 @@ <fixed-case>T</fixed-case>igrinya Automatic Speech recognition with Morpheme based recognition units HafteAbera - sebsibehailemariam + SebsibeHailemariam 46–50 The Tigrinya language is agglutinative and has a large number of inflected and derived forms of words. Therefore a Tigrinya large vocabulary continuous speech recognition system often has a large number of different units and a high out-of-vocabulary (OOV) rate if a word is used as a recognition unit of a language model (LM) and lexicon. Therefore a morpheme-based approach has often been used and a morpheme is used as the recognition unit to reduce the high OOV rate. 
This paper presents an automatic speech recognition experiment conducted to see the effect of OOV words on the performance speech recognition system for Tigrinya. We tried to solve the OOV problem by using morphemes as lexicon and language model units. It has been found that the morpheme-based recognition system is better lexical and language modeling units than words. An absolute improvement (in word recognition accuracy) of 3.45 token and 8.36 types has been obtained as a result of using a morph-based vocabulary. 10.18653/v1/2020.winlp-1.12 diff --git a/data/xml/2020.wnut.xml b/data/xml/2020.wnut.xml new file mode 100644 index 0000000000..4f5739932c --- /dev/null +++ b/data/xml/2020.wnut.xml @@ -0,0 +1,747 @@ + + + + + Proceedings of the Sixth Workshop on Noisy User-generated Text (W-NUT 2020) + WeiXu + AlanRitter + TimBaldwin + AfshinRahimi + Association for Computational Linguistics +
Online
+ November + 2020 + + + 2020.wnut-1.0 + + + May <fixed-case>I</fixed-case> Ask Who’s Calling? Named Entity Recognition on Call Center Transcripts for Privacy Law Compliance + MicaelaKaplan + 1–6 + We investigate using Named Entity Recognition on a new type of user-generated text: a call center conversation. These conversations combine problems from spontaneous speech with problems novel to conversational Automated Speech Recognition, including incorrect recognition, alongside other common problems from noisy user-generated text. Using our own corpus with new annotations, training custom contextual string embeddings, and applying a BiLSTM-CRF, we match state-of-the-art results on our novel task. + 2020.wnut-1.1 + + + “Did you really mean what you said?” : Sarcasm Detection in <fixed-case>H</fixed-case>indi-<fixed-case>E</fixed-case>nglish Code-Mixed Data using Bilingual Word Embeddings + AkshitaAggarwal + AnshulWadhawan + AnshimaChaudhary + KavitaMaurya + 7–15 + With the increased use of social media platforms by people across the world, many new interesting NLP problems have come into existence. One such being the detection of sarcasm in the social media texts. We present a corpus of tweets for training custom word embeddings and a Hinglish dataset labelled for sarcasm detection. We propose a deep learning based approach to address the issue of sarcasm detection in Hindi-English code mixed tweets using bilingual word embeddings derived from FastText and Word2Vec approaches. We experimented with various deep learning models, including CNNs, LSTMs, Bi-directional LSTMs (with and without attention). We were able to outperform all state-of-the-art performances with our deep learning models, with attention based Bi-directional LSTMs giving the best performance exhibiting an accuracy of 78.49%. 
+ 2020.wnut-1.2 + + + Noisy Text Data: Achilles’ Heel of <fixed-case>BERT</fixed-case> + AnkitKumar + PiyushMakhija + AnujGupta + 16–21 + Owing to the phenomenal success of BERT on various NLP tasks and benchmark datasets, industry practitioners are actively experimenting with fine-tuning BERT to build NLP applications for solving industry use cases. For most datasets that are used by practitioners to build industrial NLP applications, it is hard to guarantee absence of any noise in the data. While BERT has performed exceedingly well for transferring the learnings from one use case to another, it remains unclear how BERT performs when fine-tuned on noisy text. In this work, we explore the sensitivity of BERT to noise in the data. We work with most commonly occurring noise (spelling mistakes, typos) and show that this results in significant degradation in the performance of BERT. We present experimental results to show that BERT’s performance on fundamental NLP tasks like sentiment analysis and textual similarity drops significantly in the presence of (simulated) noise on benchmark datasets viz. IMDB Movie Review, STS-B, SST-2. Further, we identify shortcomings in the existing BERT pipeline that are responsible for this drop in performance. Our findings suggest that practitioners need to be wary of the presence of noise in their datasets while fine-tuning BERT to solve industry use cases. + 2020.wnut-1.3 + 2020.wnut-1.3.OptionalSupplementaryMaterial.zip + + + Determining Question-Answer Plausibility in Crowdsourced Datasets Using Multi-Task Learning + RachelGardner + MayaVarma + ClareZhu + RanjayKrishna + 22–27 + Datasets extracted from social networks and online forums are often prone to the pitfalls of natural language, namely the presence of unstructured and noisy data. 
In this work, we seek to enable the collection of high-quality question-answer datasets from social media by proposing a novel task for automated quality analysis and data cleaning: question-answer (QA) plausibility. Given a machine or user-generated question and a crowd-sourced response from a social media user, we determine if the question and response are valid; if so, we identify the answer within the free-form response. We design BERT-based models to perform the QA plausibility task, and we evaluate the ability of our models to generate a clean, usable question-answer dataset. Our highest-performing approach consists of a single-task model which determines the plausibility of the question, followed by a multi-task model which evaluates the plausibility of the response as well as extracts answers (Question Plausibility AUROC=0.75, Response Plausibility AUROC=0.78, Answer Extraction F1=0.665). + 2020.wnut-1.4 + + + Combining <fixed-case>BERT</fixed-case> with Static Word Embeddings for Categorizing Social Media + IsraaAlghanmi + LuisEspinosa Anke + StevenSchockaert + 28–33 + Pre-trained neural language models (LMs) have achieved impressive results in various natural language processing tasks, across different languages. Surprisingly, this extends to the social media genre, despite the fact that social media often has very different characteristics from the language that LMs have seen during training. A particularly striking example is the performance of AraBERT, an LM for the Arabic language, which is successful in categorizing social media posts in Arabic dialects, despite only having been trained on Modern Standard Arabic. Our hypothesis in this paper is that the performance of LMs for social media can nonetheless be improved by incorporating static word vectors that have been specifically trained on social media. We show that a simple method for incorporating such word vectors is indeed successful in several Arabic and English benchmarks. 
Curiously, however, we also find that similar improvements are possible with word vectors that have been trained on traditional text sources (e.g. Wikipedia). + 2020.wnut-1.5 + + + Enhanced Sentence Alignment Network for Efficient Short Text Matching + ZheHu + ZuohuiFu + ChengPeng + WeiweiWang + 34–40 + Cross-sentence attention has been widely applied in text matching, in which model learns the aligned information between two intermediate sequence representations to capture their semantic relationship. However, commonly the intermediate representations are generated solely based on the preceding layers and the models may suffer from error propagation and unstable matching, especially when multiple attention layers are used. In this paper, we propose an enhanced sentence alignment network with simple gated feature augmentation, where the model is able to flexibly integrate both original word and contextual features to improve the cross-sentence attention. Moreover, our model is less complex with fewer parameters compared to many state-of-the-art structures. Experiments on three benchmark datasets validate our model capacity for text matching. + 2020.wnut-1.6 + 2020.wnut-1.6.OptionalSupplementaryMaterial.zip + + + <fixed-case>PHINC</fixed-case>: A Parallel <fixed-case>H</fixed-case>inglish Social Media Code-Mixed Corpus for Machine Translation + VivekSrivastava + MayankSingh + 41–49 + Code-mixing is the phenomenon of using more than one language in a sentence. In the multilingual communities, it is a very frequently observed pattern of communication on social media platforms. Flexibility to use multiple languages in one text message might help to communicate efficiently with the target audience. But, the noisy user-generated code-mixed text adds to the challenge of processing and understanding natural language to a much larger extent. Machine translation from monolingual source to the target language is a well-studied research problem. 
Here, we demonstrate that widely popular and sophisticated translation systems such as Google Translate fail at times to translate code-mixed text effectively. To address this challenge, we present a parallel corpus of the 13,738 code-mixed Hindi-English sentences and their corresponding human translation in English. In addition, we also propose a translation pipeline build on top of Google Translate. The evaluation of the proposed pipeline on PHINC demonstrates an increase in the performance of the underlying system. With minimal effort, we can extend the dataset and the proposed approach to other code-mixing language pairs. + 2020.wnut-1.7 + + + Cross-lingual sentiment classification in low-resource <fixed-case>B</fixed-case>engali language + SalimSazzed + 50–60 + Sentiment analysis research in low-resource languages such as Bengali is still unexplored due to the scarcity of annotated data and the lack of text processing tools. Therefore, in this work, we focus on generating resources and showing the applicability of the cross-lingual sentiment analysis approach in Bengali. For benchmarking, we created and annotated a comprehensive corpus of around 12000 Bengali reviews. To address the lack of standard text-processing tools in Bengali, we leverage resources from English utilizing machine translation. We determine the performance of supervised machine learning (ML) classifiers in machine-translated English corpus and compare it with the original Bengali corpus. Besides, we examine sentiment preservation in the machine-translated corpus utilizing Cohen’s Kappa and Gwet’s AC1. To circumvent the laborious data labeling process, we explore lexicon-based methods and study the applicability of utilizing cross-domain labeled data from the resource-rich language. We find that supervised ML classifiers show comparable performances in Bengali and machine-translated English corpus. 
By utilizing labeled data, they achieve 15%-20% higher F1 scores compared to both lexicon-based and transfer learning-based methods. Besides, we observe that machine translation does not alter the sentiment polarity of the review for most of the cases. Our experimental results demonstrate that the machine translation based cross-lingual approach can be an effective way for sentiment classification in Bengali. + 2020.wnut-1.8 + + + The Non-native Speaker Aspect: <fixed-case>I</fixed-case>ndian <fixed-case>E</fixed-case>nglish in Social Media + RupakSarkar + SayantanMahinder + AshiqurKhudaBukhsh + 61–70 + As the largest institutionalized second language variety of English, Indian English has received a sustained focus from linguists for decades. However, to the best of our knowledge, no prior study has contrasted web-expressions of Indian English in noisy social media with English generated by a social media user base that are predominantly native speakers. In this paper, we address this gap in the literature through conducting a comprehensive analysis considering multiple structural and semantic aspects. In addition, we propose a novel application of language models to perform automatic linguistic quality assessment. + 2020.wnut-1.9 + + + Sentence Boundary Detection on Line Breaks in <fixed-case>J</fixed-case>apanese + YutaHayashibe + KensukeMitsuzawa + 71–75 + For NLP, sentence boundary detection (SBD) is an essential task to decompose a text into sentences. Most of the previous studies have used a simple rule that uses only typical characters as sentence boundaries. However, some characters may or may not be sentence boundaries depending on the context. We focused on line breaks in them. We newly constructed annotated corpora, implemented sentence boundary detectors, and analyzed performance of SBD in several settings. 
+ 2020.wnut-1.10 + + + Non-ingredient Detection in User-generated Recipes using the Sequence Tagging Approach + YasuhiroYamaguchi + ShintaroInuzuka + MakotoHiramatsu + JunHarashima + 76–80 + Recently, the number of user-generated recipes on the Internet has increased. In such recipes, users are generally supposed to write a title, an ingredient list, and steps to create a dish. However, some items in an ingredient list in a user-generated recipe are not actually edible ingredients. For example, headings, comments, and kitchenware sometimes appear in an ingredient list because users can freely write the list in their recipes. Such noise makes it difficult for computers to use recipes for a variety of tasks, such as calorie estimation. To address this issue, we propose a non-ingredient detection method inspired by a neural sequence tagging model. In our experiment, we annotated 6,675 ingredients in 600 user-generated recipes and showed that our proposed method achieved a 93.3 F1 score. + 2020.wnut-1.11 + + + Generating Fact Checking Summaries for Web Claims + RahulMishra + DhruvGupta + MarkusLeippold + 81–90 + We present SUMO, a neural attention-based approach that learns to establish correctness of textual claims based on evidence in the form of text documents (e.g., news articles or web documents). SUMO further generates an extractive summary by presenting a diversified set of sentences from the documents that explain its decision on the correctness of the textual claim. Prior approaches to address the problem of fact checking and evidence extraction have relied on simple concatenation of claim and document word embeddings as an input to claim driven attention weight computation. This is done so as to extract salient words and sentences from the documents that help establish the correctness of the claim. However this design of claim-driven attention fails to capture the contextual information in documents properly. 
We improve on the prior art by using improved claim and title guided hierarchical attention to model effective contextual cues. We show the efficacy of our approach on political, healthcare, and environmental datasets. + 2020.wnut-1.12 + + + Intelligent Analyses on Storytelling for Impact Measurement + KoenKicken + TessaDe Maesschalck + BartVanrumste + TomDe Keyser + Hee ReenShim + 91–100 + This paper explores how Dutch diary fragments, written by family coaches in the social sector, can be analysed automatically using machine learning techniques to quantitatively measure the impact of social coaching. The focus lays on two tasks: determining which sentiment a fragment contains (sentiment analysis) and investigating which fundamental social rights (education, employment, legal aid, etc.) are addressed in the fragment. To train and test the new algorithms, a dataset consisting of 1715 Dutch diary fragments is used. These fragments are manually labelled on sentiment and on the applicable fundamental social rights. The sentiment analysis models were trained to classify the fragments into three classes: negative, neutral or positive. Fine-tuning the Dutch pre-trained Bidirectional Encoder Representations from Transformers (BERTje) (de Vries et al., 2019) language model surpassed the more classic algorithms by correctly classifying 79.6% of the fragments on the sentiment analysis, which is considered as a good result. This technique also achieved the best results in the identification of the fundamental rights, where for every fragment the three most likely fundamental rights were given as output. In this way, 93% of the present fundamental rights were correctly recognised. To our knowledge, we are the first to try to extract social rights from written text with the help of Natural Language Processing techniques. 
+ 2020.wnut-1.13 + + + An Empirical Analysis of Human-Bot Interaction on <fixed-case>R</fixed-case>eddit + Ming-ChengMa + John P.Lalor + 101–106 + Automated agents (“bots”) have emerged as an ubiquitous and influential presence on social media. Bots engage on social media platforms by posting content and replying to other users on the platform. In this work we conduct an empirical analysis of the activity of a single bot on Reddit. Our goal is to determine whether bot activity (in the form of posted comments on the website) has an effect on how humans engage on Reddit. We find that (1) the sentiment of a bot comment has a significant, positive effect on the subsequent human reply, and (2) human Reddit users modify their comment behaviors to overlap with the text of the bot, similar to how humans modify their text to mimic other humans in conversation. Understanding human-bot interactions on social media with relatively simple bots is important for preparing for more advanced bots in the future. + 2020.wnut-1.14 + + + Detecting Trending Terms in Cybersecurity Forum Discussions + JackHughes + SethAycock + AndrewCaines + PaulaButtery + AliceHutchings + 107–115 + We present a lightweight method for identifying currently trending terms in relation to a known prior of terms, using a weighted log-odds ratio with an informative prior. We apply this method to a dataset of posts from an English-language underground hacking forum, spanning over ten years of activity, with posts containing misspellings, orthographic variation, acronyms, and slang. Our statistical approach supports analysis of linguistic change and discussion topics over time, without a requirement to train a topic model for each time interval for analysis. We evaluate the approach by comparing the results to TF-IDF using the discounted cumulative gain metric with human annotations, finding our method outperforms TF-IDF on information retrieval. 
+ 2020.wnut-1.15 + + + Service registration chatbot: collecting and comparing dialogues from <fixed-case>AMT</fixed-case> workers and service’s users + LucaMolteni + MittulSingh + JuhoLeinonen + KatriLeino + MikkoKurimo + EmanueleDella Valle + 116–121 + Crowdsourcing is the go-to solution for data collection and annotation in the context of NLP tasks. Nevertheless, crowdsourced data is noisy by nature; the source is often unknown and additional validation work is performed to guarantee the dataset’s quality. In this article, we compare two crowdsourcing sources on a dialogue paraphrasing task revolving around a chatbot service. We observe that workers hired on crowdsourcing platforms produce lexically poorer and less diverse rewrites than service users engaged voluntarily. Notably enough, on dialogue clarity and optimality, the two paraphrase sources’ human-perceived quality does not differ significantly. Furthermore, for the chatbot service, the combined crowdsourced data is enough to train a transformer-based Natural Language Generation (NLG) system. To enable similar services, we also release tools for collecting data and training the dialogue-act-based transformer-based NLG module. + 2020.wnut-1.16 + + + Automated Assessment of Noisy Crowdsourced Free-text Answers for <fixed-case>H</fixed-case>indi in Low Resource Setting + DollyAgarwal + SomyaGupta + NishantBaghel + 122–131 + The requirement of performing assessments continually on a larger scale necessitates the implementation of automated systems for evaluation of the learners’ responses to free-text questions. We target children of age group 8-14 years and use an ASR integrated assessment app to crowdsource learners’ responses to free text questions in Hindi. The app helped collect 39641 user answers to 35 different questions of Science topics. Since the users are young children from rural India and may not be well-equipped with technology, it brings in various noise types in the answers. 
We describe these noise types and propose a preprocessing pipeline to denoise user’s answers. We showcase the performance of different similarity metrics on the noisy and denoised versions of user and model answers. Our findings have large-scale applications for automated answer assessment for school children in India in low resource settings. + 2020.wnut-1.17 + + + Punctuation Restoration using Transformer Models for Resource-Rich and -Poor Languages + TanvirulAlam + AkibKhan + FirojAlam + 132–142 + Punctuation restoration is a common post-processing problem for Automatic Speech Recognition (ASR) systems. It is important to improve the readability of the transcribed text for the human reader and facilitate NLP tasks. Current state-of-art address this problem using different deep learning models. Recently, transformer models have proven their success in downstream NLP tasks, and these models have been explored very little for the punctuation restoration problem. In this work, we explore different transformer based models and propose an augmentation strategy for this task, focusing on high-resource (English) and low-resource (Bangla) languages. For English, we obtain comparable state-of-the-art results, while for Bangla, it is the first reported work, which can serve as a strong baseline for future work. We have made our developed Bangla dataset publicly available for the research community. + 2020.wnut-1.18 + + + Truecasing <fixed-case>G</fixed-case>erman user-generated conversational text + YuliaGrishina + ThomasGueudre + RalfWinkler + 143–148 + True-casing, the task of restoring proper case to (generally) lower case input, is important in downstream tasks and for screen display. In this paper, we investigate truecasing as an intrinsic task and present several experiments on noisy user queries to a voice-controlled dialog system. 
In particular, we compare a rule-based, an n-gram language model (LM) and a recurrent neural network (RNN) approaches, evaluating the results on a German Q&A corpus and reporting accuracy for different case categories. We show that while RNNs reach higher accuracy especially on large datasets, character n-gram models with interpolation are still competitive, in particular on mixed-case words where their fall-back mechanisms come into play. + 2020.wnut-1.19 + + + Fine-Tuning <fixed-case>MT</fixed-case> systems for Robustness to Second-Language Speaker Variations + Md Mahfuz IbnAlam + AntoniosAnastasopoulos + 149–158 + The performance of neural machine translation (NMT) systems only trained on a single language variant degrades when confronted with even slightly different language variations. With this work, we build upon previous work to explore how to mitigate this issue. We show that fine-tuning using naturally occurring noise along with pseudo-references (i.e. “corrected” non-native inputs translated using the baseline NMT system) is a promising solution towards systems robust to such type of input variations. We focus on four translation pairs, from English to Spanish, Italian, French, and Portuguese, with our system achieving improvements of up to 3.1 BLEU points compared to the baselines, establishing a new state-of-the-art on the JFLEG-ES dataset. All datasets and code are publicly available here: https://github.com/mahfuzibnalam/finetuning_for_robustness . + 2020.wnut-1.20 + + + Impact of <fixed-case>ASR</fixed-case> on <fixed-case>A</fixed-case>lzheimer’s Disease Detection: All Errors are Equal, but Deletions are More Equal than Others + AparnaBalagopalan + KseniaShkaruta + JekaterinaNovikova + 159–164 + Automatic Speech Recognition (ASR) is a critical component of any fully-automated speech-based dementia detection model. However, despite years of speech recognition research, little is known about the impact of ASR accuracy on dementia detection. 
In this paper, we experiment with controlled amounts of artificially generated ASR errors and investigate their influence on dementia detection. We find that deletion errors affect detection performance the most, due to their impact on the features of syntactic complexity and discourse representation in speech. We show the trend to be generalisable across two different datasets for cognitive impairment detection. As a conclusion, we propose optimising the ASR to reflect a higher penalty for deletion errors in order to improve dementia detection performance. + 2020.wnut-1.21 + 2020.wnut-1.21.OptionalSupplementaryMaterial.zip + + + Detecting Entailment in Code-Mixed <fixed-case>H</fixed-case>indi-<fixed-case>E</fixed-case>nglish Conversations + SharanyaChakravarthy + AnjanaUmapathy + Alan WBlack + 165–170 + The presence of large-scale corpora for Natural Language Inference (NLI) has spurred deep learning research in this area, though much of this research has focused solely on monolingual data. Code-mixing is the intertwined usage of multiple languages, and is commonly seen in informal conversations among polyglots. Given the rising importance of dialogue agents, it is imperative that they understand code-mixing, but the scarcity of code-mixed Natural Language Understanding (NLU) datasets has precluded research in this area. The dataset by Khanuja et. al. for detecting conversational entailment in code-mixed Hindi-English text is the first of its kind. We investigate the effectiveness of language modeling, data augmentation, translation, and architectural approaches to address the code-mixed, conversational, and low-resource aspects of this dataset. We obtain an 8.09% increase in test set accuracy over the current state of the art. + 2020.wnut-1.22 + + + Detecting Objectifying Language in Online Professor Reviews + AngieWaller + KyleGorman + 171–180 + Student reviews often make reference to professors’ physical appearances. 
Until recently RateMyProfessors.com, the website of this study’s focus, used a design feature to encourage a “hot or not” rating of college professors. In the wake of recent #MeToo and #TimesUp movements, social awareness of the inappropriateness of these reviews has grown; however, objectifying comments remain and continue to be posted in this online context. We describe two supervised text classifiers for detecting objectifying commentary in professor reviews. We then ensemble these classifiers and use the resulting model to track objectifying commentary at scale. We measure correlations between objectifying commentary, changes to the review website interface, and teacher gender across a ten-year period. + 2020.wnut-1.23 + 2020.wnut-1.23.OptionalSupplementaryMaterial.pdf + + + Annotation Efficient Language Identification from Weak Labels + ShriphaniPalakodety + AshiqurKhudaBukhsh + 181–192 + India is home to several languages with more than 30m speakers. These languages exhibit significant presence on social media platforms. However, several of these widely-used languages are under-addressed by current Natural Language Processing (NLP) models and resources. User generated social media content in these languages is also typically authored in the Roman script as opposed to the traditional native script further contributing to resource scarcity. In this paper, we leverage a minimally supervised NLP technique to obtain weak language labels from a large-scale Indian social media corpus leading to a robust and annotation-efficient language-identification technique spanning nine Romanized Indian languages. In fast-spreading pandemic situations such as the current COVID-19 situation, information processing objectives might be heavily tilted towards under-served languages in densely populated regions. We release our models to facilitate downstream analyses in these low-resource languages. 
Experiments across multiple social media corpora demonstrate the model’s robustness and provide several interesting insights on Indian language usage patterns on social media. We release an annotated data set of 1,000 comments in ten Romanized languages as a social media evaluation benchmark. + 2020.wnut-1.24 + + + Fantastic Features and Where to Find Them: Detecting Cognitive Impairment with a Subsequence Classification Guided Approach + BenEyre + AparnaBalagopalan + JekaterinaNovikova + 193–199 + Despite the widely reported success of embedding-based machine learning methods on natural language processing tasks, the use of more easily interpreted engineered features remains common in fields such as cognitive impairment (CI) detection. Manually engineering features from noisy text is time and resource consuming, and can potentially result in features that do not enhance model performance. To combat this, we describe a new approach to feature engineering that leverages sequential machine learning models and domain knowledge to predict which features help enhance performance. We provide a concrete example of this method on a standard data set of CI speech and demonstrate that CI classification accuracy improves by 2.3% over a strong baseline when using features produced by this method. This demonstration provides an example of how this method can be used to assist classification in fields where interpretability is important, such as health care. + 2020.wnut-1.25 + 2020.wnut-1.25.OptionalSupplementaryMaterial.zip + + + Quantifying the Evaluation of Heuristic Methods for Textual Data Augmentation + OmidKashefi + RebeccaHwa + 200–208 + Data augmentation has been shown to be effective in providing more training data for machine learning and resulting in more robust classifiers. However, for some problems, there may be multiple augmentation heuristics, and the choices of which one to use may significantly impact the success of the training. 
In this work, we propose a metric for evaluating augmentation heuristics; specifically, we quantify the extent to which an example is “hard to distinguish” by considering the difference between the distribution of the augmented samples of different classes. Experimenting with multiple heuristics in two prediction tasks (positive/negative sentiment and verbosity/conciseness) validates our claims by revealing the connection between the distribution difference of different classes and the classification accuracy. + 2020.wnut-1.26 + + + An Empirical Survey of Unsupervised Text Representation Methods on <fixed-case>T</fixed-case>witter Data + LiliWang + ChongyangGao + JasonWei + WeichengMa + RuiboLiu + SoroushVosoughi + 209–214 + The field of NLP has seen unprecedented achievements in recent years. Most notably, with the advent of large-scale pre-trained Transformer-based language models, such as BERT, there has been a noticeable improvement in text representation. It is, however, unclear whether these improvements translate to noisy user-generated text, such as tweets. In this paper, we present an experimental survey of a wide range of well-known text representation techniques for the task of text clustering on noisy Twitter data. Our results indicate that the more advanced models do not necessarily work best on tweets and that more exploration in this area is needed. + 2020.wnut-1.27 + + + Civil Unrest on <fixed-case>T</fixed-case>witter (<fixed-case>CUT</fixed-case>): A Dataset of Tweets to Support Research on Civil Unrest + JustinSech + AlexandraDeLucia + Anna L.Buczak + MarkDredze + 215–221 + We present CUT, a dataset for studying Civil Unrest on Twitter. Our dataset includes 4,381 tweets related to civil unrest, hand-annotated with information related to the study of civil unrest discussion and events. Our dataset is drawn from 42 countries from 2014 to 2019. We present baseline systems trained on this data for the identification of tweets related to civil unrest. 
We include a discussion of ethical issues related to research on this topic. + 2020.wnut-1.28 + 2020.wnut-1.28.OptionalSupplementaryMaterial.pdf + + + Tweeki: Linking Named Entities on <fixed-case>T</fixed-case>witter to a Knowledge Graph + BaharehHarandizadeh + SameerSingh + 222–231 + To identify what entities are being talked about in tweets, we need to automatically link named entities that appear in tweets to structured KBs like WikiData. Existing approaches often struggle with such short, noisy texts, or their complex design and reliance on supervision make them brittle, difficult to use and maintain, and lose significance over time. Further, there is a lack of a large, linked corpus of tweets to aid researchers, along with lack of gold dataset to evaluate the accuracy of entity linking. In this paper, we introduce (1) Tweeki, an unsupervised, modular entity linking system for Twitter, (2) TweekiData, a large, automatically-annotated corpus of Tweets linked to entities in WikiData, and (3) TweekiGold, a gold dataset for entity linking evaluation. Through comprehensive analysis, we show that Tweeki is comparable to the performance of recent state-of-the-art entity linkers models, the dataset is of high quality, and a use case of how the dataset can be used to improve downstream tasks in social media analysis (geolocation prediction). + 2020.wnut-1.29 + + + Representation learning of writing style + JulienHay + Bich-LienDoan + FabricePopineau + OuassimAit Elhara + 232–243 + In this paper, we introduce a new method of representation learning that aims to embed documents in a stylometric space. Previous studies in the field of authorship analysis focused on feature engineering techniques in order to represent document styles and to enhance model performance in specific tasks. 
Instead, we directly embed documents in a stylometric space by relying on a reference set of authors and the intra-author consistency property which is one of two components in our definition of writing style. The main intuition of this paper is that we can define a general stylometric space from a set of reference authors such that, in this space, the coordinates of different documents will be close when the documents are by the same author, and spread away when they are by different authors, even for documents by authors who are not in the set of reference authors. The method we propose allows for the clustering of documents based on stylistic clues reflecting the authorship of documents. For the empirical validation of the method, we train a deep neural network model to predict authors of a large reference dataset consisting of news and blog articles. Albeit the learning process is supervised, it does not require a dedicated labeling of the data but it relies only on the metadata of the articles which are available in huge amounts. We evaluate the model on multiple datasets, on both the authorship clustering and the authorship attribution tasks. + 2020.wnut-1.30 + + + “A Little Birdie Told Me ... " - Social Media Rumor Detection + KarthikRadhakrishnan + TusharKanakagiri + SharanyaChakravarthy + VidhishaBalachandran + 244–248 + The rise in the usage of social media has placed it in a central position for news dissemination and consumption. This greatly increases the potential for proliferation of rumours and misinformation. In an effort to mitigate the spread of rumours, we tackle the related task of identifying the stance (Support, Deny, Query, Comment) of a social media post. Unlike previous works, we impose inductive biases that capture platform specific user behavior. These biases, coupled with social media fine-tuning of BERT allow for better language understanding, thus yielding an F1 score of 58.7 on the SemEval 2019 task on rumour stance detection. 
+ 2020.wnut-1.31 + 2020.wnut-1.31.OptionalSupplementaryMaterial.pdf + + + Paraphrase Generation via Adversarial Penalizations + GersonVizcarra + JoseOchoa-Luna + 249–259 + Paraphrase generation is an important problem in Natural Language Processing that has been addressed with neural network-based approaches recently. This paper presents an adversarial framework to address the paraphrase generation problem in English. Unlike previous methods, we employ the discriminator output as penalization instead of using policy gradients, and we propose a global discriminator to avoid the Monte-Carlo search. In addition, this work use and compare different settings of input representation. We compare our methods to some baselines in the Quora question pairs dataset. The results show that our framework is competitive against the previous benchmarks. + 2020.wnut-1.32 + + + <fixed-case>WNUT</fixed-case>-2020 Task 1 Overview: Extracting Entities and Relations from Wet Lab Protocols + JeniyaTabassum + WeiXu + AlanRitter + 260–267 + This paper presents the results of the wet lab information extraction task at WNUT 2020. This task consisted of two sub tasks- (1) a named entity recognition task with 13 participants; and (2) a relation extraction task with 2 participants. We outline the task, data annotation process, corpus statistics, and provide a high-level overview of the participating systems for each sub task. + 2020.wnut-1.33 + + + <fixed-case>IITKGP</fixed-case> at <fixed-case>W</fixed-case>-<fixed-case>NUT</fixed-case> 2020 Shared Task-1: Domain specific <fixed-case>BERT</fixed-case> representation for Named Entity Recognition of lab protocol + TejasVaidhya + AyushKaushal + 268–272 + Supervised models trained to predict properties from representations have been achieving high accuracy on a variety of tasks. For instance, the BERT family seems to work exceptionally well on the downstream task from NER tagging to the range of other linguistic tasks. 
But the vocabulary used in the medical field contains a lot of different tokens used only in the medical industry such as the name of different diseases, devices, organisms,medicines, etc. that makes it difficult for traditional BERT model to create contextualized embedding. In this paper, we are going to illustrate the System for Named Entity Tagging based on Bio-Bert. Experimental results show that our model gives substantial improvements over the baseline and stood the fourth runner up in terms of F1 score, and first runner up in terms of Recall with just 2.21 F1 score behind the best one. + 2020.wnut-1.34 + + + <fixed-case>P</fixed-case>ublish<fixed-case>I</fixed-case>n<fixed-case>C</fixed-case>ovid19 at <fixed-case>WNUT</fixed-case> 2020 Shared Task-1: Entity Recognition in Wet Lab Protocols using Structured Learning Ensemble and Contextualised Embeddings + JanvijaySingh + AnshulWadhawan + 273–280 + In this paper, we describe the approach that we employed to address the task of Entity Recognition over Wet Lab Protocols - a shared task in EMNLP WNUT-2020 Workshop. Our approach is composed of two phases. In the first phase, we experiment with various contextualised word embeddings (like Flair, BERT-based) and a BiLSTM-CRF model to arrive at the best-performing architecture. In the second phase, we create an ensemble composed of eleven BiLSTM-CRF models. The individual models are trained on random train-validation splits of the complete dataset. Here, we also experiment with different output merging schemes, including Majority Voting and Structured Learning Ensembling (SLE). Our final submission achieved a micro F1-score of 0.8175 and 0.7757 for the partial and exact match of the entity spans, respectively. We were ranked first and second, in terms of partial and exact match, respectively. 
+ 2020.wnut-1.35 + + + Big Green at <fixed-case>WNUT</fixed-case> 2020 Shared Task-1: Relation Extraction as Contextualized Sequence Classification + ChrisMiller + SoroushVosoughi + 281–285 + Relation and event extraction is an important task in natural language processing. We introduce a system which uses contextualized knowledge graph completion to classify relations and events between known entities in a noisy text environment. We report results which show that our system is able to effectively extract relations and events from a dataset of wet lab protocols. + 2020.wnut-1.36 + + + <fixed-case>WNUT</fixed-case> 2020 Shared Task-1: Conditional Random Field(<fixed-case>CRF</fixed-case>) based Named Entity Recognition(<fixed-case>NER</fixed-case>) for Wet Lab Protocols + KaushikAcharya + 286–289 + The paper describes how classifier model built using Conditional Random Field detects named entities in wet lab protocols. + 2020.wnut-1.37 + 2020.wnut-1.37.OptionalSupplementaryMaterial.txt + + + mgsohrab at <fixed-case>WNUT</fixed-case> 2020 Shared Task-1: Neural Exhaustive Approach for Entity and Relation Recognition Over Wet Lab Protocols + Mohammad GolamSohrab + Anh-KhoaDuong Nguyen + MakotoMiwa + HiroyaTakamura + 290–298 + We present a neural exhaustive approach that addresses named entity recognition (NER) and relation recognition (RE), for the entity and relation recognition over the wet-lab protocols shared task. We introduce BERT-based neural exhaustive approach that enumerates all possible spans as potential entity mentions and classifies them into entity types or no entity with deep neural networks to address NER. To solve relation extraction task, based on the NER predictions or given gold mentions we create all possible trigger-argument pairs and classify them into relation types or no relation. In NER task, we achieved 76.60% in terms of F-score as third rank system among the participated systems. 
In relation extraction task, we achieved 80.46% in terms of F-score as the top system in the relation extraction or recognition task. Besides we compare our model based on the wet lab protocols corpus (WLPC) with the WLPC baseline and dynamic graph-based information extraction (DyGIE) systems. + 2020.wnut-1.38 + + + Fancy Man Launches Zippo at <fixed-case>WNUT</fixed-case> 2020 Shared Task-1: A Bert Case Model for Wet Lab Entity Extraction + QingchengZeng + XiaoyangFang + ZhexinLiang + HaodingMeng + 299–304 + Automatic or semi-automatic conversion of protocols specifying steps in performing a lab procedure into machine-readable format benefits biological research a lot. These noisy, dense, and domain-specific lab protocols processing draws more and more interests with the development of deep learning. This paper presents our teamwork on WNUT 2020 shared task-1: wet lab entity extract, that we conducted studies in several models, including a BiLSTM CRF model and a Bert case model which can be used to complete wet lab entity extraction. And we mainly discussed the performance differences of Bert case under different situations such as transformers versions, case sensitivity that may don’t get enough attention before. + 2020.wnut-1.39 + + + <fixed-case>B</fixed-case>i<fixed-case>T</fixed-case>e<fixed-case>M</fixed-case> at <fixed-case>WNUT</fixed-case> 2020 Shared Task-1: Named Entity Recognition over Wet Lab Protocols using an Ensemble of Contextual Language Models + JulienKnafou + NonaNaderi + JennyCopara + DouglasTeodoro + PatrickRuch + 305–313 + Recent improvements in machine-reading technologies attracted much attention to automation problems and their possibilities. In this context, WNUT 2020 introduces a Name Entity Recognition (NER) task based on wet laboratory procedures. In this paper, we present a 3-step method based on deep neural language models that reported the best overall exact match F1-score (77.99%) of the competition. 
By fine-tuning 10 times, 10 different pretrained language models, this work shows the advantage of having more models in an ensemble based on a majority of votes strategy. On top of that, having 100 different models allowed us to analyse the combinations of ensemble that demonstrated the impact of having multiple pretrained models versus fine-tuning a pretrained model multiple times. + 2020.wnut-1.40 + + + <fixed-case>WNUT</fixed-case>-2020 Task 2: Identification of Informative <fixed-case>COVID</fixed-case>-19 <fixed-case>E</fixed-case>nglish Tweets + Dat QuocNguyen + ThanhVu + AfshinRahimi + Mai HoangDao + Linh TheNguyen + LongDoan + 314–318 + In this paper, we provide an overview of the WNUT-2020 shared task on the identification of informative COVID-19 English Tweets. We describe how we construct a corpus of 10K Tweets and organize the development and evaluation phases for this task. In addition, we also present a brief summary of results obtained from the final system evaluation submissions of 55 teams, finding that (i) many systems obtain very high performance, up to 0.91 F1 score, (ii) the majority of the submissions achieve substantially higher results than the baseline fastText (Joulin et al., 2017), and (iii) fine-tuning pre-trained language models on relevant language data followed by supervised training performs well in this task. + 2020.wnut-1.41 + + + <fixed-case>TATL</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2: A Transformer-based Baseline System for Identification of Informative <fixed-case>COVID</fixed-case>-19 <fixed-case>E</fixed-case>nglish Tweets + AnhTuan Nguyen + 319–323 + As the COVID-19 outbreak continues to spread throughout the world, more and more information about the pandemic has been shared publicly on social media. For example, there are a huge number of COVID-19 English Tweets daily on Twitter. 
However, the majority of those Tweets are uninformative, and hence it is important to be able to automatically select only the informative ones for downstream applications. In this short paper, we present our participation in the W-NUT 2020 Shared Task 2: Identification of Informative COVID-19 English Tweets. Inspired by the recent advances in pretrained Transformer language models, we propose a simple yet effective baseline for the task. Despite its simplicity, our proposed approach shows very competitive results in the leaderboard as we ranked 8 over 56 teams participated in total. + 2020.wnut-1.42 + + + <fixed-case>NHK</fixed-case>_<fixed-case>STRL</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2: <fixed-case>GAT</fixed-case>s with Syntactic Dependencies as Edges and <fixed-case>CTC</fixed-case>-based Loss for Text Classification + YukiYasuda + TaichiIshiwatari + TaroMiyazaki + JunGoto + 324–330 + The outbreak of COVID-19 has greatly impacted our daily lives. In these circumstances, it is important to grasp the latest information to avoid causing too much fear and panic. To help grasp new information, extracting information from social networking sites is one of the effective ways. In this paper, we describe a method to identify whether a tweet related to COVID-19 is informative or not, which can help to grasp new information. The key features of our method are its use of graph attention networks to encode syntactic dependencies and word positions in the sentence, and a loss function based on connectionist temporal classification (CTC) that can learn a label for each token without reference data for each token. Experimental results show that the proposed method achieved an F1 score of 0.9175, outperforming baseline methods. 
+ 2020.wnut-1.43 + 2020.wnut-1.43.OptionalSupplementaryMaterial.pdf + + + <fixed-case>NLP</fixed-case> North at <fixed-case>WNUT</fixed-case>-2020 Task 2: Pre-training versus Ensembling for Detection of Informative <fixed-case>COVID</fixed-case>-19 <fixed-case>E</fixed-case>nglish Tweets + AndersGiovanni Møller + Robvan der Goot + BarbaraPlank + 331–336 + With the COVID-19 pandemic raging world-wide since the beginning of the 2020 decade, the need for monitoring systems to track relevant information on social media is vitally important. This paper describes our submission to the WNUT-2020 Task 2: Identification of informative COVID-19 English Tweets. We investigate the effectiveness for a variety of classification models, and found that domain-specific pre-trained BERT models lead to the best performance. On top of this, we attempt a variety of ensembling strategies, but these attempts did not lead to further improvements. Our final best model, the standalone CT-BERT model, proved to be highly competitive, leading to a shared first place in the shared task. Our results emphasize the importance of domain and task-related pre-training. + 2020.wnut-1.44 + + + Siva at <fixed-case>WNUT</fixed-case>-2020 Task 2: Fine-tuning Transformer Neural Networks for Identification of Informative Covid-19 Tweets + SivaSai + 337–341 + Social media witnessed vast amounts of misinformation being circulated every day during the Covid-19 pandemic so much so that the WHO Director-General termed the phenomenon as “infodemic.” The ill-effects of such misinformation are multifarious. Thus, identifying and eliminating the sources of misinformation becomes very crucial, especially when mass panic can be controlled only through the right information. However, manual identification is arduous, with such large amounts of data being generated every day. This shows the importance of automatic identification of misinformative posts on social media. 
WNUT-2020 Task 2 aims at building systems for automatic identification of informative tweets. In this paper, I discuss my approach to WNUT-2020 Task 2. I fine-tuned eleven variants of four transformer networks -BERT, RoBERTa, XLM-RoBERTa, ELECTRA, on top of two different preprocessing techniques to reap good results. My top submission achieved an F1-score of 85.3% in the final evaluation. + 2020.wnut-1.45 + + + <fixed-case>IIITBH</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2: Exploiting the best of both worlds + SaichethanReddy + PradeepBiswal + 342–346 + In this paper, we present IIITBH team’s effort to solve the second shared task of the 6th Workshop on Noisy User-generated Text (W-NUT)i.e Identification of informative COVID-19 English Tweets. The central theme of the task is to develop a system that automatically identify whether an English Tweet related to the novel coronavirus (COVID-19) is Informative or not. Our approach is based on exploiting semantic information from both max pooling and average pooling, to this end we propose two models. + 2020.wnut-1.46 + + + Phonemer at <fixed-case>WNUT</fixed-case>-2020 Task 2: Sequence Classification Using <fixed-case>COVID</fixed-case> <fixed-case>T</fixed-case>witter <fixed-case>BERT</fixed-case> and Bagging Ensemble Technique based on Plurality Voting + AnshulWadhawan + 347–351 + This paper presents the approach that we employed to tackle the EMNLP WNUT-2020 Shared Task 2 : Identification of informative COVID-19 English Tweets. The task is to develop a system that automatically identifies whether an English Tweet related to the novel coronavirus (COVID-19) is informative or not. We solve the task in three stages. The first stage involves pre-processing the dataset by filtering only relevant information. This is followed by experimenting with multiple deep learning models like CNNs, RNNs and Transformer based models. 
In the last stage, we propose an ensemble of the best model trained on different subsets of the provided dataset. Our final approach achieved an F1-score of 0.9037 and we were ranked sixth overall with F1-score as the evaluation criteria. + 2020.wnut-1.47 + + + <fixed-case>CXP</fixed-case>949 at <fixed-case>WNUT</fixed-case>-2020 Task 2: Extracting Informative <fixed-case>COVID</fixed-case>-19 Tweets - <fixed-case>R</fixed-case>o<fixed-case>BERT</fixed-case>a Ensembles and The Continued Relevance of Handcrafted Features + CalumPerrio + HarishTayyar Madabushi + 352–358 + This paper presents our submission to Task 2 of the Workshop on Noisy User-generated Text. We explore improving the performance of a pre-trained transformer-based language model fine-tuned for text classification through an ensemble implementation that makes use of corpus level information and a handcrafted feature. We test the effectiveness of including the aforementioned features in accommodating the challenges of a noisy data set centred on a specific subject outside the remit of the pre-training data. We show that inclusion of additional features can improve classification results and achieve a score within 2 points of the top performing team. + 2020.wnut-1.48 + + + <fixed-case>I</fixed-case>nfo<fixed-case>M</fixed-case>iner at <fixed-case>WNUT</fixed-case>-2020 Task 2: Transformer-based Covid-19 Informative Tweet Extraction + HansiHettiarachchi + TharinduRanasinghe + 359–365 + Identifying informative tweets is an important step when building information extraction systems based on social media. WNUT-2020 Task 2 was organised to recognise informative tweets from noise tweets. In this paper, we present our approach to tackle the task objective using transformers. Overall, our approach achieves 10th place in the final rankings scoring 0.9004 F1 score for the test set. 
+ 2020.wnut-1.49 + + + <fixed-case>BANANA</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2: Identifying <fixed-case>COVID</fixed-case>-19 Information on <fixed-case>T</fixed-case>witter by Combining Deep Learning and Transfer Learning Models + TinHuynh + LuanThanh Luan + Son T.Luu + 366–370 + The outbreak COVID-19 virus caused a significant impact on the health of people all over the world. Therefore, it is essential to have a piece of constant and accurate information about the disease with everyone. This paper describes our prediction system for WNUT-2020 Task 2: Identification of Informative COVID-19 English Tweets. The dataset for this task contains size 10,000 tweets in English labeled by humans. The ensemble model from our three transformer and deep learning models is used for the final prediction. The experimental result indicates that we have achieved F1 for the INFORMATIVE label on our systems at 88.81% on the test set. + 2020.wnut-1.50 + + + <fixed-case>DATAMAFIA</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 <fixed-case>T</fixed-case>ask 2: <fixed-case>A</fixed-case> <fixed-case>S</fixed-case>tudy of <fixed-case>P</fixed-case>re-trained <fixed-case>L</fixed-case>anguage <fixed-case>M</fixed-case>odels along with <fixed-case>R</fixed-case>egularization <fixed-case>T</fixed-case>echniques for <fixed-case>D</fixed-case>ownstream <fixed-case>T</fixed-case>asks + AyanSengupta + 371–377 + This document describes the system description developed by team datamafia at WNUT-2020 Task 2: Identification of informative COVID-19 English Tweets. This paper contains a thorough study of pre-trained language models on downstream binary classification task over noisy user generated Twitter data. The solution submitted to final test leaderboard is a fine tuned RoBERTa model which achieves F1 score of 90.8% and 89.4% on the dev and test data respectively. 
In the later part, we explore several techniques for injecting regularization explicitly into language models to generalize predictions over noisy data. Our experiments show that adding regularizations to RoBERTa pre-trained model can be very robust to data and annotation noises and can improve overall performance by more than 1.2%. + 2020.wnut-1.51 + 2020.wnut-1.51.OptionalSupplementaryMaterial.zip + + + <fixed-case>UP</fixed-case>enn<fixed-case>HLP</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2 : Transformer models for classification of <fixed-case>COVID</fixed-case>19 posts on <fixed-case>T</fixed-case>witter + ArjunMagge + VaradPimpalkhute + DivyaRallapalli + DavidSiguenza + GracielaGonzalez-Hernandez + 378–382 + Increasing usage of social media presents new non-traditional avenues for monitoring disease outbreaks, virus transmissions and disease progressions through user posts describing test results or disease symptoms. However, the discussions on the topic of infectious diseases that are informative in nature also span various topics such as news, politics and humor which makes the data mining challenging. We present a system to identify tweets about the COVID19 disease outbreak that are deemed to be informative on Twitter for use in downstream applications. The system scored a F1-score of 0.8941, Precision of 0.9028, Recall of 0.8856 and Accuracy of 0.9010. In the shared task organized as part of the 6th Workshop of Noisy User-generated Text (WNUT), the system was ranked 18th by F1-score and 13th by Accuracy. 
+ 2020.wnut-1.52 + + + <fixed-case>UIT</fixed-case>-<fixed-case>HSE</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2: Exploiting <fixed-case>CT</fixed-case>-<fixed-case>BERT</fixed-case> for Identifying <fixed-case>COVID</fixed-case>-19 Information on the <fixed-case>T</fixed-case>witter Social Network + KhiemTran + HaoPhan + KietNguyen + Ngan LuuThuy Nguyen + 383–387 + Recently, COVID-19 has affected a variety of real-life aspects of the world and led to dreadful consequences. More and more tweets about COVID-19 has been shared publicly on Twitter. However, the plurality of those Tweets are uninformative, which is challenging to build automatic systems to detect the informative ones for useful AI applications. In this paper, we present our results at the W-NUT 2020 Shared Task 2: Identification of Informative COVID-19 English Tweets. In particular, we propose our simple but effective approach using the transformer-based models based on COVID-Twitter-BERT (CT-BERT) with different fine-tuning techniques. As a result, we achieve the F1-Score of 90.94% with the third place on the leaderboard of this task which attracted 56 submitted teams in total. + 2020.wnut-1.53 + + + Emory at <fixed-case>WNUT</fixed-case>-2020 Task 2: Combining Pretrained Deep Learning Models and Feature Enrichment for Informative Tweet Identification + YutingGuo + MohammedAli Al-Garadi + AbeedSarker + 388–393 + This paper describes the system developed by the Emory team for the WNUT-2020 Task 2: “Identifi- cation of Informative COVID-19 English Tweet”. Our system explores three recent Transformer- based deep learning models pretrained on large- scale data to encode documents. Moreover, we developed two feature enrichment methods to en- hance document embeddings by integrating emoji embeddings and syntactic features into deep learn- ing models. Our system achieved F1-score of 0.897 and accuracy of 90.1% on the test set, and ranked in the top-third of all 55 teams. 
+ 2020.wnut-1.54 + + + <fixed-case>CSECU</fixed-case>-<fixed-case>DSG</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2: Exploiting Ensemble of Transfer Learning and Hand-crafted Features for Identification of Informative <fixed-case>COVID</fixed-case>-19 <fixed-case>E</fixed-case>nglish Tweets + FareenTasneem + JannatunNaim + RadiathunTasnia + TashinHossain + Abu NowshedChy + 394–398 + COVID-19 pandemic has become the trending topic on twitter and people are interested in sharing diverse information ranging from new cases, healthcare guidelines, medicine, and vaccine news. Such information assists the people to be updated about the situation as well as beneficial for public safety personnel for decision making. However, the informal nature of twitter makes it challenging to refine the informative tweets from the huge tweet streams. To address these challenges WNUT-2020 introduced a shared task focusing on COVID-19 related informative tweet identification. In this paper, we describe our participation in this task. We propose a neural model that adopts the strength of transfer learning and hand-crafted features in a unified architecture. To extract the transfer learning features, we utilize the state-of-the-art pre-trained sentence embedding model BERT, RoBERTa, and InferSent, whereas various twitter characteristics are exploited to extract the hand-crafted features. Next, various feature combinations are utilized to train a set of multilayer perceptron (MLP) as the base-classifier. Finally, a majority voting based fusion approach is employed to determine the informative tweets. Our approach achieved competitive performance and outperformed the baseline by 7% (approx.). 
+ 2020.wnut-1.55 + + + <fixed-case>IRL</fixed-case>ab@<fixed-case>IITBHU</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2: Identification of informative <fixed-case>COVID</fixed-case>-19 <fixed-case>E</fixed-case>nglish Tweets using <fixed-case>BERT</fixed-case> + SupriyaChanda + EshitaNandy + SukomalPal + 399–403 + This paper reports our submission to the shared Task 2: Identification of informative COVID-19 English tweets at W-NUT 2020. We attempted a few techniques, and we briefly explain here two models that showed promising results in tweet classification tasks: DistilBERT and FastText. DistilBERT achieves a F1 score of 0.7508 on the test set, which is the best of our submissions. + 2020.wnut-1.56 + + + <fixed-case>N</fixed-case>ut<fixed-case>C</fixed-case>racker at <fixed-case>WNUT</fixed-case>-2020 Task 2: Robustly Identifying Informative <fixed-case>COVID</fixed-case>-19 Tweets using Ensembling and Adversarial Training + PriyanshuKumar + AadarshSingh + 404–408 + We experiment with COVID-Twitter-BERT and RoBERTa models to identify informative COVID-19 tweets. We further experiment with adversarial training to make our models robust. The ensemble of COVID-Twitter-BERT and RoBERTa obtains a F1-score of 0.9096 (on the positive class) on the test data of WNUT-2020 Task 2 and ranks 1st on the leaderboard. The ensemble of the models trained using adversarial training also produces similar result. + 2020.wnut-1.57 + + + <fixed-case>DSC</fixed-case>-<fixed-case>IIT</fixed-case> <fixed-case>ISM</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2: Detection of <fixed-case>COVID</fixed-case>-19 informative tweets using <fixed-case>R</fixed-case>o<fixed-case>BERT</fixed-case>a + SirigireddyDhana Laxmi + RohitAgarwal + AmanSinha + 409–413 + Social media such as Twitter is a hotspot of user-generated information. 
In this ongoing Covid-19 pandemic, there has been an abundance of data on social media which can be classified as informative and uninformative content. In this paper, we present our work to detect informative Covid-19 English tweets using RoBERTa model as a part of the W-NUT workshop 2020. We show the efficacy of our model on a public dataset with an F1-score of 0.89 on the validation dataset and 0.87 on the leaderboard. + 2020.wnut-1.58 + + + Linguist Geeks on <fixed-case>WNUT</fixed-case>-2020 Task 2: <fixed-case>COVID</fixed-case>-19 Informative Tweet Identification using Progressive Trained Language Models and Data Augmentation + VasudevAwatramani + AnupamKumar + 414–418 + Since the outbreak of COVID-19, there has been a surge of digital content on social media. The content ranges from news articles, academic reports, tweets, videos, and even memes. Among such an overabundance of data, it is crucial to distinguish which information is actually informative or merely sensational, redundant or false. This work focuses on developing such a language system that can differentiate between Informative or Uninformative tweets associated with COVID-19 for WNUT-2020 Shared Task 2. For this purpose, we employ deep transfer learning models such as BERT along other techniques such as Noisy Data Augmentation and Progress Training. The approach achieves a competitive F1-score of 0.8715 on the final testing dataset. + 2020.wnut-1.59 + + + <fixed-case>NLPRL</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2: <fixed-case>ELM</fixed-case>o-based System for Identification of <fixed-case>COVID</fixed-case>-19 Tweets + Rajesh KumarMundotiya + RupjyotiBaruah + BhavanaSrivastava + Anil KumarSingh + 419–422 + The Coronavirus pandemic has been a dominating news on social media for the last many months. Efforts are being made to reduce its spread and reduce the casualties as well as new infections. 
For this purpose, the information about the infected people and their related symptoms, as available on social media, such as Twitter, can help in prevention and taking precautions. This is an example of using noisy text processing for disaster management. This paper discusses the NLPRL results in Shared Task-2 of WNUT-2020 workshop. We have considered this problem as a binary classification problem and have used a pre-trained ELMo embedding with GRU units. This approach helps classify the tweets with accuracy as 80.85% and 78.54% as F1-score on the provided test dataset. The experimental code is available online. + 2020.wnut-1.60 + + + <fixed-case>SU</fixed-case>-<fixed-case>NLP</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2: The Ensemble Models + KenanFayoumi + ReyyanYeniterzi + 423–427 + In this paper, we address the problem of identifying informative tweets related to COVID-19 in the form of a binary classification task as part of our submission for W-NUT 2020 Task 2. Specifically, we focus on ensembling methods to boost the classification performance of classification models such as BERT and CNN. We show that ensembling can reduce the variance in performance, specifically for BERT base models. + 2020.wnut-1.61 + + + <fixed-case>IDSOU</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2: Identification of Informative <fixed-case>COVID</fixed-case>-19 <fixed-case>E</fixed-case>nglish Tweets + SoraOhashi + TomoyukiKajiwara + ChenhuiChu + NorikoTakemura + YutaNakashima + HajimeNagahara + 428–433 + We introduce the IDSOU submission for the WNUT-2020 task 2: identification of informative COVID-19 English Tweets. Our system is an ensemble of pre-trained language models such as BERT. We ranked 16th in the F1 score. 
+ 2020.wnut-1.62 + + + <fixed-case>C</fixed-case>omplex<fixed-case>D</fixed-case>ata<fixed-case>L</fixed-case>ab at <fixed-case>W</fixed-case>-<fixed-case>NUT</fixed-case> 2020 Task 2: Detecting Informative <fixed-case>COVID</fixed-case>-19 Tweets by Attending over Linked Documents + KellinPelrine + JacobDanovitch + Albert OrozcoCamacho + ReihanehRabbany + 434–439 + Given the global scale of COVID-19 and the flood of social media content related to it, how can we find informative discussions? We present Gapformer, which effectively classifies content as informative or not. It reformulates the problem as graph classification, drawing on not only the tweet but connected webpages and entities. We leverage a pre-trained language model as well as the connections between nodes to learn a pooled representation for each document network. We show it outperforms several competitive baselines and present ablation studies supporting the benefit of the linked information. Code is available on Github. + 2020.wnut-1.63 + + + <fixed-case>NEU</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2: Data Augmentation To Tell <fixed-case>BERT</fixed-case> That Death Is Not Necessarily Informative + KumudChauhan + 440–443 + Millions of people around the world are sharing COVID-19 related information on social media platforms. Since not all the information shared on the social media is useful, a machine learning system to identify informative posts can help users in finding relevant information. In this paper, we present a BERT classifier system for W-NUT2020 Shared Task 2: Identification of Informative COVID-19 English Tweets. Further, we show that BERT exploits some easy signals to identify informative tweets, and adding simple patterns to uninformative tweets drastically degrades BERT performance. In particular, simply adding “10 deaths” to tweets in dev set, reduces BERT F1- score from 92.63 to 7.28. 
We also propose a simple data augmentation technique that helps in improving the robustness and generalization ability of the BERT classifier. + 2020.wnut-1.64 + + + <fixed-case>L</fixed-case>ynyrd<fixed-case>S</fixed-case>kynyrd at <fixed-case>WNUT</fixed-case>-2020 Task 2: Semi-Supervised Learning for Identification of Informative <fixed-case>COVID</fixed-case>-19 <fixed-case>E</fixed-case>nglish Tweets + AbhilashaSancheti + KushalChawla + GauravVerma + 444–449 + In this work, we describe our system for WNUT-2020 shared task on the identification of informative COVID-19 English tweets. Our system is an ensemble of various machine learning methods, leveraging both traditional feature-based classifiers as well as recent advances in pre-trained language models that help in capturing the syntactic, semantic, and contextual features from the tweets. We further employ pseudo-labelling to incorporate the unlabelled Twitter data released on the pandemic. Our best performing model achieves an F1-score of 0.9179 on the provided validation set and 0.8805 on the blind test-set. + 2020.wnut-1.65 + + + <fixed-case>NIT</fixed-case>_<fixed-case>COVID</fixed-case>-19 at <fixed-case>WNUT</fixed-case>-2020 Task 2: Deep Learning Model <fixed-case>R</fixed-case>o<fixed-case>BERT</fixed-case>a for Identify Informative <fixed-case>COVID</fixed-case>-19 <fixed-case>E</fixed-case>nglish Tweets + JagadeeshM S + AlphonseP J A + 450–454 + This paper presents the model submitted by NIT COVID-19 team for identified informative COVID-19 English tweets at WNUT-2020 Task2. This shared task addresses the problem of automatically identifying whether an English tweet related to informative (novel coronavirus) or not. These informative tweets provide information about recovered, confirmed, suspected, and death cases as well as location or travel history of the cases. 
The proposed approach includes pre-processing techniques and pre-trained RoBERTa with suitable hyperparameters for English coronavirus tweet classification. The performance achieved by the proposed model for shared task WNUT 2020 Task2 is 89.14% in the F1-score metric. + 2020.wnut-1.66 + + + <fixed-case>E</fixed-case>dinburgh<fixed-case>NLP</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2: Leveraging Transformers with Generalized Augmentation for Identifying Informativeness in <fixed-case>COVID</fixed-case>-19 Tweets + NickilMaveli + 455–461 + Twitter has become an important communication channel in times of emergency. The ubiquitousness of smartphones enables people to announce an emergency they’re observing in real-time. Because of this, more agencies are interested in programatically monitoring Twitter (disaster relief organizations and news agencies) and therefore recognizing the informativeness of a tweet can help filter noise from large volumes of data. In this paper, we present our submission for WNUT-2020 Task 2: Identification of informative COVID-19 English Tweets. Our most successful model is an ensemble of transformers including RoBERTa, XLNet, and BERTweet trained in a Semi-Supervised Learning (SSL) setting. The proposed system achieves a F1 score of 0.9011 on the test set (ranking 7th on the leaderboard), and shows significant gains in performance compared to a baseline system using fasttext embeddings. + 2020.wnut-1.67 + + + #<fixed-case>GCDH</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2: <fixed-case>BERT</fixed-case>-Based Models for the Detection of Informativeness in <fixed-case>E</fixed-case>nglish <fixed-case>COVID</fixed-case>-19 Related Tweets + HannaVarachkina + StefanZiehe + TillmannDönicke + FranziskaPannach + 462–465 + In this system paper, we present a transformer-based approach to the detection of informativeness in English tweets on the topic of the current COVID-19 pandemic. 
Our models distinguish informative tweets, i.e. tweets containing statistics on recovery, suspected and confirmed cases and COVID-19 related deaths, from uninformative tweets. We present two transformer-based approaches as well as a Naive Bayes classifier and a support vector machine as baseline systems. The transformer models outperform the baselines by more than 0.1 in F1-score, with F1-scores of 0.9091 and 0.9036. Our models were submitted to the shared task Identification of informative COVID-19 English tweets WNUT-2020 Task 2. + 2020.wnut-1.68 + + + Not-<fixed-case>NUT</fixed-case>s at <fixed-case>WNUT</fixed-case>-2020 Task 2: A <fixed-case>BERT</fixed-case>-based System in Identifying Informative <fixed-case>COVID</fixed-case>-19 <fixed-case>E</fixed-case>nglish Tweets + ThaiHoang + PhuongVu + 466–470 + As of 2020 when the COVID-19 pandemic is full-blown on a global scale, people’s need to have access to legitimate information regarding COVID-19 is more urgent than ever, especially via online media where the abundance of irrelevant information overshadows the more informative ones. In response to such, we proposed a model that, given an English tweet, automatically identifies whether that tweet bears informative content regarding COVID-19 or not. By ensembling different BERTweet model configurations, we have achieved competitive results that are only shy of those by top performing teams by roughly 1% in terms of F1 score on the informative class. In the post-competition period, we have also experimented with various other approaches that potentially boost generalization to a new dataset. + 2020.wnut-1.69 + + + <fixed-case>CIA</fixed-case>_<fixed-case>NITT</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2: Classification of <fixed-case>COVID</fixed-case>-19 Tweets Using Pre-trained Language Models + YandrapatiPrakash Babu + RajagopalEswari + 471–474 + This paper presents our models for WNUT2020 shared task2. 
The shared task2 involves identification of COVID-19 related informative tweets. We treat this as binary text clas-sification problem and experiment with pre-trained language models. Our first model which is based on CT-BERT achieves F1-scoreof 88.7% and second model which is an ensemble of CT-BERT, RoBERTa and SVM achieves F1-score of 88.52%. + 2020.wnut-1.70 + + + <fixed-case>UET</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2: A Study of Combining Transfer Learning Methods for Text Classification with <fixed-case>R</fixed-case>o<fixed-case>BERT</fixed-case>a + HuyDao Quang + TamNguyen Minh + 475–479 + This paper reports our approach and the results of our experiments for W-NUT task 2: Identification of Informative COVID-19 English Tweets. In this paper, we test out the effectiveness of transfer learning method with state of the art language models as RoBERTa on this text classification task. Moreover, we examine the benefit of applying additional fine-tuning and training techniques including fine-tuning discrimination, gradual unfreezing as well as our custom head for the classifier. Our best model results in a high F1-score of 89.89 on the task’s private test dataset and that of 90.96 on public test set without ensembling multiple models and additional data. + 2020.wnut-1.71 + + + Dartmouth <fixed-case>CS</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2: Fine tuning <fixed-case>BERT</fixed-case> for Tweet classification + DylanWhang + SoroushVosoughi + 480–484 + We describe the systems developed for the WNUT-2020 shared task 2, identification of informative COVID-19 English Tweets. BERT is a highly performant model for Natural Language Processing tasks. We increased BERT’s performance in this classification task by fine-tuning BERT and concatenating its embeddings with Tweet-specific features and training a Support Vector Machine (SVM) for classification (henceforth called BERT+). 
We compared its performance to a suite of machine learning models. We used a Twitter specific data cleaning pipeline and word-level TF-IDF to extract features for the non-BERT models. BERT+ was the top performing model with an F1-score of 0.8713. + 2020.wnut-1.72 + + + <fixed-case>S</fixed-case>un<fixed-case>B</fixed-case>ear at <fixed-case>WNUT</fixed-case>-2020 Task 2: Improving <fixed-case>BERT</fixed-case>-Based Noisy Text Classification with Knowledge of the Data domain + LinhDoan Bao + Viet AnhNguyen + QuangPham Huu + 485–490 + This paper proposes an improved custom model for WNUT task 2: Identification of Informative COVID-19 English Tweet. We improve experiment with the effectiveness of fine-tuning methodologies for state-of-the-art language model RoBERTa. We make a preliminary instantiation of this formal model for the text classification approaches. With appropriate training techniques, our model is able to achieve 0.9218 F1-score on public validation set and the ensemble version settles at top 9 F1-score (0.9005) and top 2 Recall (0.9301) on private test set. + 2020.wnut-1.73 + + + <fixed-case>ISWARA</fixed-case> at <fixed-case>WNUT</fixed-case>-2020 Task 2: Identification of Informative <fixed-case>COVID</fixed-case>-19 <fixed-case>E</fixed-case>nglish Tweets using <fixed-case>BERT</fixed-case> and <fixed-case>F</fixed-case>ast<fixed-case>T</fixed-case>ext Embeddings + Wava CarissaPutri + Rani AuliaHidayat + Isnaini NurulKhasanah + RahmadMahendra + 491–494 + This paper presents Iswara’s participation in the WNUT-2020 Task 2 “Identification of Informative COVID-19 English Tweets using BERT and FastText Embeddings”,which tries to classify whether a certain tweet is considered informative or not. We proposed a method that utilizes word embeddings and using word occurrence related to the topic for this task. We compare several models to get the best performance. 
Results show that pairing BERT with word occurrences outperforms fastText with F1-Score, precision, recall, and accuracy on test data of 76%, 81%, 72%, and 79%, respectively + 2020.wnut-1.74 + + + <fixed-case>COVCOR</fixed-case>20 at <fixed-case>WNUT</fixed-case>-2020 Task 2: An Attempt to Combine Deep Learning and Expert rules + AliHürriyetoğlu + AliSafaya + OsmanMutlu + NellekeOostdijk + ErdemYörük + 495–498 + In the scope of WNUT-2020 Task 2, we developed various text classification systems, using deep learning models and one using linguistically informed rules. While both of the deep learning systems outperformed the system using the linguistically informed rules, we found that through the integration of (the output of) the three systems a better performance could be achieved than the standalone performance of each approach in a cross-validation setting. However, on the test data the performance of the integration was slightly lower than our best performing deep learning model. These results hardly indicate any progress in line of integrating machine learning and expert rules driven systems. We expect that the release of the annotation manuals and gold labels of the test data after this workshop will shed light on these perplexing results. + 2020.wnut-1.75 + + + <fixed-case>TEST</fixed-case>_<fixed-case>POSITIVE</fixed-case> at <fixed-case>W</fixed-case>-<fixed-case>NUT</fixed-case> 2020 Shared Task-3: Cross-task modeling + ChachaChen + Chieh-YangHuang + YaqiHou + YangShi + EnyanDai + JiaqiWang + 499–504 + The competition of extracting COVID-19 events from Twitter is to develop systems that can automatically extract related events from tweets. The built system should identify different pre-defined slots for each event, in order to answer important questions (e.g., Who is tested positive? What is the age of the person? Where is he/she?). To tackle these challenges, we propose the Joint Event Multi-task Learning (JOELIN) model. 
Through a unified global learning framework, we make use of all the training data across different events to learn and fine-tune the language model. Moreover, we implement a type-aware post-processing procedure using named entity recognition (NER) to further filter the predictions. JOELIN outperforms the BERT baseline by 17.2% in micro F1. + 2020.wnut-1.76 + + + imec-<fixed-case>ETRO</fixed-case>-<fixed-case>VUB</fixed-case> at <fixed-case>W</fixed-case>-<fixed-case>NUT</fixed-case> 2020 Shared Task-3: A multilabel <fixed-case>BERT</fixed-case>-based system for predicting <fixed-case>COVID</fixed-case>-19 events + XiangyuYang + GiannisBekoulis + NikosDeligiannis + 505–513 + In this paper, we present our system designed to address the W-NUT 2020 shared task for COVID-19 Event Extraction from Twitter. To mitigate the noisy nature of the Twitter stream, our system makes use of the COVID-Twitter-BERT (CT-BERT), which is a language model pre-trained on a large corpus of COVID-19 related Twitter messages. Our system is trained on the COVID-19 Twitter Event Corpus and is able to identify relevant text spans that answer pre-defined questions (i.e., slot types) for five COVID-19 related events (i.e., TESTED POSITIVE, TESTED NEGATIVE, CAN-NOT-TEST, DEATH and CURE & PREVENTION). We have experimented with different architectures; our best performing model relies on a multilabel classifier on top of the CT-BERT model that jointly trains all the slot types for a single event. Our experimental results indicate that our Multilabel-CT-BERT system outperforms the baseline methods by 7 percentage points in terms of micro average F1 score. Our model ranked as 4th in the shared task leaderboard. 
+ 2020.wnut-1.77 + + + <fixed-case>UCD</fixed-case>-<fixed-case>CS</fixed-case> at <fixed-case>W</fixed-case>-<fixed-case>NUT</fixed-case> 2020 Shared Task-3: A Text to Text Approach for <fixed-case>COVID</fixed-case>-19 Event Extraction on Social Media + CongcongWang + DavidLillis + 514–521 + In this paper, we describe our approach in the shared task: COVID-19 event extraction from Twitter. The objective of this task is to extract answers from COVID-related tweets to a set of predefined slot-filling questions. Our approach treats the event extraction task as a question answering task by leveraging the transformer-based T5 text-to-text model. According to the official evaluation scores returned, namely F1, our submitted run achieves competitive performance compared to other participating runs (Top 3). However, we argue that this evaluation may underestimate the actual performance of runs based on text-generation. Although some such runs may answer the slot questions well, they may not be an exact string match for the gold standard answers. To measure the extent of this underestimation, we adopt a simple exact-answer transformation method aiming at converting the well-answered predictions to exactly-matched predictions. The results show that after this transformation our run overall reaches the same level of performance as the best participating run and state-of-the-art F1 scores in three of five COVID-related events. Our code is publicly available to aid reproducibility + 2020.wnut-1.78 + + + Winners at <fixed-case>W</fixed-case>-<fixed-case>NUT</fixed-case> 2020 Shared Task-3: Leveraging Event Specific and Chunk Span information for Extracting <fixed-case>COVID</fixed-case> Entities from Tweets + AyushKaushal + TejasVaidhya + 522–529 + Twitter has acted as an important source of information during disasters and pandemic, especially during the times of COVID-19. In this paper, we describe our system entry for WNUT 2020 Shared Task-3. 
The task was aimed at automating the extraction of a variety of COVID-19 related events from Twitter, such as individuals who recently contracted the virus, someone with symptoms who were denied testing and believed remedies against the infection. The system consists of separate multi-task models for slot-filling subtasks and sentence-classification subtasks, while leveraging the useful sentence-level information for the corresponding event. The system uses COVID-Twitter-BERT with attention-weighted pooling of candidate slot-chunk features to capture the useful information chunks. The system ranks 1st at the leaderboard with F1 of 0.6598, without using any ensembles or additional datasets. + 2020.wnut-1.79 + + + <fixed-case>HLTRI</fixed-case> at <fixed-case>W</fixed-case>-<fixed-case>NUT</fixed-case> 2020 Shared Task-3: <fixed-case>COVID</fixed-case>-19 Event Extraction from <fixed-case>T</fixed-case>witter Using Multi-Task Hopfield Pooling + MaxwellWeinzierl + SandaHarabagiu + 530–538 + Extracting structured knowledge involving self-reported events related to the COVID-19 pandemic from Twitter has the potential to inform surveillance systems that play a critical role in public health. The event extraction challenge presented by the W-NUT 2020 Shared Task 3 focused on the identification of five types of events relevant to the COVID-19 pandemic and their respective set of pre-defined slots encoding demographic, epidemiological, clinical as well as spatial, temporal or subjective knowledge. Our participation in the challenge led to the design of a neural architecture for jointly identifying all Event Slots expressed in a tweet relevant to an event of interest. This architecture uses COVID-Twitter-BERT as the pre-trained language model. In addition, to learn text span embeddings for each Event Slot, we relied on a special case of Hopfield Networks, namely Hopfield pooling. 
The results of the shared task evaluation indicate that our system performs best when it is trained on a larger dataset, while it remains competitive when training on smaller datasets. + 2020.wnut-1.80 + +
+
diff --git a/data/xml/C88.xml b/data/xml/C88.xml index 6c029e8f76..e4e0f4cf41 100644 --- a/data/xml/C88.xml +++ b/data/xml/C88.xml @@ -59,7 +59,7 @@ Some Problems of Machine Translation Between Closely Related Languages AlevtinaBemova - karelOli̊va + KarelOli̊va JarmilaPanevová C88-1010 @@ -160,7 +160,7 @@ A Binding Rule for <fixed-case>G</fixed-case>overnment-binding Parsing NelsonCorrea - NelsonCORREA + NelsonCorrea C88-1026 @@ -612,12 +612,12 @@ Syntactic Functions in <fixed-case>GPSG</fixed-case> - karelOli̊va + KarelOli̊va C88-2104 List Automata With Syntactically Structured Output - karelOli̊va + KarelOli̊va MartinPlatek C88-2105 diff --git a/data/xml/C94.xml b/data/xml/C94.xml index 4f45381e99..a3c6c4cd7d 100644 --- a/data/xml/C94.xml +++ b/data/xml/C94.xml @@ -825,7 +825,7 @@
<fixed-case>HPSG</fixed-case> Lexicon Without Lexical Rules - kareloliva + KarelOliva C94-2131 diff --git a/data/xml/D17.xml b/data/xml/D17.xml index 57b794f806..4cdc929685 100644 --- a/data/xml/D17.xml +++ b/data/xml/D17.xml @@ -1922,7 +1922,7 @@ and the code is available at https://github.com/qizhex/RACE_AR_baselines Neural Discontinuous Constituency Parsing Miloš Stanojević - Raquel G. Alhama + Raquel G. Alhama 1666–1676 D17-1174 10.18653/v1/D17-1174 diff --git a/data/xml/D19.xml b/data/xml/D19.xml index 3919b3cc37..62f76a3faf 100644 --- a/data/xml/D19.xml +++ b/data/xml/D19.xml @@ -8106,7 +8106,7 @@ A plethora of methods have been proposed to emphasize specific lexico-semantic r LinPan MichaelGlass VittorioCastelli - J WilliamMurdock + J. WilliamMurdock RaduFlorian SalimRoukos AviSil diff --git a/data/xml/K18.xml b/data/xml/K18.xml index 8172b07d7b..39140b1605 100644 --- a/data/xml/K18.xml +++ b/data/xml/K18.xml @@ -426,7 +426,7 @@ Unsupervised Sentence Compression using Denoising Auto-Encoders - ThibaultFevry + ThibaultFévry JasonPhang 413–422 K18-1040 diff --git a/data/xml/R19.xml b/data/xml/R19.xml index 7a59ccc853..b956fea572 100644 --- a/data/xml/R19.xml +++ b/data/xml/R19.xml @@ -456,10 +456,10 @@ Assessing socioeconomic status of <fixed-case>T</fixed-case>witter users: A survey - DhouhaGHAZOUANI - LuigiLANCIERI - HabibOUNELLI - ChakerJEBARI + DhouhaGhazouani + LuigiLancieri + HabibOunelli + ChakerJebari 388–398 Every day, the emotion and opinion of different people across the world are reflected in the form of short messages using microblogging platforms. Despite the existence of enormous potential introduced by this data source, the Twitter community is still ambiguous and is not fully explored yet. While there are a huge number of studies examining the possibilities of inferring gender and age, there exist hardly researches on socioeconomic status (SES) inference of Twitter users. 
As socioeconomic status is essential to treating diverse questions linked to human behavior in several fields (sociology, demography, public health, etc.), we conducted a comprehensive literature review of SES studies, inference methods, and metrics. With reference to the research on literature’s results, we came to outline the most critical challenges for researchers. To the best of our knowledge, this paper is the first review that introduces the different aspects of SES inference. Indeed, this article provides the benefits for practitioners who aim to process and explore Twitter SES inference. R19-1046 @@ -830,10 +830,10 @@ Semantic Language Model for <fixed-case>T</fixed-case>unisian Dialect - AbirMASMOUDI + AbirMasmoudi RimLaatar - Mariemellouze - lamiahadrich belguith + MariemEllouze + LamiaHadrich Belguith 720–729 In this paper, we describe the process of creating a statistical Language Model (LM) for the Tunisian Dialect. Indeed, this work is part of the realization of Automatic Speech Recognition (ASR) system for the Tunisian Railway Transport Network. Since our eld of work has been limited, there are several words with similar behaviors (semantic for example) but they do not have the same appearance probability; their class groupings will therefore be possible. For these reasons, we propose to build an n-class LM that is based mainly on the integration of purely semantic data. Indeed, each class represents an abstraction of similar labels. In order to improve the sequence labeling task, we proposed to use a discriminative algorithm based on the Conditional Random Field (CRF) model. To better judge our choice of creating an n-class word model, we compared the created model with the 3-gram type model on the same test corpus of evaluation. 
Additionally, to assess the impact of using the CRF model to perform the semantic labelling task in order to construct semantic classes, we compared the n-class created model with using the CRF in the semantic labelling task and the n- class model without using the CRF in the semantic labelling task. The drawn comparison of the predictive power of the n-class model obtained by applying the CRF model in the semantic labelling is that it is better than the other two models presenting the highest value of its perplexity. R19-1084 @@ -1733,7 +1733,7 @@ Multilingual Complex Word Identification: Convolutional Neural Networks with Morphological and Linguistic Features - Kim ChengSHEANG + Kim ChengSheang 83–89 The paper is about our experiments with Complex Word Identification system using deep learning approach with word embeddings and engineered features. R19-2013 diff --git a/data/xml/W16.xml b/data/xml/W16.xml index 3a63c84b19..f51b01e02b 100644 --- a/data/xml/W16.xml +++ b/data/xml/W16.xml @@ -2841,7 +2841,7 @@ Generalization in Artificial Language Learning: Modelling the Propensity to Generalize - Raquel G. Alhama + Raquel G. Alhama Willem Zuidema 64–72 W16-1909 diff --git a/data/xml/W17.xml b/data/xml/W17.xml index 1621c884b3..22e7c76508 100644 --- a/data/xml/W17.xml +++ b/data/xml/W17.xml @@ -13030,7 +13030,7 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me “A pessimist sees the difficulty in every opportunity; an optimist sees the opportunity in every difficulty” – Understanding the psycho-sociological influences to it UpdendraKumar Vishal KumarRana - SrinivasPykl + SrinivasPYKL AmitavaDas 255–264 W17-7532 diff --git a/data/xml/W19.xml b/data/xml/W19.xml index 2c9ecf6fe0..2e159cbcef 100644 --- a/data/xml/W19.xml +++ b/data/xml/W19.xml @@ -2384,8 +2384,10 @@ RogerLevy 94–99 Word embeddings trained on large-scale historical corpora can illuminate human biases and stereotypes that perpetuate social inequalities. 
These embeddings are often trained in separate vector space models defined according to different attributes of interest. In this paper, we introduce a single, unified dynamic embedding model that learns attribute-specific word embeddings and apply it to a novel dataset—talk radio shows from around the US—to analyze perceptions about refugees. We validate our model on a benchmark dataset and apply it to two corpora of talk radio shows averaging 117 million words produced over one month across 83 stations and 64 cities. Our findings suggest that dynamic word embeddings are capable of identifying nuanced differences in public discourse about contentious topics, suggesting their usefulness as a tool for better understanding how the public perceives and engages with different issues across time, geography, and other dimensions. - W19-2111 + W19-2111 10.18653/v1/W19-2111 + + Corrects errors found in the original version of the paper; a new corrigendum at the end summarizes the original errors and how we corrected them. Modeling Behavioral Aspects of Social Media Discourse for Moral Classification @@ -16691,7 +16693,7 @@ In this tutorial on MT and post-editing we would like to continue sharing the la Towards a Proactive <fixed-case>MWE</fixed-case> Terminological Platform for Cross-Lingual Mediation in the Age of Big Data Benjamin K.Tsou KapoChow - JUNRUNie + JunruNie YuanYuan 116–121 The emergence of China as a global economic power in the 21st Century has brought about surging needs for cross-lingual and cross-cultural mediation, typically performed by translators. Advances in Artificial Intelligence and Language Engineering have been bolstered by Machine learning and suitable Big Data cultivation. They have helped to meet some of the translator’s needs, though the technical specialists have not kept pace with the practical and expanding requirements in language mediation. 
One major technical and linguistic hurdle involves words outside the vocabulary of the translator or the lexical database he/she consults, especially Multi-Word Expressions (Compound Words) in technical subjects. A further problem is in the multiplicity of renditions of a term in the target language. This paper discusses a proactive approach following the successful extraction and application of sizable bilingual Multi-Word Expressions (Compound Words) for language mediation in technical subjects, which do not fall within the expertise of typical translators, who have inadequate appreciation of the range of new technical tools available to help him/her. Our approach draws on the personal reflections of translators and teachers of translation and is based on the prior R&D efforts relating to 300,000 comparable Chinese-English patents. The subsequent protocol we have developed aims to be proactive in meeting four identified practical challenges in technical translation (e.g. patents). It has broader economic implication in the Age of Big Data (Tsou et al, 2015) and Trade War, as the workload, if not, the challenges, increasingly cannot be met by currently available front-line translators. We shall demonstrate how new tools can be harnessed to spearhead the application of language technology not only in language mediation but also in the “teaching” and “learning” of translation. It shows how a better appreciation of their needs may enhance the contributions of the technical specialists, and thus enhance the resultant synergetic benefits. 
diff --git a/data/yaml/joint.yaml b/data/yaml/joint.yaml index ce93cb4210..0f00994260 100644 --- a/data/yaml/joint.yaml +++ b/data/yaml/joint.yaml @@ -143,6 +143,8 @@ bionlp: - D19-57 birndl: - W16-15 +blackboxnlp: +- W19-48 bsnlp: - W13-24 - W15-53 @@ -331,6 +333,7 @@ eamt: - W15-49 - W16-34 emnlp: + 2020: [2020.conll-1, 2020.conll-shared, 2020.alw-1, 2020.blackboxnlp-1, 2020.clinicalnlp-1, 2020.cmcl-1, 2020.codi-1, 2020.deelio-1, 2020.eval4nlp-1, 2020.insights-1, 2020.intexsempar-1, 2020.louhi-1, 2020.nlpbt-1, 2020.nlpcovid19-1, 2020.nlpcss-1, 2020.nlposs-1, 2020.privatenlp-1, 2020.scai-1, 2020.sdp-1, 2020.sigtyp-1, 2020.splu-1, 2020.spnlp-1, 2020.sustainlp-1, 2020.wnut-1] 1996: - W96-02 1997: diff --git a/data/yaml/name_variants.yaml b/data/yaml/name_variants.yaml index d3e3352ee9..3f3b1f5dc0 100644 --- a/data/yaml/name_variants.yaml +++ b/data/yaml/name_variants.yaml @@ -1,3 +1,7 @@ +- canonical: {first: Pranav, last: A} + comment: UC Santa Cruz + id: pranav-a + similar: [pranav-anand] - canonical: {first: Balamurali, last: AR} variants: - {first: Balamurali, last: A.R.} @@ -194,14 +198,6 @@ - canonical: {first: Diego Raphael, last: Amancio} variants: - {first: Diego, last: Amancio} -- canonical: {first: Pranav, last: A} - id: pranav-a - similar: [pranav-anand] - comment: UC Santa Cruz -- canonical: {first: Pranav, last: Anand} - id: pranav-anand - similar: [pranav-a] - comment: Dayta AI - canonical: {first: Shin-ya, last: Amano} variants: - {first: Sin-ya, last: Amano} @@ -224,6 +220,10 @@ - canonical: {first: Marcelo Adriano, last: Amâncio} variants: - {first: Marcelo, last: Amancio} +- canonical: {first: Pranav, last: Anand} + comment: Dayta AI + id: pranav-anand + similar: [pranav-a] - canonical: {first: Animashree, last: Anandkumar} variants: - {first: Anima, last: Anandkumar} @@ -262,9 +262,6 @@ comment: IBM id: alexander-andreyewsky similar: [alexandre-andreewsky] -- canonical: {first: Antonella, last: De Angeli} - variants: - - {first: Antonella, last: 
DeAngeli} - canonical: {first: Peter, last: Anick} variants: - {first: Peter G., last: Anick} @@ -281,6 +278,9 @@ variants: - {first: Douglas E., last: Appelt} - {first: Doug, last: Appelt} +- canonical: {first: Noriko H., last: Arai} + variants: + - {first: Noriko, last: Arai} - canonical: {first: Kenji, last: Araki} id: kenji-araki - canonical: {first: Masahiro, last: Araki} @@ -884,6 +884,7 @@ - canonical: {first: Emanuela, last: Boroş} variants: - {first: Emanuela, last: Boroș} + - {first: Emanuela, last: Boros} - canonical: {first: Tiberiu, last: Boroş} variants: - {first: Tiberiu, last: Boroș} @@ -972,6 +973,9 @@ id: andrew-brasher - canonical: {first: Harry, last: Bratt} id: harry-bratt +- canonical: {first: Adrian, last: Braşoveanu} + variants: + - {first: Adrian, last: Brasoveanu} - canonical: {first: Eric, last: Breck} variants: - {first: Eric J., last: Breck} @@ -1150,8 +1154,6 @@ - canonical: {first: Charles B., last: Callaway} variants: - {first: Charles, last: Callaway} -- canonical: {first: Martine, last: de Calmès} - id: martine-de-calmes - canonical: {first: Diego, last: Calvanese} id: diego-calvanese - canonical: {first: Nicoletta, last: Calzolari} @@ -1744,6 +1746,10 @@ - canonical: {first: Shay B., last: Cohen} variants: - {first: Shay, last: Cohen} +- canonical: {first: Trevor, last: Cohen} + comment: University of Washington + id: trevor-cohen + similar: [trevor-cohn] - canonical: {first: William, last: Cohen} variants: - {first: William W., last: Cohen} @@ -1754,13 +1760,9 @@ variants: - {first: Luisa, last: Coheur} - canonical: {first: Trevor, last: Cohn} + comment: University of Melbourne id: trevor-cohn similar: [trevor-cohen] - comment: University of Melbourne -- canonical: {first: Trevor, last: Cohen} - id: trevor-cohen - similar: [trevor-cohn] - comment: University of Washington - canonical: {first: Andrew W., last: Cole} variants: - {first: Andrew, last: Cole} @@ -2056,6 +2058,9 @@ - canonical: {first: Danilo, last: Dayag.} variants: - 
{first: Danilo, last: Dayag} +- canonical: {first: Antonella, last: De Angeli} + variants: + - {first: Antonella, last: DeAngeli} - canonical: {first: Vitor, last: De Araujo} variants: - {first: Vítor, last: Araújo} @@ -2074,17 +2079,24 @@ - {first: Anne, last: deRoeck} - {first: A.N., last: De Roeck} - {first: Anne N., last: De Roeck} +- canonical: {first: Gianluca, last: De Rossi} + variants: + - {first: Gianluca, last: Rossi} +- canonical: {first: Koenraad, last: De Smedt} + variants: + - {first: Koenraad, last: de Smedt} + - {first: Koenraad, last: DeSmedt} - canonical: {first: Ángel, last: De la Torre} id: angel-de-la-torre - canonical: {first: Jonathan, last: DeCristofaro} variants: - {first: Jonathan D., last: DeCristofaro} -- canonical: {first: Aina, last: Garí Soler} - variants: - - {first: Aina Garí, last: Soler} - canonical: {first: Rosa, last: Del Gaudio} variants: - {first: Rosa, last: Gaudio} +- canonical: {first: Riccardo, last: Del Gratta} + variants: + - {first: Riccardo, last: del Gratta} - canonical: {first: Iria, last: Del Río Gayo} variants: - {first: Iria, last: del Río Gayo} @@ -2169,6 +2181,11 @@ - canonical: {first: Giuseppe, last: Di Fabbrizio} variants: - {first: Giuseppe, last: Fabbrizio} +- canonical: {first: Mauro, last: Di Manzo} + id: mauro-di-manzo +- canonical: {first: Giorgio Maria, last: Di Nunzio} + variants: + - {first: Giorgio, last: Di Nunzio} - canonical: {first: Vittorio, last: Di Tomaso} id: vittorio-di-tomaso - canonical: {first: Chrysanne, last: DiMarco} @@ -2501,10 +2518,6 @@ - canonical: {first: Miquel, last: Esplà-Gomis} variants: - {first: Miquel, last: Esplà} -- canonical: {first: Carol, last: Van Ess-Dykema} - variants: - - {first: Carol J., last: Van Ess-Dykema} - - {first: Carol, last: VanEss-Dykema} - canonical: {first: Dominique, last: Estival} id: dominique-estival - canonical: {first: David A., last: Evans} @@ -2990,6 +3003,9 @@ - canonical: {first: E. 
Gabriela, last: Garza} variants: - {first: Gabriela, last: Garza} +- canonical: {first: Aina, last: Garí Soler} + variants: + - {first: Aina Garí, last: Soler} - canonical: {first: Milica, last: Gasic} variants: - {first: Milica, last: Gašić} @@ -3279,9 +3295,6 @@ - canonical: {first: Robert, last: Granville} variants: - {first: Robert Alan, last: Granville} -- canonical: {first: Riccardo, last: Del Gratta} - variants: - - {first: Riccardo, last: del Gratta} - canonical: {first: Agustin, last: Gravano} variants: - {first: Agustín, last: Gravano} @@ -3430,6 +3443,9 @@ - canonical: {first: Asunción, last: Gómez-Pérez} variants: - {first: Asunción Gómez, last: Pérez} +- canonical: {first: José Manuel, last: Gómez-Pérez} + variants: + - {first: Jose Manuel, last: Gomez-Perez} - canonical: {first: Anne, last: Göhring} variants: - {first: Anne, last: Goehring} @@ -3439,9 +3455,6 @@ - canonical: {first: Jana, last: Götze} variants: - {first: Jana, last: Goetze} -- canonical: {first: Noriko H., last: Arai} - variants: - - {first: Noriko, last: Arai} - canonical: {first: Shachi, last: H. Kumar} variants: - {first: Shachi H, last: Kumar} @@ -4243,11 +4256,6 @@ - canonical: {first: Gareth J. F., last: Jones} variants: - {first: Gareth J.F., last: Jones} -- canonical: {first: Karen, last: Spärck Jones} - id: karen-sparck-jones - variants: - - {first: Karen, last: Sparck Jones} - - {first: Karen, last: Jones} - canonical: {first: Mark, last: Jones} variants: - {first: Mark A., last: Jones} @@ -5251,6 +5259,9 @@ - canonical: {first: Bo, last: Li} comment: May refer to several people id: bo-li +- canonical: {first: Bo, last: Li} + comment: Vanderbilt, UIUC + id: bo-li-vanderbilt - canonical: {first: Bo, last: Li} comment: NUS, Google id: bo-li-nus @@ -5426,6 +5437,9 @@ comment: May refer to several people id: yang-liu similar: [yang-liu-georgetown] +- canonical: {first: Yang, last: Liu} + comment: Univ. 
of Michigan, UC Santa Cruz + id: yang-liu-umich - canonical: {first: Yang (Janet), last: Liu} comment: 刘洋; Georgetown id: yang-liu-georgetown @@ -5582,6 +5596,9 @@ - {first: Soledad, last: López Gambino} - canonical: {first: Karmele, last: López de Ipiña} id: karmele-lopez-de-ipina +- canonical: {first: Maddalen, last: López de Lacalle} + variants: + - {first: Maddalen, last: Lopez de Lacalle} - canonical: {first: Ramón, last: López-Cózar} id: ramon-lopez-cozar - canonical: {first: Birte, last: Lönneker} @@ -5773,8 +5790,6 @@ - canonical: {first: Ramesh, last: Manuvinakurike} variants: - {first: Ramesh, last: Manuvirakurike} -- canonical: {first: Mauro, last: Di Manzo} - id: mauro-di-manzo - canonical: {first: Lingshuang Jack, last: Mao} variants: - {first: Lingshuang, last: Mao} @@ -5875,10 +5890,6 @@ variants: - {first: David Martins, last: de Matos} - {first: David M., last: de Matos} -- canonical: {first: Ely Edison da Silva, last: Matos} - variants: - - {first: Ely, last: Matos} - - {first: Ely E. S., last: Matos} - canonical: {first: M. Antònia, last: Martí} id: m-antonia-marti variants: @@ -5959,6 +5970,10 @@ variants: - {first: Yvette Yannick, last: Mathieu} - {first: Yvette, last: Mathieu} +- canonical: {first: Ely Edison da Silva, last: Matos} + variants: + - {first: Ely, last: Matos} + - {first: Ely E. 
S., last: Matos} - canonical: {first: Yuji, last: Matsumoto} variants: - {first: Yūji, last: Matsumoto} @@ -6556,8 +6571,6 @@ variants: - {first: Stefan, last: Muller} - {first: Stefan, last: Mueller} -- canonical: {first: Nicolas, last: Nedobejkine} - id: nicolas-nedobejkine - canonical: {first: Maria, last: Nadejde} variants: - {first: Maria, last: Nădejde} @@ -6647,6 +6660,8 @@ - canonical: {first: Silvia, last: Necşulescu} variants: - {first: Silvia, last: Necsulescu} +- canonical: {first: Nicolas, last: Nedobejkine} + id: nicolas-nedobejkine - canonical: {first: Mary S., last: Neff} variants: - {first: Mary, last: Neff} @@ -6760,11 +6775,6 @@ - canonical: {first: Nicolas, last: Nicolov} id: nicolas-nicolov similar: [nikola-i-nikolov] -- canonical: {first: Nikola I., last: Nikolov} - id: nikola-i-nikolov - variants: - - {first: Nikola, last: Nikolov} - similar: [nicolas-nicolov] - canonical: {first: Jian-Yun, last: Nie} variants: - {first: Jian-yun, last: Nie} @@ -6775,6 +6785,11 @@ id: sonja-niessen variants: - {first: Sonja, last: Niessen} +- canonical: {first: Nikola I., last: Nikolov} + id: nikola-i-nikolov + similar: [nicolas-nicolov] + variants: + - {first: Nikola, last: Nikolov} - canonical: {first: Kristina, last: Nilsson Björkenstam} variants: - {first: Kristina, last: Nilsson} @@ -6815,9 +6830,6 @@ - canonical: {first: Rita, last: Nuebel} variants: - {first: Rita, last: Nüebel} -- canonical: {first: Giorgio Maria, last: Di Nunzio} - variants: - - {first: Giorgio, last: Di Nunzio} - canonical: {first: Minghua, last: Nuo} variants: - {first: Ming Hua, last: Nuo} @@ -6898,8 +6910,7 @@ - {first: Duane, last: Olawsky} - canonical: {first: Karel, last: Oliva} variants: - - {first: karel, last: oliva} - - {first: karel, last: Oli̊va} + - {first: Karel, last: Oli̊va} - canonical: {first: José Luís, last: Oliveira} variants: - {first: Luís, last: Oliveira} @@ -7544,8 +7555,6 @@ - canonical: {first: Chantal, last: Pérez-Hernández} variants: - {first: Chantal, 
last: Pérez} -- canonical: {first: Maurice, last: Quezel-Ambrunaz} - id: maurice-quezel-ambrunaz - canonical: {first: Behrang, last: QasemiZadeh} variants: - {first: Behrang, last: Q. Zadeh} @@ -7564,6 +7573,8 @@ - canonical: {first: Yun-Qian, last: Qu} variants: - {first: Yunqian, last: Qu} +- canonical: {first: Maurice, last: Quezel-Ambrunaz} + id: maurice-quezel-ambrunaz - canonical: {first: Matthieu, last: Quignard} id: matthieu-quignard - canonical: {first: Kevin M., last: Quinn} @@ -7573,8 +7584,6 @@ variants: - {first: T. Pattabhi, last: R. K Rao} - {first: Pattabhi RK, last: Rao} -- canonical: {first: Paul, last: Roossin} - id: paul-roossin - canonical: {first: Hazem, last: Raafat} variants: - {first: Hazem, last: M. Raafat} @@ -7842,6 +7851,8 @@ id: laurent-romary - canonical: {first: Tiit, last: Roosmaa} id: tiit-roosmaa +- canonical: {first: Paul, last: Roossin} + id: paul-roossin - canonical: {first: Carolyn, last: Rose} id: carolyn-rose variants: @@ -7872,9 +7883,6 @@ - {first: Peter Rossen, last: Skadhauge} - canonical: {first: Sophie, last: Rosset} id: sophie-rosset -- canonical: {first: Gianluca, last: De Rossi} - variants: - - {first: Gianluca, last: Rossi} - canonical: {first: Piercarlo, last: Rossi} id: piercarlo-rossi - canonical: {first: Stefano Dei, last: Rossi} @@ -8498,10 +8506,6 @@ - canonical: {first: Sharon, last: Small} variants: - {first: Sharon, last: Gower Small} -- canonical: {first: Koenraad, last: De Smedt} - variants: - - {first: Koenraad, last: de Smedt} - - {first: Koenraad, last: DeSmedt} - canonical: {first: R. 
A., last: Smit} variants: - {first: R.A., last: Smit} @@ -8634,6 +8638,11 @@ - canonical: {first: Constantine D., last: Spyropoulos} variants: - {first: Constantine, last: Spyropoulos} +- canonical: {first: Karen, last: Spärck Jones} + id: karen-sparck-jones + variants: + - {first: Karen, last: Sparck Jones} + - {first: Karen, last: Jones} - canonical: {first: Rohini K., last: Srihari} variants: - {first: Rohini, last: Srihari} @@ -8880,9 +8889,6 @@ - canonical: {first: Anders, last: Søgaard} variants: - {first: Anders, last: Sogaard} -- canonical: {first: Loong-Cheong, last: Tong} - variants: - - {first: Loong Cheong, last: Tong} - canonical: {first: Maite, last: Taboada} id: maite-taboada - canonical: {first: Martha Yifiru, last: Tachbelie} @@ -9052,6 +9058,9 @@ - canonical: {first: Laura Mayfield, last: Tomokiyo} variants: - {first: Laura, last: Mayfield} +- canonical: {first: Loong-Cheong, last: Tong} + variants: + - {first: Loong Cheong, last: Tong} - canonical: {first: Fatemeh, last: Torabi Asr} variants: - {first: Fatemeh Torabi, last: Asr} @@ -9327,6 +9336,10 @@ - canonical: {first: M. Pilar, last: Valverde Ibáñez} variants: - {first: M. 
Pilar, last: Valverde Ibañez} +- canonical: {first: Carol, last: Van Ess-Dykema} + variants: + - {first: Carol J., last: Van Ess-Dykema} + - {first: Carol, last: VanEss-Dykema} - canonical: {first: Marjo, last: Van Koppen} variants: - {first: Marjo, last: van Koppen} @@ -9399,6 +9412,10 @@ - canonical: {first: Jean, last: Veronis} variants: - {first: Jean, last: Véronis} +- canonical: {first: Karin, last: Verspoor} + variants: + - {first: Karin M., last: Verspoor} + - {first: Cornelia Maria, last: Verspoor} - canonical: {first: Anita Lilla, last: Verő} variants: - {first: Anita Lilla, last: Vero} @@ -9536,9 +9553,6 @@ - canonical: {first: Jaakko, last: Väyrynen} variants: - {first: Jaakko J., last: Väyrynen} -- canonical: {first: Heinz J., last: Weber} - variants: - - {first: H-J., last: Weber} - canonical: {first: Luuk Van, last: Waes} variants: - {first: Luuk, last: Van Waes} @@ -9676,6 +9690,9 @@ variants: - {first: Bonnie L., last: Webber} - {first: Bonnie Lynn, last: Webber} +- canonical: {first: Heinz J., last: Weber} + variants: + - {first: H-J., last: Weber} - canonical: {first: Jonathan J., last: Webster} variants: - {first: Jonathan, last: Webster} @@ -10169,6 +10186,8 @@ - canonical: {first: William, last: de Beaumont} variants: - {first: Will, last: de Beaumont} +- canonical: {first: Martine, last: de Calmès} + id: martine-de-calmes - canonical: {first: Guadalupe Aguado, last: de Cea} variants: - {first: Guadalupe, last: Aguado de Cea} @@ -10183,9 +10202,6 @@ - canonical: {first: Clément, last: de Groc} variants: - {first: Clément, last: De Groc} -- canonical: {first: Maddalen, last: López de Lacalle} - variants: - - {first: Maddalen, last: Lopez de Lacalle} - canonical: {first: Vera Lucia Strube, last: de Lima} variants: - {first: Vera Lúcia Strube, last: de Lima} @@ -10361,7 +10377,3 @@ - canonical: {first: Lukáš, last: Žilka} variants: - {first: Lukas, last: Zilka} -- canonical: {first: Karin, last: Verspoor} - variants: - - {first: Karin M., last: 
Verspoor} - - {first: Cornelia Maria, last: Verspoor} diff --git a/data/yaml/venues.yaml b/data/yaml/venues.yaml index ad352124eb..6e7fa6c800 100644 --- a/data/yaml/venues.yaml +++ b/data/yaml/venues.yaml @@ -18,7 +18,8 @@ aha: ai4hi: acronym: AI4HI is_acl: false - name: International Workshop on Artificial Intelligence for Historical Image Enrichment and Access + name: International Workshop on Artificial Intelligence for Historical Image Enrichment + and Access url: https://chia.acdh.oeaw.ac.at/call-for-papers/ akbc: acronym: AKBC @@ -42,7 +43,7 @@ alvr: acronym: ALVR is_acl: true name: The Workshop on Advances in Language and Vision Research - url: "https://alvr-workshop.github.io/" + url: https://alvr-workshop.github.io/ alw: acronym: ALW is_acl: true @@ -75,7 +76,7 @@ autosimtrans: acronym: AutoSimTrans is_acl: true name: The Workshop on Automatic Simultaneous Translation - url: "https://autosimtrans.github.io/" + url: https://autosimtrans.github.io/ bea: acronym: BEA is_acl: true @@ -93,6 +94,9 @@ birndl: is_acl: false name: Joint Workshop on Bibliometric-enhanced Information Retrieval and Natural Language Processing for Digital Libraries +blackboxnlp: + acronym: BlackboxNLP + name: 'BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP' bsnlp: acronym: BSNLP is_acl: false @@ -111,8 +115,8 @@ catocl: name: Workshop on Computational Approaches to Causality in Language ccl: acronym: CCL - name: The Chinese National Conference on Computational Linguistics is_toplevel: true + name: The Chinese National Conference on Computational Linguistics url: http://cips-cl.org ccnlg: acronym: CCNLG @@ -125,7 +129,7 @@ challengehml: acronym: Challenge-HML is_acl: true name: The Grand Challenge and Workshop on Multimodal Language - url: "http://multicomp.cs.cmu.edu/acl2020multimodalworkshop/" + url: http://multicomp.cs.cmu.edu/acl2020multimodalworkshop/ cl: acronym: CL is_acl: true @@ -177,6 +181,9 @@ codeswitch: acronym: CodeSwitch is_acl: true name: Workshop on 
Computational Approaches to Code Switching +codi: + acronym: CODI + name: Workshop on Computational Approaches to Discourse cogacll: acronym: CogACLL is_acl: true @@ -229,6 +236,10 @@ cvsc: acronym: CVSC is_acl: true name: Workshop on Continuous Vector Space Models and their Compositionality +deelio: + acronym: DeeLIO + name: 'Deep Learning Inside Out (DeeLIO): The Workshop on Knowledge Extraction and + Integration for Deep Learning Architectures' depling: acronym: DepLing is_acl: false @@ -255,7 +266,7 @@ eamt: ecnlp: acronym: ECNLP name: The Workshop on e-Commerce and NLP - url: "https://sites.google.com/view/ecnlp/" + url: https://sites.google.com/view/ecnlp/ emnlp: acronym: EMNLP is_acl: true @@ -270,6 +281,9 @@ ethnlp: acronym: EthNLP is_acl: true name: Workshop on Ethics in Natural Language Processing +eval4nlp: + acronym: Eval4NLP + name: The Workshop on Evaluation and Comparison of NLP Systems (Eval4NLP) events: acronym: EVENTS is_acl: true @@ -285,12 +299,16 @@ exprom: fever: acronym: FEVER is_acl: true - name: The Workshop on Fact Extraction and Verification - url: "https://fever.ai/index.html" + name: Workshop on Fact Extraction and Verification + url: https://fever.ai/index.html figlang: acronym: Fig-Lang is_acl: true name: Workshop on Figurative Language Processing +findings: + acronym: Findings + is_acl: true + name: Findings of the Association for Computational Linguistics finnlp: acronym: FinNLP name: Financial Technology and Natural Language Processing @@ -360,6 +378,12 @@ inlg: acronym: INLG is_acl: true name: International Natural Language Generation Conference +insights: + acronym: insights + name: Workshop on Insights from Negative Results in NLP +intexsempar: + acronym: intexsempar + name: Workshop on Interactive and Executable Semantic Parsing isa: acronym: ISA is_acl: false @@ -367,8 +391,8 @@ isa: iwclul: acronym: IWCLUL is_acl: true - name: International Workshop on Computational Linguistics for Uralic Languages joint: WS + name: 
International Workshop on Computational Linguistics for Uralic Languages iwcs: acronym: IWCS is_acl: true @@ -383,12 +407,14 @@ iwpt: iwslt: acronym: IWSLT name: International Workshop on Spoken Language Translation - url: "http://www.iwslt.org/" + url: http://www.iwslt.org/ jeptalnrecital: acronym: JEP/TALN/RECITAL is_acl: false is_toplevel: true - name: Journées d'Etudes sur la Parole / Traitement Automatique de la Langue Naturelle / Rencontres des Etudiants Chercheurs en Informatique et Traitement Automatique des Langues + name: Journées d'Etudes sur la Parole / Traitement Automatique de la Langue Naturelle + / Rencontres des Etudiants Chercheurs en Informatique et Traitement Automatique + des Langues oldstyle_letter: F lasm: acronym: LASM @@ -422,8 +448,8 @@ lglp: name: Workshop on Lexical and Grammatical Resources for Language Processing lilt: acronym: LILT - name: Linguistic Issues in Language Technology is_toplevel: true + name: Linguistic Issues in Language Technology lincr: acronym: LiNCr name: Linguistic and Neurocognitive Resources @@ -437,7 +463,8 @@ lr4nlp: name: Workshop on Linguistic Resources for NLP lr4sshoc: acronym: LR4SSHOC - name: "Language Resources in the SSH Cloud: Bringing Language Technologies for Social Sciences and Humanities (in)to the European Open Science Cloud" + name: 'Language Resources in the SSH Cloud: Bringing Language Technologies for Social + Sciences and Humanities (in)to the European Open Science Cloud' url: https://www.sshopencloud.eu lrec: acronym: LREC @@ -532,7 +559,7 @@ nlp4convai: acronym: NLP4ConvAI is_acl: true name: The Workshop on Natural Language Processing for Conversational AI - url: "https://sites.google.com/view/2ndnlp4convai/home" + url: https://sites.google.com/view/2ndnlp4convai/home nlp4if: acronym: NLP4IF is_acl: false @@ -545,10 +572,12 @@ nlp4tm: acronym: NLP4TM is_acl: false name: Workshop on Natural Language Processing for Translation Memories +nlpbt: + acronym: nlpbt + name: International Workshop on 
Natural Language Processing Beyond Text nlpcovid19: acronym: NLP-COVID19 - is_acl: true - name: Workshop on NLP for COVID-19 (NLP-COVID19) + name: Workshop on NLP for COVID-19 nlpcss: acronym: NLP+CSS is_acl: true @@ -561,7 +590,10 @@ nlpmc: acronym: NLPMC is_acl: true name: The Workshop on NLP for Medical Conversations - url: "https://sites.google.com/view/nlp4medicalconversations/" + url: https://sites.google.com/view/nlp4medicalconversations/ +nlposs: + acronym: NLPOSS + name: Workshop for NLP Open Source Software nlptea: acronym: NLP-TEA is_acl: true @@ -577,7 +609,7 @@ nuse: acronym: NUSE is_acl: true name: The Workshop on Narrative Understanding, Storylines, and Events - url: "https://sites.google.com/view/nuse" + url: https://sites.google.com/view/nuse oiaf4hlt: acronym: OIAF4HLT is_acl: false @@ -615,6 +647,9 @@ prep: acronym: PREP is_acl: true name: Workshop on Prepositions +privatenlp: + acronym: PrivateNLP + name: Workshop on Privacy in NLP pylo: acronym: PYLO is_acl: false @@ -631,14 +666,14 @@ ranlp: readi: acronym: READI name: Tools and Resources to Empower People with REAding Difficulties -repl4nlp: - acronym: RepL4NLP - is_acl: true - name: Workshop on Representation Learning for NLP repeval: acronym: RepEval is_acl: true name: Workshop on Evaluating Vector-Space Representations for NLP +repl4nlp: + acronym: RepL4NLP + is_acl: true + name: Workshop on Representation Learning for NLP restup: acronym: ResTUP name: Resources and Techniques for User and Author Profiling in Abusive Language @@ -660,6 +695,9 @@ sadaatl: acronym: SADAATL is_acl: false name: Workshop on Synchronic and Diachronic Approaches to Analyzing Technical Language +scai: + acronym: scai + name: International Workshop on Search-Oriented Conversational AI (SCAI) scil: acronym: SCiL name: Society for Computation in Linguistics @@ -667,6 +705,9 @@ sclem: acronym: SCLeM is_acl: true name: Workshop on Subword and Character LEvel Models in NLP +sdp: + acronym: sdp + name: Workshop on 
Scholarly Document Processing (SDP 2020) sedmt: acronym: SedMT is_acl: true @@ -720,7 +761,7 @@ signlang: name: Workshop on the Representation and Processing of Sign Languages sigtyp: acronym: SIGTYP - is_acl: True + is_acl: true name: Special Interest Group on Typology url: https://sigtyp.github.io slpat: @@ -742,6 +783,9 @@ spmrl: acronym: SPMRL is_acl: true name: Workshop on Statistical Parsing of Morphologically Rich Languages +spnlp: + acronym: spnlp + name: Workshop on Structured Prediction for NLP ssst: acronym: SSST is_acl: true @@ -769,6 +813,9 @@ stylevar: acronym: Style-Var is_acl: true name: Workshop on Stylistic Variation +sustainlp: + acronym: sustainlp + name: Workshop on Simple and Efficient Natural Language Processing swaie: acronym: SWAIE is_acl: false