Merge branch 'ingestion-checklist' of https://github.com/yufei118liu/acl-anthology into ingestion-checklist
yufei118liu · Jun 23, 2024 · 1a8f98a · 1a8f98a
2 parents 9d4bb23 + 7cf2da5
commit 1a8f98a
Show file tree

Hide file tree

Showing 144 changed files with 9,511 additions and 221 deletions.
diff --git a/.github/ISSUE_TEMPLATE/04-ingestion-request.yml b/.github/ISSUE_TEMPLATE/04-ingestion-request.yml
@@ -24,12 +24,19 @@ body:
       placeholder: ex. emnlp, repl4nlp
     validations:
       required: true
+  - type: input
+    id: venue_sig
+    attributes:
+      label: "ACL SIG(s) sponsoring or endorsing the whole venue"
+      description: |
+        Provide a comma-separated list of any SIGs that apply to the whole venue. If there are multiple subvenues/volumes with different SIGs, provide the mapping under Supporting Information.
+      placeholder: ex. SIGLEX, SIGSEM
   - type: input
     id: volume_title
     attributes:
       label: Volume Title
       description: |
-        What is the title of the volume that should be published?
+        What is the title of the (main) volume that should be published?
       placeholder: ex. Proceedings of the 2019 Meeting of the Conference on Empirical Methods in Natural Language Processing (EMNLP)
     validations:
       required: true
@@ -54,9 +61,16 @@ body:
       description: |
         When would you like the material to be published on the ACL Anthology? If you are submitting material that can be published immediately (e.g. for conferences that already happened in the past), you can leave this field blank.
       placeholder: ex. 2023-12-31
+  - type: input
+    id: volume_address
+    attributes:
+      label: Location
+      description: |
+        What address should be included in bibliography entries, if any? For conferences, this is the location of the conference. For a fully online event, use "Online", optionally preceded by the host team's location. Ensure the address field is consistent across submitted volumes.
+      placeholder: ex. Barcelona, Spain (Online)
   - type: textarea
     id: ingestion_information
     attributes:
       label: Supporting Information
       description: |
-        If there is anything else we should know about this ingestion request, please provide the information here.  You can also use this field to **provide links or attach files** of the material, if you already have them.
+        If there is anything else we should know about this ingestion request, please provide the information here. E.g. for venues with multiple volumes, list them with the volume identifier, volume title, and any SIGs for the volume. You can also use this field to **provide links or attach files** of the material, if you already have them.
diff --git a/bin/anthology/papers.py b/bin/anthology/papers.py
@@ -180,14 +180,8 @@ def from_xml(xml_element, *args):
             paper.attrib["retracted"] = " "
 
         # Adjust the title for retracted papers
-        if (
-            "retracted" in paper.attrib
-            and "xml_title" in paper.attrib
-            and paper.attrib["xml_title"].text is not None
-        ):
-            paper.attrib["xml_title"].text = (
-                "[RETRACTED] " + paper.attrib["xml_title"].text
-            )
+        if "retracted" in paper.attrib and "xml_title" in paper.attrib:
+            paper.add_prefix_to_title("[RETRACTED] ")
 
         if "removed" in paper.attrib and paper.attrib["removed"] is None:
             paper.attrib["removed"] = " "
@@ -307,6 +301,13 @@ def get(self, name, default=None):
         except KeyError:
             return default
 
+    def add_prefix_to_title(self, prefix):
+        """Add a prefix to the title of the paper.
+        The attrib is an lxml Element object."""
+        if self.attrib["xml_title"].text is None:
+            self.attrib["xml_title"].text = ""
+        self.attrib["xml_title"].text = prefix + self.attrib["xml_title"].text
+
     def get_title(self, form="xml"):
         """Returns the paper title, optionally formatting it.
 

diff --git a/bin/requirements.txt b/bin/requirements.txt
@@ -1,3 +1,4 @@
+filelock==3.15.1
 black~=23.9.0
 citeproc-py
 citeproc-py-styles

diff --git a/data/xml/2020.aacl.xml b/data/xml/2020.aacl.xml
@@ -462,7 +462,6 @@
       <bibkey>nadeem-etal-2020-systematic</bibkey>
       <pwccode url="https://github.com/moinnadeem/characterizing-sampling-algorithms" additional="false">moinnadeem/characterizing-sampling-algorithms</pwccode>
       <pwcdataset url="https://paperswithcode.com/dataset/wikitext-103">WikiText-103</pwcdataset>
-      <pwcdataset url="https://paperswithcode.com/dataset/wikitext-2">WikiText-2</pwcdataset>
     </paper>
     <paper id="37">
       <title><fixed-case>C</fixed-case>hinese Content Scoring: Open-Access Datasets and Features on Different Segmentation Levels</title>

diff --git a/data/xml/2020.acl.xml b/data/xml/2020.acl.xml
@@ -4006,7 +4006,6 @@
       <bibkey>press-etal-2020-improving</bibkey>
       <pwccode url="" additional="true"/>
       <pwcdataset url="https://paperswithcode.com/dataset/wikitext-103">WikiText-103</pwcdataset>
-      <pwcdataset url="https://paperswithcode.com/dataset/wikitext-2">WikiText-2</pwcdataset>
     </paper>
     <paper id="271">
       <title>Single Model Ensemble using Pseudo-Tags and Distinct Vectors</title>
@@ -12697,7 +12696,6 @@
       <bibkey>bhatt-etal-2020-much</bibkey>
       <pwccode url="https://github.com/bhattg/Decay-RNN-ACL-SRW2020" additional="false">bhattg/Decay-RNN-ACL-SRW2020</pwccode>
       <pwcdataset url="https://paperswithcode.com/dataset/wikitext-103">WikiText-103</pwcdataset>
-      <pwcdataset url="https://paperswithcode.com/dataset/wikitext-2">WikiText-2</pwcdataset>
     </paper>
     <paper id="34">
       <title>Unsupervised Multilingual Sentence Embeddings for Parallel Corpus Mining</title>

diff --git a/data/xml/2020.coling.xml b/data/xml/2020.coling.xml
@@ -4761,7 +4761,6 @@
       <pwcdataset url="https://paperswithcode.com/dataset/glue">GLUE</pwcdataset>
       <pwcdataset url="https://paperswithcode.com/dataset/squad">SQuAD</pwcdataset>
       <pwcdataset url="https://paperswithcode.com/dataset/wikitext-103">WikiText-103</pwcdataset>
-      <pwcdataset url="https://paperswithcode.com/dataset/wikitext-2">WikiText-2</pwcdataset>
     </paper>
     <paper id="356">
       <title>How <fixed-case>LSTM</fixed-case> Encodes Syntax: Exploring Context Vectors and Semi-Quantization on Natural Text</title>
@@ -6271,7 +6270,7 @@
     </paper>
     <paper id="469">
       <title><fixed-case>S</fixed-case>a<fixed-case>SAKE</fixed-case>: Syntax and Semantics Aware Keyphrase Extraction from Research Papers</title>
-      <author><first>Santosh</first><last>Tokala</last></author>
+      <author><first>Santosh</first><last>T.y.s.s</last></author>
       <author><first>Debarshi</first><last>Kumar Sanyal</last></author>
       <author><first>Plaban Kumar</first><last>Bhowmick</last></author>
       <author><first>Partha Pratim</first><last>Das</last></author>

diff --git a/data/xml/2020.conll.xml b/data/xml/2020.conll.xml
@@ -643,7 +643,6 @@
       <doi>10.18653/v1/2020.conll-1.49</doi>
       <bibkey>eisape-etal-2020-cloze</bibkey>
       <pwcdataset url="https://paperswithcode.com/dataset/wikitext-103">WikiText-103</pwcdataset>
-      <pwcdataset url="https://paperswithcode.com/dataset/wikitext-2">WikiText-2</pwcdataset>
     </paper>
     <paper id="50">
       <title>Disentangling dialects: a neural approach to <fixed-case>I</fixed-case>ndo-<fixed-case>A</fixed-case>ryan historical phonology and subgrouping</title>

diff --git a/data/xml/2020.emnlp.xml b/data/xml/2020.emnlp.xml
@@ -409,10 +409,12 @@
       <author><first>David</first><last>Schlangen</last></author>
       <pages>357–374</pages>
       <abstract>While humans process language incrementally, the best language encoders currently used in NLP do not. Both bidirectional LSTMs and Transformers assume that the sequence that is to be encoded is available in full, to be processed either forwards and backwards (BiLSTMs) or as a whole (Transformers). We investigate how they behave under incremental interfaces, when partial output must be provided based on partial input seen up to a certain time step, which may happen in interactive systems. We test five models on various NLU datasets and compare their performance using three incremental evaluation metrics. The results support the possibility of using bidirectional encoders in incremental mode while retaining most of their non-incremental quality. The “omni-directional” BERT model, which achieves better non-incremental performance, is impacted more by the incremental access. This can be alleviated by adapting the training regime (truncated training), or the testing procedure, by delaying the output until some right context is available or by incorporating hypothetical right contexts generated by a language model like GPT-2.</abstract>
-      <url hash="09d22bbc">2020.emnlp-main.26</url>
+      <url hash="3ba95a3f">2020.emnlp-main.26</url>
       <doi>10.18653/v1/2020.emnlp-main.26</doi>
       <video href="https://slideslive.com/38938866"/>
       <bibkey>madureira-schlangen-2020-incremental</bibkey>
+      <revision id="1" href="2020.emnlp-main.26v1" hash="09d22bbc"/>
+      <revision id="2" href="2020.emnlp-main.26v2" hash="3ba95a3f" date="2024-05-07">Added a few missing citations and fixed results of a previously wrong implementation of one secondary evaluation metric.</revision>
       <pwccode url="https://github.com/briemadu/inc-bidirectional" additional="false">briemadu/inc-bidirectional</pwccode>
       <pwcdataset url="https://paperswithcode.com/dataset/atis">ATIS</pwcdataset>
       <pwcdataset url="https://paperswithcode.com/dataset/ontonotes-5-0">OntoNotes 5.0</pwcdataset>
@@ -5390,7 +5392,6 @@
       <video href="https://slideslive.com/38938905"/>
       <bibkey>wang-etal-2020-negative</bibkey>
       <pwccode url="https://github.com/iedwardwangi/MetaAdapter" additional="false">iedwardwangi/MetaAdapter</pwccode>
-      <pwcdataset url="https://paperswithcode.com/dataset/tydi-qa">TyDiQA</pwcdataset>
       <pwcdataset url="https://paperswithcode.com/dataset/tydiqa-goldp">TyDiQA-GoldP</pwcdataset>
     </paper>
     <paper id="360">
@@ -6347,7 +6348,6 @@
       <bibkey>shen-etal-2020-blank</bibkey>
       <pwccode url="https://github.com/Varal7/blank_language_model" additional="false">Varal7/blank_language_model</pwccode>
       <pwcdataset url="https://paperswithcode.com/dataset/wikitext-103">WikiText-103</pwcdataset>
-      <pwcdataset url="https://paperswithcode.com/dataset/wikitext-2">WikiText-2</pwcdataset>
     </paper>
     <paper id="421">
       <title><fixed-case>COD3S</fixed-case>: Diverse Generation with Discrete Semantic Signatures</title>
@@ -7533,7 +7533,7 @@
       <doi>10.18653/v1/2020.emnlp-main.498</doi>
       <video href="https://slideslive.com/38938695"/>
       <bibkey>garg-ramakrishnan-2020-bae</bibkey>
-      <pwccode url="https://github.com/QData/TextAttack" additional="true">QData/TextAttack</pwccode>
+      <pwccode url="https://github.com/QData/TextAttack/blob/master/textattack/attack_recipes/bae_garg_2019.py" additional="true">QData/TextAttack</pwccode>
       <pwcdataset url="https://paperswithcode.com/dataset/imdb-binary">IMDB-BINARY</pwcdataset>
       <pwcdataset url="https://paperswithcode.com/dataset/mpqa-opinion-corpus">MPQA Opinion Corpus</pwcdataset>
       <pwcdataset url="https://paperswithcode.com/dataset/mr">MR</pwcdataset>
@@ -9712,7 +9712,6 @@
       <video href="https://slideslive.com/38938778"/>
       <bibkey>khoury-etal-2020-vector</bibkey>
       <pwcdataset url="https://paperswithcode.com/dataset/wikitext-103">WikiText-103</pwcdataset>
-      <pwcdataset url="https://paperswithcode.com/dataset/wikitext-2">WikiText-2</pwcdataset>
     </paper>
     <paper id="641">
       <title>The importance of fillers for text representations of speech transcripts</title>
@@ -10682,7 +10681,6 @@
       <doi>10.18653/v1/2020.emnlp-main.703</doi>
       <video href="https://slideslive.com/38938907"/>
       <bibkey>bisk-etal-2020-experience</bibkey>
-      <pwccode url="" additional="true"/>
       <pwcdataset url="https://paperswithcode.com/dataset/penn-treebank">Penn Treebank</pwcdataset>
     </paper>
     <paper id="704">
@@ -11319,7 +11317,6 @@
       <doi>10.18653/v1/2020.emnlp-main.743</doi>
       <video href="https://slideslive.com/38938668"/>
       <bibkey>zeng-etal-2020-meddialog</bibkey>
-      <pwccode url="https://github.com/UCSD-AI4H/Medical-Dialogue-System" additional="false">UCSD-AI4H/Medical-Dialogue-System</pwccode>
     </paper>
     <paper id="744">
       <title>An information theoretic view on selecting linguistic probes</title>

diff --git a/data/xml/2020.eval4nlp.xml b/data/xml/2020.eval4nlp.xml
@@ -193,7 +193,6 @@
       <bibkey>dudy-bedrick-2020-words</bibkey>
       <pwccode url="https://github.com/shiranD/word_level_evaluation" additional="false">shiranD/word_level_evaluation</pwccode>
       <pwcdataset url="https://paperswithcode.com/dataset/wikitext-103">WikiText-103</pwcdataset>
-      <pwcdataset url="https://paperswithcode.com/dataset/wikitext-2">WikiText-2</pwcdataset>
     </paper>
     <paper id="14">
       <title>On Aligning <fixed-case>O</fixed-case>pen<fixed-case>IE</fixed-case> Extractions with Knowledge Bases: A Case Study</title>

diff --git a/data/xml/2020.findings.xml b/data/xml/2020.findings.xml
@@ -115,7 +115,6 @@
       <bibkey>huang-etal-2020-reducing</bibkey>
       <pwcdataset url="https://paperswithcode.com/dataset/sst">SST</pwcdataset>
       <pwcdataset url="https://paperswithcode.com/dataset/wikitext-103">WikiText-103</pwcdataset>
-      <pwcdataset url="https://paperswithcode.com/dataset/wikitext-2">WikiText-2</pwcdataset>
     </paper>
     <paper id="8">
       <title>Improving Text Understanding via Deep Syntax-Semantics Communication</title>
@@ -3740,7 +3739,6 @@
       <doi>10.18653/v1/2020.findings-emnlp.250</doi>
       <bibkey>lioutas-etal-2020-improving</bibkey>
       <pwcdataset url="https://paperswithcode.com/dataset/wikitext-103">WikiText-103</pwcdataset>
-      <pwcdataset url="https://paperswithcode.com/dataset/wikitext-2">WikiText-2</pwcdataset>
     </paper>
     <paper id="251">
       <title><fixed-case>P</fixed-case>harm<fixed-case>MT</fixed-case>: A Neural Machine Translation Approach to Simplify Prescription Directions</title>
@@ -6440,7 +6438,6 @@
       <pwcdataset url="https://paperswithcode.com/dataset/ncbi-disease-1">NCBI Disease</pwcdataset>
       <pwcdataset url="https://paperswithcode.com/dataset/sst">SST</pwcdataset>
       <pwcdataset url="https://paperswithcode.com/dataset/wikitext-103">WikiText-103</pwcdataset>
-      <pwcdataset url="https://paperswithcode.com/dataset/wikitext-2">WikiText-2</pwcdataset>
     </paper>
     <paper id="435">
       <title><fixed-case>E</fixed-case>xploiting <fixed-case>U</fixed-case>nsupervised <fixed-case>D</fixed-case>ata for <fixed-case>E</fixed-case>motion <fixed-case>R</fixed-case>ecognition in <fixed-case>C</fixed-case>onversations</title>
@@ -6471,7 +6468,6 @@
       <pwcdataset url="https://paperswithcode.com/dataset/imdb-movie-reviews">IMDb Movie Reviews</pwcdataset>
       <pwcdataset url="https://paperswithcode.com/dataset/sst">SST</pwcdataset>
       <pwcdataset url="https://paperswithcode.com/dataset/wikitext-103">WikiText-103</pwcdataset>
-      <pwcdataset url="https://paperswithcode.com/dataset/wikitext-2">WikiText-2</pwcdataset>
     </paper>
     <paper id="437">
       <title>Speaker or Listener? The Role of a Dialog Agent</title>

diff --git a/data/xml/2020.lrec.xml b/data/xml/2020.lrec.xml
@@ -5590,7 +5590,7 @@
     <paper id="446">
       <title><fixed-case>NMT</fixed-case> and <fixed-case>PBSMT</fixed-case> Error Analyses in <fixed-case>E</fixed-case>nglish to <fixed-case>B</fixed-case>razilian <fixed-case>P</fixed-case>ortuguese Automatic Translations</title>
       <author><first>Helena</first><last>Caseli</last></author>
-      <author><first>Marcio</first><last>Inácio</last></author>
+      <author><first>Marcio</first><last>Lima Inácio</last></author>
       <pages>3623–3629</pages>
       <abstract>Machine Translation (MT) is one of the most important natural language processing applications. Independently of the applied MT approach, a MT system automatically generates an equivalent version (in some target language) of an input sentence (in some source language). Recently, a new MT approach has been proposed: neural machine translation (NMT). NMT systems have already outperformed traditional phrase-based statistical machine translation (PBSMT) systems for some pairs of languages. However, any MT approach outputs errors. In this work we present a comparative study of MT errors generated by a NMT system and a PBSMT system trained on the same English – Brazilian Portuguese parallel corpus. This is the first study of this kind involving NMT for Brazilian Portuguese. Furthermore, the analyses and conclusions presented here point out the specific problems of NMT outputs in relation to PBSMT ones and also give lots of insights into how to implement automatic post-editing for a NMT system. Finally, the corpora annotated with MT errors generated by both PBSMT and NMT systems are also available.</abstract>
       <url hash="02cdcab2">2020.lrec-1.446</url>

diff --git a/data/xml/2020.msr.xml b/data/xml/2020.msr.xml
@@ -33,7 +33,6 @@
       <bibkey>mille-etal-2020-third</bibkey>
       <pwccode url="https://gitlab.com/talnupf/ud2deep" additional="false">talnupf/ud2deep</pwccode>
       <pwcdataset url="https://paperswithcode.com/dataset/wikitext-103">WikiText-103</pwcdataset>
-      <pwcdataset url="https://paperswithcode.com/dataset/wikitext-2">WikiText-2</pwcdataset>
     </paper>
     <paper id="2">
       <title><fixed-case>BME</fixed-case>-<fixed-case>TUW</fixed-case> at <fixed-case>SR</fixed-case>’20: Lexical grammar induction for surface realization</title>

diff --git a/data/xml/2020.scil.xml b/data/xml/2020.scil.xml
@@ -356,7 +356,6 @@
       <bibkey>hu-etal-2020-closer</bibkey>
       <pwccode url="https://github.com/jennhu/reflexive-anaphor-licensing" additional="false">jennhu/reflexive-anaphor-licensing</pwccode>
       <pwcdataset url="https://paperswithcode.com/dataset/wikitext-103">WikiText-103</pwcdataset>
-      <pwcdataset url="https://paperswithcode.com/dataset/wikitext-2">WikiText-2</pwcdataset>
     </paper>
     <paper id="40">
       <title><fixed-case>M</fixed-case>ona<fixed-case>L</fixed-case>og: a Lightweight System for Natural Language Inference Based on Monotonicity</title>

diff --git a/data/xml/2020.tacl.xml b/data/xml/2020.tacl.xml
@@ -364,7 +364,6 @@
       <pwcdataset url="https://paperswithcode.com/dataset/glue">GLUE</pwcdataset>
       <pwcdataset url="https://paperswithcode.com/dataset/webtext">WebText</pwcdataset>
       <pwcdataset url="https://paperswithcode.com/dataset/wikitext-103">WikiText-103</pwcdataset>
-      <pwcdataset url="https://paperswithcode.com/dataset/wikitext-2">WikiText-2</pwcdataset>
     </paper>
     <paper id="26">
       <title>Reproducible and Efficient Benchmarks for Hyperparameter Optimization of Neural Machine Translation Systems</title>