diff --git a/.github/ISSUE_TEMPLATE/04-ingestion-request.yml b/.github/ISSUE_TEMPLATE/04-ingestion-request.yml
index 09f9d10939..663ac659df 100644
--- a/.github/ISSUE_TEMPLATE/04-ingestion-request.yml
+++ b/.github/ISSUE_TEMPLATE/04-ingestion-request.yml
@@ -24,12 +24,19 @@ body:
placeholder: ex. emnlp, repl4nlp
validations:
required: true
+ - type: input
+ id: venue_sig
+ attributes:
+ label: "ACL SIG(s) sponsoring or endorsing the whole venue"
+ description: |
+ Provide a comma-separated list of any SIGs that apply to the whole venue. If there are multiple subvenues/volumes with different SIGs, provide the mapping under Supporting Information.
+ placeholder: ex. SIGLEX, SIGSEM
- type: input
id: volume_title
attributes:
label: Volume Title
description: |
- What is the title of the volume that should be published?
+ What is the title of the (main) volume that should be published?
placeholder: ex. Proceedings of the 2019 Meeting of the Conference on Empirical Methods in Natural Language Processing (EMNLP)
validations:
required: true
@@ -54,9 +61,16 @@ body:
description: |
When would you like the material to be published on the ACL Anthology? If you are submitting material that can be published immediately (e.g. for events that have already taken place), you can leave this field blank.
placeholder: ex. 2023-12-31
+ - type: input
+ id: volume_address
+ attributes:
+ label: Location
+ description: |
What address should be included in bibliography entries, if any? For conferences, this is the location of the conference. For a fully-online event, use "Online", optionally after the host team's location. Ensure the address field is consistent across submitted volumes.
+ placeholder: ex. Barcelona, Spain (Online)
- type: textarea
id: ingestion_information
attributes:
label: Supporting Information
description: |
- If there is anything else we should know about this ingestion request, please provide the information here. You can also use this field to **provide links or attach files** of the material, if you already have them.
+          If there is anything else we should know about this ingestion request, please provide the information here. For example, for venues with multiple volumes, list each volume with its identifier, title, and any SIGs that apply to it. You can also use this field to **provide links or attach files** of the material, if you already have them.
diff --git a/bin/anthology/papers.py b/bin/anthology/papers.py
index 9e334e6f95..1bbf0cb4dd 100644
--- a/bin/anthology/papers.py
+++ b/bin/anthology/papers.py
@@ -180,14 +180,8 @@ def from_xml(xml_element, *args):
paper.attrib["retracted"] = " "
# Adjust the title for retracted papers
- if (
- "retracted" in paper.attrib
- and "xml_title" in paper.attrib
- and paper.attrib["xml_title"].text is not None
- ):
- paper.attrib["xml_title"].text = (
- "[RETRACTED] " + paper.attrib["xml_title"].text
- )
+ if "retracted" in paper.attrib and "xml_title" in paper.attrib:
+ paper.add_prefix_to_title("[RETRACTED] ")
if "removed" in paper.attrib and paper.attrib["removed"] is None:
paper.attrib["removed"] = " "
@@ -307,6 +301,13 @@ def get(self, name, default=None):
except KeyError:
return default
+    def add_prefix_to_title(self, prefix):
+        """Add a prefix to the paper's title.
+        self.attrib["xml_title"] is an lxml Element whose text may be None."""
+ if self.attrib["xml_title"].text is None:
+ self.attrib["xml_title"].text = ""
+ self.attrib["xml_title"].text = prefix + self.attrib["xml_title"].text
+
def get_title(self, form="xml"):
"""Returns the paper title, optionally formatting it.
diff --git a/bin/requirements.txt b/bin/requirements.txt
index 05ba7edb6d..6760374aff 100644
--- a/bin/requirements.txt
+++ b/bin/requirements.txt
@@ -1,3 +1,4 @@
+filelock==3.15.1
black~=23.9.0
citeproc-py
citeproc-py-styles
diff --git a/data/xml/2020.aacl.xml b/data/xml/2020.aacl.xml
index 5704c4b343..b74ca8f616 100644
--- a/data/xml/2020.aacl.xml
+++ b/data/xml/2020.aacl.xml
@@ -462,7 +462,6 @@
nadeem-etal-2020-systematic
moinnadeem/characterizing-sampling-algorithms
WikiText-103
- WikiText-2
Chinese Content Scoring: Open-Access Datasets and Features on Different Segmentation Levels
diff --git a/data/xml/2020.acl.xml b/data/xml/2020.acl.xml
index 566a7524ab..d79c11ec9f 100644
--- a/data/xml/2020.acl.xml
+++ b/data/xml/2020.acl.xml
@@ -4006,7 +4006,6 @@
press-etal-2020-improving
WikiText-103
- WikiText-2
Single Model Ensemble using Pseudo-Tags and Distinct Vectors
@@ -12697,7 +12696,6 @@
bhatt-etal-2020-much
bhattg/Decay-RNN-ACL-SRW2020
WikiText-103
- WikiText-2
Unsupervised Multilingual Sentence Embeddings for Parallel Corpus Mining
diff --git a/data/xml/2020.coling.xml b/data/xml/2020.coling.xml
index ad11e8413d..8fe9e02327 100644
--- a/data/xml/2020.coling.xml
+++ b/data/xml/2020.coling.xml
@@ -4761,7 +4761,6 @@
GLUE
SQuAD
WikiText-103
- WikiText-2
How LSTM Encodes Syntax: Exploring Context Vectors and Semi-Quantization on Natural Text
@@ -6271,7 +6270,7 @@
SaSAKE: Syntax and Semantics Aware Keyphrase Extraction from Research Papers
- SantoshTokala
+ SantoshT.y.s.s
DebarshiKumar Sanyal
Plaban KumarBhowmick
Partha PratimDas
diff --git a/data/xml/2020.conll.xml b/data/xml/2020.conll.xml
index 4c8838e8f5..5c952a7be2 100644
--- a/data/xml/2020.conll.xml
+++ b/data/xml/2020.conll.xml
@@ -643,7 +643,6 @@
10.18653/v1/2020.conll-1.49
eisape-etal-2020-cloze
WikiText-103
- WikiText-2
Disentangling dialects: a neural approach to Indo-Aryan historical phonology and subgrouping
diff --git a/data/xml/2020.emnlp.xml b/data/xml/2020.emnlp.xml
index d109f8c3a3..770c9e4bbc 100644
--- a/data/xml/2020.emnlp.xml
+++ b/data/xml/2020.emnlp.xml
@@ -409,10 +409,12 @@
DavidSchlangen
357–374
While humans process language incrementally, the best language encoders currently used in NLP do not. Both bidirectional LSTMs and Transformers assume that the sequence that is to be encoded is available in full, to be processed either forwards and backwards (BiLSTMs) or as a whole (Transformers). We investigate how they behave under incremental interfaces, when partial output must be provided based on partial input seen up to a certain time step, which may happen in interactive systems. We test five models on various NLU datasets and compare their performance using three incremental evaluation metrics. The results support the possibility of using bidirectional encoders in incremental mode while retaining most of their non-incremental quality. The “omni-directional” BERT model, which achieves better non-incremental performance, is impacted more by the incremental access. This can be alleviated by adapting the training regime (truncated training), or the testing procedure, by delaying the output until some right context is available or by incorporating hypothetical right contexts generated by a language model like GPT-2.
- 2020.emnlp-main.26
+ 2020.emnlp-main.26
10.18653/v1/2020.emnlp-main.26
madureira-schlangen-2020-incremental
+
+ Added a few missing citations and fixed results of a previously wrong implementation of one secondary evaluation metric.
briemadu/inc-bidirectional
ATIS
OntoNotes 5.0
@@ -5390,7 +5392,6 @@
wang-etal-2020-negative
iedwardwangi/MetaAdapter
- TyDiQA
TyDiQA-GoldP
@@ -6347,7 +6348,6 @@
shen-etal-2020-blank
Varal7/blank_language_model
WikiText-103
- WikiText-2
COD3S: Diverse Generation with Discrete Semantic Signatures
@@ -7533,7 +7533,7 @@
10.18653/v1/2020.emnlp-main.498
garg-ramakrishnan-2020-bae
- QData/TextAttack
+ QData/TextAttack
IMDB-BINARY
MPQA Opinion Corpus
MR
@@ -9712,7 +9712,6 @@
khoury-etal-2020-vector
WikiText-103
- WikiText-2
The importance of fillers for text representations of speech transcripts
@@ -10682,7 +10681,6 @@
10.18653/v1/2020.emnlp-main.703
bisk-etal-2020-experience
-
Penn Treebank
@@ -11319,7 +11317,6 @@
10.18653/v1/2020.emnlp-main.743
zeng-etal-2020-meddialog
- UCSD-AI4H/Medical-Dialogue-System
An information theoretic view on selecting linguistic probes
diff --git a/data/xml/2020.eval4nlp.xml b/data/xml/2020.eval4nlp.xml
index a84ee3843d..cd7fd69e87 100644
--- a/data/xml/2020.eval4nlp.xml
+++ b/data/xml/2020.eval4nlp.xml
@@ -193,7 +193,6 @@
dudy-bedrick-2020-words
shiranD/word_level_evaluation
WikiText-103
- WikiText-2
On Aligning OpenIE Extractions with Knowledge Bases: A Case Study
diff --git a/data/xml/2020.findings.xml b/data/xml/2020.findings.xml
index 94137820f9..13c9df719a 100644
--- a/data/xml/2020.findings.xml
+++ b/data/xml/2020.findings.xml
@@ -115,7 +115,6 @@
huang-etal-2020-reducing
SST
WikiText-103
- WikiText-2
Improving Text Understanding via Deep Syntax-Semantics Communication
@@ -3740,7 +3739,6 @@
10.18653/v1/2020.findings-emnlp.250
lioutas-etal-2020-improving
WikiText-103
- WikiText-2
PharmMT: A Neural Machine Translation Approach to Simplify Prescription Directions
@@ -6440,7 +6438,6 @@
NCBI Disease
SST
WikiText-103
- WikiText-2
Exploiting Unsupervised Data for Emotion Recognition in Conversations
@@ -6471,7 +6468,6 @@
IMDb Movie Reviews
SST
WikiText-103
- WikiText-2
Speaker or Listener? The Role of a Dialog Agent
diff --git a/data/xml/2020.lrec.xml b/data/xml/2020.lrec.xml
index 10b9d60a5c..3faa42f108 100644
--- a/data/xml/2020.lrec.xml
+++ b/data/xml/2020.lrec.xml
@@ -5590,7 +5590,7 @@
NMT and PBSMT Error Analyses in English to Brazilian Portuguese Automatic Translations
HelenaCaseli
- MarcioInácio
+ MarcioLima Inácio
3623–3629
Machine Translation (MT) is one of the most important natural language processing applications. Independently of the applied MT approach, a MT system automatically generates an equivalent version (in some target language) of an input sentence (in some source language). Recently, a new MT approach has been proposed: neural machine translation (NMT). NMT systems have already outperformed traditional phrase-based statistical machine translation (PBSMT) systems for some pairs of languages. However, any MT approach outputs errors. In this work we present a comparative study of MT errors generated by a NMT system and a PBSMT system trained on the same English – Brazilian Portuguese parallel corpus. This is the first study of this kind involving NMT for Brazilian Portuguese. Furthermore, the analyses and conclusions presented here point out the specific problems of NMT outputs in relation to PBSMT ones and also give lots of insights into how to implement automatic post-editing for a NMT system. Finally, the corpora annotated with MT errors generated by both PBSMT and NMT systems are also available.
2020.lrec-1.446
diff --git a/data/xml/2020.msr.xml b/data/xml/2020.msr.xml
index 219ff20831..10f00762a8 100644
--- a/data/xml/2020.msr.xml
+++ b/data/xml/2020.msr.xml
@@ -33,7 +33,6 @@
mille-etal-2020-third
talnupf/ud2deep
WikiText-103
- WikiText-2
BME-TUW at SR’20: Lexical grammar induction for surface realization
diff --git a/data/xml/2020.scil.xml b/data/xml/2020.scil.xml
index 810e825d92..3816227342 100644
--- a/data/xml/2020.scil.xml
+++ b/data/xml/2020.scil.xml
@@ -356,7 +356,6 @@
hu-etal-2020-closer
jennhu/reflexive-anaphor-licensing
WikiText-103
- WikiText-2
MonaLog: a Lightweight System for Natural Language Inference Based on Monotonicity
diff --git a/data/xml/2020.tacl.xml b/data/xml/2020.tacl.xml
index cdd99ee3da..17daba8d35 100644
--- a/data/xml/2020.tacl.xml
+++ b/data/xml/2020.tacl.xml
@@ -364,7 +364,6 @@
GLUE
WebText
WikiText-103
- WikiText-2
Reproducible and Efficient Benchmarks for Hyperparameter Optimization of Neural Machine Translation Systems
diff --git a/data/xml/2021.acl.xml b/data/xml/2021.acl.xml
index d90bc03c95..91b0d01c81 100644
--- a/data/xml/2021.acl.xml
+++ b/data/xml/2021.acl.xml
@@ -724,7 +724,6 @@
LZhengisme/CODA
WMT 2014
WikiText-103
- WikiText-2
Structural Knowledge Distillation: Tractably Distilling Information for Structured Predictor
@@ -1123,7 +1122,6 @@
lingo-mit/context-ablations
WikiText-103
- WikiText-2
Integrated Directional Gradients: Feature Interaction Attribution for Neural NLP Models
@@ -1404,7 +1402,6 @@
SST
SST-2
WikiText-103
- WikiText-2
Explainable Prediction of Text Complexity: The Missing Preliminaries for Text Simplification
@@ -3725,7 +3722,6 @@
DuReader
IMDb Movie Reviews
WikiText-103
- WikiText-2
Marginal Utility Diminishes: Exploring the Minimum Knowledge for BERT Knowledge Distillation
@@ -4778,7 +4774,6 @@
princeton-nlp/dyck-transformer
WikiText-103
- WikiText-2
TextSETTR: Few-Shot Text Style Extraction and Tunable Targeted Restyling
@@ -6582,7 +6577,6 @@
Wiki-40B
WikiText-103
- WikiText-2
Lower Perplexity is Not Always Human-Like
@@ -6953,7 +6947,6 @@
ofirpress/shortformer
BookCorpus
WikiText-103
- WikiText-2
BanditMTL: Bandit-based Multi-task Learning for Text Classification
@@ -10657,7 +10650,6 @@
10.18653/v1/2021.acl-short.90
he-etal-2021-towards
- UCSD-AI4H/PathVQA
VQA-RAD
Visual Question Answering
@@ -11003,7 +10995,6 @@
10.18653/v1/2021.acl-short.112
zhou-etal-2021-generation
- UCSD-AI4H/COVID-Dialogue
Constructing Multi-Modal Dialogue Dataset by Replacing Text with Semantically Relevant Images
@@ -12263,7 +12254,7 @@
CRSLab: An Open-Source Toolkit for Building Conversational Recommender System
KunZhou
- XiaoleiWang
+ XiaoleiWang
YuanhangZhou
ChenzhanShang
YuanCheng
diff --git a/data/xml/2021.adaptnlp.xml b/data/xml/2021.adaptnlp.xml
index 91c1580f9c..027c1d0957 100644
--- a/data/xml/2021.adaptnlp.xml
+++ b/data/xml/2021.adaptnlp.xml
@@ -203,7 +203,6 @@
2021.adaptnlp-1.15
buck-vlachos-2021-trajectory
WikiText-103
- WikiText-2
Dependency Parsing Evaluation for Low-resource Spontaneous Speech
@@ -223,7 +222,7 @@
Compound probabilistic context-free grammars (C-PCFGs) have recently established a new state of the art for phrase-structure grammar induction. However, due to the high time-complexity of chart-based representation and inference, it is difficult to investigate them comprehensively. In this work, we rely on a fast implementation of C-PCFGs to conduct evaluation complementary to that of (CITATION). We highlight three key findings: (1) C-PCFGs are data-efficient, (2) C-PCFGs make the best use of global sentence-level information in preterminal rule probabilities, and (3) the best configurations of C-PCFGs on English do not always generalize to morphology-rich languages.
2021.adaptnlp-1.17
zhao-titov-2021-empirical
- zhaoyanpeng/cpcfg
+ zhaoyanpeng/xcfg
English Web Treebank
diff --git a/data/xml/2021.cmcl.xml b/data/xml/2021.cmcl.xml
index 29dc25c61d..4c637ba2a0 100644
--- a/data/xml/2021.cmcl.xml
+++ b/data/xml/2021.cmcl.xml
@@ -203,7 +203,6 @@
10.18653/v1/2021.cmcl-1.16
vickers-etal-2021-cognlp
WikiText-103
- WikiText-2
Team ReadMe at CMCL 2021 Shared Task: Predicting Human Reading Patterns by Traditional Oculomotor Control Models and Machine Learning
diff --git a/data/xml/2021.emnlp.xml b/data/xml/2021.emnlp.xml
index a0c055baba..a1961628bf 100644
--- a/data/xml/2021.emnlp.xml
+++ b/data/xml/2021.emnlp.xml
@@ -1164,7 +1164,6 @@
CoLA
Natural Stories
WikiText-103
- WikiText-2
Condenser: a Pre-training Architecture for Dense Retrieval
@@ -2385,7 +2384,7 @@
wang-etal-2021-gender
10.18653/v1/2021.emnlp-main.151
- kuanghuei/SCAN
+ kuanghuei/SCAN
Flickr30k
MS COCO
@@ -3003,7 +3002,6 @@
hu-etal-2021-ranknas
10.18653/v1/2021.emnlp-main.191
WikiText-103
- WikiText-2
FLiText: A Faster and Lighter Semi-Supervised Text Classification with Convolution Networks
@@ -3743,7 +3741,6 @@
QNLI
SNLI
WikiText-103
- WikiText-2
Adversarial Mixing Policy for Relaxing Locally Linear Constraints in Mixup
@@ -7029,7 +7026,6 @@
mega002/ff-layers
WikiText-103
- WikiText-2
Connecting Attributions and QA Model Behavior on Realistic Counterfactuals
@@ -7242,7 +7238,6 @@
jxhe/efficient-knnlm
WikiText-103
- WikiText-2
STraTA: Self-Training with Task Augmentation for Better Few-shot Learning
@@ -9524,7 +9519,6 @@
asappresearch/sru
Billion Word Benchmark
WikiText-103
- WikiText-2
Universal-KD: Attention-based Output-Grounded Intermediate Layer Knowledge Distillation
@@ -11217,7 +11211,7 @@
hardalov-etal-2021-cross
10.18653/v1/2021.emnlp-main.710
- checkstep/mole-stance
+ checkstep/mole-stance
Text AutoAugment: Learning Compositional Augmentation Policy for Text Classification
@@ -12620,7 +12614,6 @@
Mewsli-9
SQuAD
Tatoeba
- TyDiQA
TyDiQA-GoldP
XCOPA
XNLI
@@ -13031,7 +13024,6 @@
cpcp1998/permuteformer
WikiText-103
- WikiText-2
Block Pruning For Faster Transformers
@@ -13072,7 +13064,6 @@
WMT 2014
WikiText-103
- WikiText-2
How to Train BERT with an Academic Budget
diff --git a/data/xml/2021.findings.xml b/data/xml/2021.findings.xml
index 38768c60ac..a1238639e8 100644
--- a/data/xml/2021.findings.xml
+++ b/data/xml/2021.findings.xml
@@ -1168,7 +1168,6 @@
CONAN
NEWSROOM
WikiText-103
- WikiText-2
SOLID: A Large-Scale Semi-Supervised Dataset for Offensive Language Identification
@@ -3144,7 +3143,7 @@
2021.findings-acl.217
10.18653/v1/2021.findings-acl.217
kruengkrai-etal-2021-multi
- nii-yamagishilab/mla
+ nii-yamagishilab/mla
FEVER
@@ -4916,7 +4915,6 @@
QUASAR-T
SearchQA
WikiText-103
- WikiText-2
Minimally-Supervised Morphological Segmentation using Adaptor Grammars with Linguistic Priors
@@ -5620,7 +5618,6 @@
garimella-etal-2021-intelligent
WikiText-103
- WikiText-2
Task-adaptive Pre-training of Language Models with Word Embedding Regularization
@@ -11686,7 +11683,6 @@
machelreid/subformer
CNN/Daily Mail
WikiText-103
- WikiText-2
Leveraging Information Bottleneck for Scientific Document Summarization
@@ -11718,7 +11714,6 @@
SST
SST-2
WikiText-103
- WikiText-2
Attend, Memorize and Generate: Towards Faithful Table-to-Text Generation in Few Shots
diff --git a/data/xml/2021.konvens.xml b/data/xml/2021.konvens.xml
index a5d88a6479..68876689cb 100644
--- a/data/xml/2021.konvens.xml
+++ b/data/xml/2021.konvens.xml
@@ -44,7 +44,6 @@
SST
SST-2
WikiText-103
- WikiText-2
ArgueBERT: How To Improve BERT Embeddings for Measuring the Similarity of Arguments
diff --git a/data/xml/2021.mtsummit.xml b/data/xml/2021.mtsummit.xml
index e27d4dc876..43f8a714a7 100644
--- a/data/xml/2021.mtsummit.xml
+++ b/data/xml/2021.mtsummit.xml
@@ -474,7 +474,6 @@ Our models outperform massively multilingual models such as Google (+8
2021.mtsummit-at4ssl.7
A cascaded Sign Language Translation system first maps sign videos to gloss annotations and then translates glosses into a spoken languages. This work focuses on the second-stage gloss translation component, which is challenging due to the scarcity of publicly available parallel data. We approach gloss translation as a low-resource machine translation task and investigate two popular methods for improving translation quality: hyperparameter search and backtranslation. We discuss the potentials and pitfalls of these methods based on experiments on the RWTH-PHOENIX-Weather 2014T dataset.
zhang-duh-2021-approaching
- PHOENIX14T
RWTH-PHOENIX-Weather 2014 T
@@ -514,7 +513,6 @@ Our models outperform massively multilingual models such as Google (+8
2021.mtsummit-at4ssl.10
One of the major challenges in sign language translation from a sign language to a spoken language is the lack of parallel corpora. Recent works have achieved promising results on the RWTH-PHOENIX-Weather 2014T dataset, which consists of over eight thousand parallel sentences between German sign language and German. However, from the perspective of neural machine translation, this is still a tiny dataset. To improve the performance of models trained on small datasets, transfer learning can be used. While this has been previously applied in sign language translation for feature extraction, to the best of our knowledge, pretrained language models have not yet been investigated. We use pretrained BERT-base and mBART-50 models to initialize our sign language video to spoken language text translation model. To mitigate overfitting, we apply the frozen pretrained transformer technique: we freeze the majority of parameters during training. Using a pretrained BERT model, we outperform a baseline trained from scratch by 1 to 2 BLEU-4. Our results show that pretrained language models can be used to improve sign language translation performance and that the self-attention patterns in BERT transfer in zero-shot to the encoder and decoder of sign language translation models.
de-coster-etal-2021-frozen
- PHOENIX14T
RWTH-PHOENIX-Weather 2014 T
diff --git a/data/xml/2021.naacl.xml b/data/xml/2021.naacl.xml
index 711cc7fb6e..631062899b 100644
--- a/data/xml/2021.naacl.xml
+++ b/data/xml/2021.naacl.xml
@@ -1232,6 +1232,7 @@
yang-etal-2021-mtag
jedyang97/MTAG
+ CMU-MOSI
IEMOCAP
@@ -2586,7 +2587,6 @@
SST-2
SST-5
WikiText-103
- WikiText-2
DA-Transformer: Distance-aware Transformer
@@ -5517,6 +5517,7 @@
10.18653/v1/2021.naacl-main.359
gosangi-etal-2021-use
+ PMOA-CITE
Data and Model Distillation as a Solution for Domain-transferable Fact Verification
@@ -6108,7 +6109,6 @@
ding-koehn-2021-evaluating
shuoyangd/tarsius
- WikiText-2
WinoBias
@@ -6236,7 +6236,6 @@
SimengSun/revisit-nplm
LAMBADA
WikiText-103
- WikiText-2
ReadTwice: Reading Very Large Documents with Memories
diff --git a/data/xml/2021.ranlp.xml b/data/xml/2021.ranlp.xml
index f8b59a27dd..a4bb656c24 100644
--- a/data/xml/2021.ranlp.xml
+++ b/data/xml/2021.ranlp.xml
@@ -798,7 +798,7 @@
Semantic-Based Opinion Summarization
- MarcioInácio
+ MarcioLima Inácio
ThiagoPardo
619–628
The amount of information available online can be overwhelming for users to digest, specially when dealing with other users’ comments when making a decision about buying a product or service. In this context, opinion summarization systems are of great value, extracting important information from the texts and presenting them to the user in a more understandable manner. It is also known that the usage of semantic representations can benefit the quality of the generated summaries. This paper aims at developing opinion summarization methods based on Abstract Meaning Representation of texts in the Brazilian Portuguese language. Four different methods have been investigated, alongside some literature approaches. The results show that a Machine Learning-based method produced summaries of higher quality, outperforming other literature techniques on manually constructed semantic graphs. We also show that using parsed graphs over manually annotated ones harmed the output. Finally, an analysis of how important different types of information are for the summarization process suggests that using Sentiment Analysis features did not improve summary quality.
diff --git a/data/xml/2021.spnlp.xml b/data/xml/2021.spnlp.xml
index 85c798a91a..4605fac2bf 100644
--- a/data/xml/2021.spnlp.xml
+++ b/data/xml/2021.spnlp.xml
@@ -81,7 +81,6 @@
uralik/mode_recovery
WikiText-103
- WikiText-2
Using Hierarchical Class Structure to Improve Fine-Grained Claim Classification
diff --git a/data/xml/2021.sustainlp.xml b/data/xml/2021.sustainlp.xml
index 67d8ea1ba1..a0c337486a 100644
--- a/data/xml/2021.sustainlp.xml
+++ b/data/xml/2021.sustainlp.xml
@@ -97,7 +97,6 @@
ROPES
SQuAD
WikiText-103
- WikiText-2
BioCopy: A Plug-And-Play Span Copy Mechanism in Seq2Seq Models
@@ -250,7 +249,6 @@
GLUE
QNLI
WikiText-103
- WikiText-2
Efficient Domain Adaptation of Language Models via Adaptive Tokenization
diff --git a/data/xml/2021.textgraphs.xml b/data/xml/2021.textgraphs.xml
index 8aa46fd4de..d073ad2bcb 100644
--- a/data/xml/2021.textgraphs.xml
+++ b/data/xml/2021.textgraphs.xml
@@ -117,7 +117,6 @@
DBpedia
GenWiki
WikiText-103
- WikiText-2
Selective Attention Based Graph Convolutional Networks for Aspect-Level Sentiment Classification
diff --git a/data/xml/2022.acl.xml b/data/xml/2022.acl.xml
index 8273af5a86..034ed48462 100644
--- a/data/xml/2022.acl.xml
+++ b/data/xml/2022.acl.xml
@@ -66,7 +66,6 @@
yu-etal-2022-rare
10.18653/v1/2022.acl-long.3
WikiText-103
- WikiText-2
AlephBERT: Language Model Pre-training and Evaluation from Sub-Word to Sentence Level
@@ -414,7 +413,6 @@
LAMBADA
SuperGLUE
WikiText-103
- WikiText-2
QuoteR: A Benchmark of Quote Recommendation for Writing
@@ -1565,7 +1563,6 @@
10.18653/v1/2022.acl-long.96
richardbaihe/robustlm
WikiText-103
- WikiText-2
Tackling Fake News Detection by Continually Improving Social Context Representations using Graph Neural Networks
@@ -3146,7 +3143,6 @@
PIQA
RiddleSense
WikiText-103
- WikiText-2
A Good Prompt Is Worth Millions of Parameters: Low-resource Prompt-based Learning for Vision-Language Models
@@ -6070,7 +6066,6 @@ in the Case of Unambiguous Gender
deep-spin/infinite-former
PG-19
WikiText-103
- WikiText-2
Systematic Inequalities in Language Technology Performance across the World’s Languages
@@ -8352,7 +8347,6 @@ in the Case of Unambiguous Gender
RealNews
WMT 2014
WikiText-103
- WikiText-2
The Dangers of Underclaiming: Reasons for Caution When Reporting How NLP Systems Fail
@@ -8985,6 +8979,7 @@ in the Case of Unambiguous Gender
lu-etal-2022-fantastically
10.18653/v1/2022.acl-long.556
+
AG News
MPQA Opinion Corpus
SST
@@ -10150,7 +10145,6 @@ in the Case of Unambiguous Gender
QNLI
WebText
WikiText-103
- WikiText-2
Simple and Effective Knowledge-Driven Query Expansion for QA-Based Product Attribute Extraction
@@ -11494,7 +11488,6 @@ in the Case of Unambiguous Gender
angelova-etal-2022-using
10.18653/v1/2022.acl-srw.21
dfki-signlanguage/gloss-to-text-sign-language-translation
- PHOENIX14T
Flexible Visual Grounding
diff --git a/data/xml/2022.coling.xml b/data/xml/2022.coling.xml
index c6ad791da2..42a975572c 100644
--- a/data/xml/2022.coling.xml
+++ b/data/xml/2022.coling.xml
@@ -3344,6 +3344,7 @@
Social media spreads both real news and fake news in various domains including politics, health, entertainment, etc. It is crucial to automatically detect fake news, especially for news of influential domains like politics and health because they may lead to serious social impact, e.g., panic in the COVID-19 pandemic. Some studies indicate the correlation between domains and perform multi-domain fake news detection. However, these multi-domain methods suffer from a seesaw problem that the performance of some domains is often improved by hurting the performance of other domains, which could lead to an unsatisfying performance in the specific target domains. To address this issue, we propose a Domain- and Instance-level Transfer Framework for Fake News Detection (DITFEND), which could improve the performance of specific target domains. To transfer coarse-grained domain-level knowledge, we train a general model with data of all domains from the meta-learning perspective. To transfer fine-grained instance-level knowledge and adapt the general model to a target domain, a language model is trained on the target domain to evaluate the transferability of each data instance in source domains and re-weight the instance’s contribution. Experiments on two real-world datasets demonstrate the effectiveness of DITFEND. According to both offline and online experiments, the DITFEND shows superior effectiveness for fake news detection.
2022.coling-1.250
nan-etal-2022-improving
+ ICTMCG/DITFEND
Student Surpasses Teacher: Imitation Attack for Black-Box NLP APIs
@@ -4112,8 +4113,11 @@
MarcoAvagnano
3465–3479
Driven by deep learning breakthroughs, natural language generation (NLG) models have been at the center of steady progress in the last few years, with a ubiquitous task influence. However, since our ability to generate human-indistinguishable artificial text lags behind our capacity to assess it, it is paramount to develop and apply even better automatic evaluation metrics. To facilitate researchers to judge the effectiveness of their models broadly, we introduce NLG-Metricverse—an end-to-end open-source library for NLG evaluation based on Python. Our framework provides a living collection of NLG metrics in a unified and easy-to-use environment, supplying tools to efficiently apply, analyze, compare, and visualize them. This includes (i) the extensive support to heterogeneous automatic metrics with n-arity management, (ii) the meta-evaluation upon individual performance, metric-metric and metric-human correlations, (iii) graphical interpretations for helping humans better gain score intuitions, (iv) formal categorization and convenient documentation to accelerate metrics understanding. NLG-Metricverse aims to increase the comparability and replicability of NLG research, hopefully stimulating new contributions in the area.
- 2022.coling-1.306
+ 2022.coling-1.306
frisoni-etal-2022-nlg
+
+ Retracted by the COLING 2022 PC chairs.
+ Retracted by the COLING 2022 PC chairs.
TestAug: A Framework for Augmenting Capability-based NLP Tests
@@ -4153,7 +4157,6 @@
2022.coling-1.309
abonizio-etal-2022-monobyte
lersouza/lang-agnostic
- TyDiQA
XNLI
mC4
@@ -5488,6 +5491,7 @@
Conditional computation algorithms, such as the early exiting (EE) algorithm, can be applied to accelerate the inference of pretrained language models (PLMs) while maintaining competitive performance on resource-constrained devices. However, this approach is only applied to the vertical architecture to decide which layers should be used for inference. Conversely, the operation of the horizontal perspective is ignored, and the determination of which tokens in each layer should participate in the computation fails, leading to a high redundancy for adaptive inference. To address this limitation, a unified horizontal and vertical multi-perspective early exiting (MPEE) framework is proposed in this study to accelerate the inference of transformer-based models. Specifically, the vertical architecture uses recycling EE classifier memory and weighted self-distillation to enhance the performance of the EE classifiers. Then, the horizontal perspective uses recycling class attention memory to emphasize the informative tokens. Conversely, the tokens with less information are truncated by weighted fusion and isolated from the following computation. Based on this, both horizontal and vertical EE are unified to obtain a better tradeoff between performance and efficiency. Extensive experimental results show that MPEE can achieve higher acceleration inference with competent performance than existing competitive methods.
2022.coling-1.414
kong-etal-2022-accelerating
+ junkong5/mpee
GLUE
MRPC
MultiNLI
@@ -6724,7 +6728,7 @@
Research on Automatic Story Generation (ASG) relies heavily on human and automatic evaluation. However, there is no consensus on which human evaluation criteria to use, and no analysis of how well automatic criteria correlate with them. In this paper, we propose to re-evaluate ASG evaluation. We introduce a set of 6 orthogonal and comprehensive human criteria, carefully motivated by the social sciences literature. We also present HANNA, an annotated dataset of 1,056 stories produced by 10 different ASG systems. HANNA allows us to quantitatively evaluate the correlations of 72 automatic metrics with human criteria. Our analysis highlights the weaknesses of current metrics for ASG and allows us to formulate practical recommendations for ASG evaluation.
2022.coling-1.509
chhun-etal-2022-human
- dig-team/hanna-benchmark-asg
+ lashoun/hanna-benchmark-asg
HANNA
WritingPrompts
@@ -6871,6 +6875,7 @@
Recent research on code summarization relies on the structural information from the abstract syntax tree (AST) of source codes. It is, however, questionable whether it is the most effective to use AST for expressing the structural information. We find that a program dependency graph (PDG) can represent the structure of a code more effectively. We propose PDG Boosting Module (PBM) that encodes PDG into graph embedding and the framework to implement the proposed PBM with the existing models. PBM achieves improvements of 6.67% (BLEU) and 7.47% (ROUGE) on average. We then analyze the experimental results, and examine how PBM helps the training of baseline models and its performance robustness. For the validation of robustness, we measure the performance of an out-of-domain benchmark dataset, and confirm its robustness. In addition, we apply a new evaluation measure, SBERT score, to evaluate the semantic performance. The models implemented with PBM improve the performance of SBERT score. This implies that they generate summaries that are semantically more similar to the reference summary.
2022.coling-1.521
son-etal-2022-boosting
+ sjk0825/coling2022
CodeSearchNet
@@ -7324,7 +7329,6 @@
sun-etal-2022-summarize
BookCorpus
WikiText-103
- WikiText-2
WritingPrompts
@@ -7500,7 +7504,6 @@
fadedcosine/pos-guided-neural-text-generation
PARANMT-50M
WikiText-103
- WikiText-2
Enhancing Pre-trained Models with Text Structure Knowledge for Question Generation
diff --git a/data/xml/2022.creativesumm.xml b/data/xml/2022.creativesumm.xml
index 0dd2542e4a..9b44072853 100644
--- a/data/xml/2022.creativesumm.xml
+++ b/data/xml/2022.creativesumm.xml
@@ -24,6 +24,7 @@
Summarizing Interactive Digital Narratives (IDN) presents some unique challenges to existing text summarization models especially around capturing interactive elements in addition to important plot points. In this paper, we describe the first IDN dataset (IDN-Sum) designed specifically for training and testing IDN text summarization algorithms. Our dataset is generated using random playthroughs of 8 IDN episodes, taken from 2 different IDN games, and consists of 10,000 documents. Playthrough documents are annotated through automatic alignment with fan-sourced summaries using a commonly used alignment algorithm. We also report and discuss results from experiments applying common baseline extractive text summarization algorithms to this dataset. Qualitative analysis of the results reveals shortcomings in common annotation approaches and evaluation methods when applied to narrative and interactive narrative datasets. The dataset is released as open source for future researchers to train and test their own approaches for IDN text.
2022.creativesumm-1.1
revi-etal-2022-idn
+ ashwathytr/idn-sum
CNN/Daily Mail
CRD3
diff --git a/data/xml/2022.emnlp.xml b/data/xml/2022.emnlp.xml
index 611d516d91..2416893581 100644
--- a/data/xml/2022.emnlp.xml
+++ b/data/xml/2022.emnlp.xml
@@ -981,7 +981,7 @@
Deconfounding Legal Judgment Prediction for European Court of Human Rights Cases Towards Better Alignment with Experts
- T.y.s.sSantoshTechnical University of Munich
+ SantoshT.y.s.sTechnical University of Munich
ShanshanXuTechnical University of Munich
OanaIchimGraduate Institute of International and Development Studies
MatthiasGrabmairTechnical University of Munich
@@ -3307,6 +3307,7 @@
How to disagree well: Investigating the dispute tactics used on Wikipedia
ChristineDe KockUniversity of Cambridge
+ TomStaffordUniversity of Cambridge
AndreasVlachosUniversity of Cambridge
3824-3837
Disagreements are frequently studied from the perspective of either detecting toxicity or analysing argument structure. We propose a framework of dispute tactics which unifies these two perspectives, as well as other dialogue acts which play a role in resolving disputes, such as asking questions and providing clarification. This framework includes a preferential ordering among rebuttal-type tactics, ranging from ad hominem attacks to refuting the central argument. Using this framework, we annotate 213 disagreements (3,865 utterances) from Wikipedia Talk pages. This allows us to investigate research questions around the tactics used in disagreements; for instance, we provide empirical validation of the approach to disagreement recommended by Wikipedia. We develop models for multilabel prediction of dispute tactics in an utterance, achieving the best performance with a transformer-based label powerset model. Adding an auxiliary task to incorporate the ordering of rebuttal tactics further yields a statistically significant increase. Finally, we show that these annotations can be used to provide useful additional signals to improve performance on the task of predicting escalation.
diff --git a/data/xml/2022.findings.xml b/data/xml/2022.findings.xml
index 4a8281c64e..f5e7dd16e0 100644
--- a/data/xml/2022.findings.xml
+++ b/data/xml/2022.findings.xml
@@ -911,7 +911,6 @@
kushalarora/quantifying_exposure_bias
WikiText-103
- WikiText-2
Question Answering Infused Pre-training of General-Purpose Contextualized Representations
@@ -3584,7 +3583,6 @@
10.18653/v1/2022.findings-acl.228
merterm/modeling-intensification-for-slg
- PHOENIX14T
Controllable Natural Language Generation with Contrastive Prefixes
@@ -4361,10 +4359,12 @@
AshutoshModi
3521-3536
Many populous countries including India are burdened with a considerable backlog of legal cases. Development of automated systems that could process legal documents and augment legal practitioners can mitigate this. However, there is a dearth of high-quality corpora that are needed to develop such data-driven systems. The problem gets even more pronounced in the case of low resource languages such as Hindi. In this resource paper, we introduce the Hindi Legal Documents Corpus (HLDC), a corpus of more than 900K legal documents in Hindi. Documents are cleaned and structured to enable the development of downstream applications. Further, as a use-case for the corpus, we introduce the task of bail prediction. We experiment with a battery of models and propose a Multi-Task Learning (MTL) based model for the same. MTL models use summarization as an auxiliary task along with bail prediction as the main task. Experiments with different models are indicative of the need for further research in this area.
- 2022.findings-acl.278
+ 2022.findings-acl.278
2022.findings-acl.278.software.zip
kapoor-etal-2022-hldc
10.18653/v1/2022.findings-acl.278
+
+ This revision updates funding information in the Acknowledgements section of the paper.
exploration-lab/hldc
@@ -4661,7 +4661,6 @@
2022.findings-acl.297
jin-etal-2022-prior
10.18653/v1/2022.findings-acl.297
- PHOENIX14T
RWTH-PHOENIX-Weather 2014 T
@@ -6592,7 +6591,6 @@
SST
SST-2
WikiText-103
- WikiText-2
Towards Computationally Feasible Deep Active Learning
@@ -7635,7 +7633,6 @@
10.18653/v1/2022.findings-naacl.151
WikiText-103
- WikiText-2
What kinds of errors do reference resolution models make and what can we learn from them?
@@ -11177,6 +11174,11 @@ Faster and Smaller Speech Translation without Quality Compromise
2022.findings-emnlp.149.software.zip
li-etal-2022-self
10.18653/v1/2022.findings-emnlp.149
+ pldlgb/MetaSD
+ FB15k
+ FB15k-237
+ WN18
+ WN18RR
CQR-SQL: Conversational Question Reformulation Enhanced Context-Dependent Text-to-SQL Parsers
diff --git a/data/xml/2022.fl4nlp.xml b/data/xml/2022.fl4nlp.xml
index 876b60eaff..e616ba926e 100644
--- a/data/xml/2022.fl4nlp.xml
+++ b/data/xml/2022.fl4nlp.xml
@@ -63,7 +63,6 @@
wu-etal-2022-adaptive
10.18653/v1/2022.fl4nlp-1.3
WikiText-103
- WikiText-2
Intrinsic Gradient Compression for Scalable and Efficient Federated Learning
diff --git a/data/xml/2022.in2writing.xml b/data/xml/2022.in2writing.xml
index 38d64c797f..544ef1c559 100644
--- a/data/xml/2022.in2writing.xml
+++ b/data/xml/2022.in2writing.xml
@@ -88,7 +88,6 @@
webis-de/in2writing22-language-models-as-context-sensitive-word-search-engines
CLOTH
WikiText-103
- WikiText-2
Plug-and-Play Controller for Story Completion: A Pilot Study toward Emotion-aware Story Writing Assistance
diff --git a/data/xml/2022.iwslt.xml b/data/xml/2022.iwslt.xml
index 136e4f4886..f3661e5e48 100644
--- a/data/xml/2022.iwslt.xml
+++ b/data/xml/2022.iwslt.xml
@@ -478,7 +478,7 @@
PatrickFernandes
SiddharthDalmia
JiatongShi
- YifanPeng
+ YifanPeng
DanBerrebbi
XinyiWang
GrahamNeubig
diff --git a/data/xml/2022.lrec.xml b/data/xml/2022.lrec.xml
index 940e63b8de..2122dd1ee5 100644
--- a/data/xml/2022.lrec.xml
+++ b/data/xml/2022.lrec.xml
@@ -4887,6 +4887,7 @@
This paper presents ClinIDMap, a tool for mapping identifiers between clinical ontologies and lexical resources. ClinIDMap interlinks identifiers from UMLS, SNOMED-CT, ICD-10 and the corresponding Wikipedia articles for concepts from the UMLS Metathesaurus. Our main goal is to provide semantic interoperability across the clinical concepts from various knowledge bases. As a side effect, the mapping enriches already annotated corpora in multiple languages with new labels. For instance, spans manually annotated with IDs from UMLS can be annotated with Semantic Types and Groups, and its corresponding SNOMED CT and ICD-10 IDs. We also experiment with sequence labelling models for detecting Diagnosis and Procedures concepts and for detecting UMLS Semantic Groups trained on Spanish, English, and bilingual corpora obtained with the new mapping procedure. The ClinIDMap tool is publicly available.
2022.lrec-1.390
zotova-etal-2022-clinidmap
+ vicomtech/clinidmap
MedMentions
@@ -4968,6 +4969,7 @@
In the field of Japanese medical information extraction, few analyzing tools are available and relation extraction is still an under-explored topic. In this paper, we first propose a novel relation annotation schema for investigating the medical and temporal relations between medical entities in Japanese medical reports. We experiment with the practical annotation scenarios by separately annotating two different types of reports. We design a pipeline system with three components for recognizing medical entities, classifying entity modalities, and extracting relations. The empirical results show accurate analyzing performance and suggest the satisfactory annotation quality, the superiority of the latest contextual embedding models, and the feasible annotation strategy for high-accuracy demand.
2022.lrec-1.397
cheng-etal-2022-jamie
+ racerandom/jamie
Enhanced Entity Annotations for Multilingual Corpora
@@ -5345,6 +5347,7 @@
Olfactory references play a crucial role in our memory and, more generally, in our experiences, since researchers have shown that smell is the sense that is most directly connected with emotions. Nevertheless, only few works in NLP have tried to capture this sensory dimension from a computational perspective. One of the main challenges is the lack of a systematic and consistent taxonomy of olfactory information, where concepts are organised also in a multi-lingual perspective. WordNet represents a valuable starting point in this direction, which can be semi-automatically extended taking advantage of Google n-grams and of existing language models. In this work we describe the process that has led to the semi-automatic development of a taxonomy for olfactory information in four languages (English, French, German and Italian), detailing the different steps and the intermediate evaluations. Along with being multi-lingual, the taxonomy also encloses temporal marks for olfactory terms thus making it a valuable resource for historical content analysis. The resource has been released and is freely available.
2022.lrec-1.429
menini-etal-2022-building
+ odeuropa/multilingualtaxonomies
Attention Understands Semantic Relations
@@ -6138,7 +6141,6 @@
jmeadows17/physnlu
PhysNLU
WikiText-103
- WikiText-2
HECTOR: A Hybrid TExt SimplifiCation TOol for Raw Texts in French
diff --git a/data/xml/2022.naacl.xml b/data/xml/2022.naacl.xml
index 9215cabda3..629dd27a77 100644
--- a/data/xml/2022.naacl.xml
+++ b/data/xml/2022.naacl.xml
@@ -1341,7 +1341,6 @@
rycolab/probing-via-prompting
WikiText-103
- WikiText-2
Database Search Results Disambiguation for Task-Oriented Dialog Systems
@@ -1571,7 +1570,6 @@
ahuja-etal-2022-economics
10.18653/v1/2022.naacl-main.98
- TyDiQA
TyDiQA-GoldP
@@ -5250,7 +5248,6 @@
GLUE
SQuAD
WikiText-103
- WikiText-2
FNet: Mixing Tokens with Fourier Transforms
@@ -6872,7 +6869,6 @@
thu-coai/lamemo
WikiText-103
- WikiText-2
Exploiting Inductive Bias in Transformers for Unsupervised Disentanglement of Syntax and Semantics with VAEs
@@ -6965,7 +6961,6 @@
llyx97/TAMT
GLUE
WikiText-103
- WikiText-2
You Don’t Know My Favorite Color: Preventing Dialogue Representations from Revealing Speakers’ Private Personas
diff --git a/data/xml/2022.osact.xml b/data/xml/2022.osact.xml
index bc75243449..7a0e3e75f9 100644
--- a/data/xml/2022.osact.xml
+++ b/data/xml/2022.osact.xml
@@ -143,7 +143,6 @@
aftab-malik-2022-erock
ARCD
SQuAD
- TyDiQA
TyDiQA-GoldP
diff --git a/data/xml/2022.repl4nlp.xml b/data/xml/2022.repl4nlp.xml
index 725e9095f1..9b27c0131c 100644
--- a/data/xml/2022.repl4nlp.xml
+++ b/data/xml/2022.repl4nlp.xml
@@ -147,7 +147,6 @@
IMDb Movie Reviews
MultiNLI
WikiText-103
- WikiText-2
A Vocabulary-Free Multilingual Neural Tokenizer for End-to-End Task Learning
diff --git a/data/xml/2022.sltat.xml b/data/xml/2022.sltat.xml
index 897b6a41db..155e2c41de 100644
--- a/data/xml/2022.sltat.xml
+++ b/data/xml/2022.sltat.xml
@@ -180,7 +180,6 @@
Recent approaches to Sign Language Production (SLP) have adopted spoken language Neural Machine Translation (NMT) architectures, applied without sign-specific modifications. In addition, these works represent sign language as a sequence of skeleton pose vectors, projected to an abstract representation with no inherent skeletal structure. In this paper, we represent sign language sequences as a skeletal graph structure, with joints as nodes and both spatial and temporal connections as edges. To operate on this graphical structure, we propose Skeletal Graph Self-Attention (SGSA), a novel graphical attention layer that embeds a skeleton inductive bias into the SLP model. Retaining the skeletal feature representation throughout, we directly apply a spatio-temporal adjacency matrix into the self-attention formulation. This provides structure and context to each skeletal joint that is not possible when using a non-graphical abstract representation, enabling fluid and expressive sign language production. We evaluate our Skeletal Graph Self-Attention architecture on the challenging RWTH-PHOENIX-Weather-2014T (PHOENIX14T) dataset, achieving state-of-the-art back translation performance with an 8% and 7% improvement over competing methods for the dev and test sets.
2022.sltat-1.15
saunders-etal-2022-skeletal
- PHOENIX14T
RWTH-PHOENIX-Weather 2014 T
diff --git a/data/xml/2022.spnlp.xml b/data/xml/2022.spnlp.xml
index 36b7040f67..d7ccd5e27b 100644
--- a/data/xml/2022.spnlp.xml
+++ b/data/xml/2022.spnlp.xml
@@ -112,7 +112,6 @@
treviso-etal-2022-predicting
10.18653/v1/2022.spnlp-1.7
WikiText-103
- WikiText-2
diff --git a/data/xml/2023.acl.xml b/data/xml/2023.acl.xml
index a6f8a59ae4..333c094645 100644
--- a/data/xml/2023.acl.xml
+++ b/data/xml/2023.acl.xml
@@ -6061,7 +6061,7 @@
Answering Ambiguous Questions via Iterative Prompting
- WeiweiSunShandong University
+ WeiweiSunShandong University
HengyiCaiJD.com
HongshenChenJD.com
PengjieRenShandong University
@@ -10357,7 +10357,7 @@
RADE: Reference-Assisted Dialogue Evaluation for Open-Domain Dialogue
ZhengliangShiShandong University
- WeiweiSunShandong University
+ WeiweiSunShandong University
ShuoZhangBloomberg
ZhenZhangShandong University
PengjieRenSchool of Computer Science and Technology, Shandong University
@@ -12556,7 +12556,7 @@
Estimating the Uncertainty in Emotion Attributes using Deep Evidential Regression
WenWuUniversity of Cambridge
- ChaoZhangTsinghua University
+ ChaoZhangTsinghua University
PhilipWoodlandUniversity of Cambridge
15681-15695
In automatic emotion recognition (AER), labels assigned by different human annotators to the same utterance are often inconsistent due to the inherent complexity of emotion and the subjectivity of perception. Though deterministic labels generated by averaging or voting are often used as the ground truth, it ignores the intrinsic uncertainty revealed by the inconsistent labels. This paper proposes a Bayesian approach, deep evidential emotion regression (DEER), to estimate the uncertainty in emotion attributes. Treating the emotion attribute labels of an utterance as samples drawn from an unknown Gaussian distribution, DEER places an utterance-specific normal-inverse gamma prior over the Gaussian likelihood and predicts its hyper-parameters using a deep neural network model. It enables a joint estimation of emotion attributes along with the aleatoric and epistemic uncertainties. AER experiments on the widely used MSP-Podcast and IEMOCAP datasets showed DEER produced state-of-the-art results for both the mean values and the distribution of emotion attributes.
@@ -14871,7 +14871,7 @@
MOSPC: MOS Prediction Based on Pairwise Comparison
- KexinWangBytedance
+ KexinWangBytedance
YunlongZhaoInstitute of Automation, Chinese Academy of Sciences
QianqianDongByteDance AI Lab
TomKoByteDance AI Lab
@@ -15923,7 +15923,7 @@
JiatongShiCarnegie Mellon University
YunTangFacebook
HirofumiInagumaMeta AI
- YifanPengCarnegie Mellon University
+ YifanPengCarnegie Mellon University
SiddharthDalmiaGoogle
PeterPolákCharles University, MFF UFAL
PatrickFernandesCarnegie Mellon University, Instituto de Telecomunicações
diff --git a/data/xml/2023.calcs.xml b/data/xml/2023.calcs.xml
index 9b8c3cbc69..e3ad0a6e47 100644
--- a/data/xml/2023.calcs.xml
+++ b/data/xml/2023.calcs.xml
@@ -15,12 +15,11 @@
Singapore
December
2023
- 2023.calcs-1
+ 2023.calcs-1
calcs
- ws
- 2023.calcs-1.0
+ 2023.calcs-1.0
calcs-2023-approaches
@@ -29,9 +28,8 @@
SimoneTeufel
1-13
This paper contributes to German-English code-switching research. We provide the largest corpus of naturally occurring German-English code-switching, where English is included in German text, and two methods for code-switching identification. The first method is rule-based, using wordlists and morphological processing. We use this method to compile a corpus of 25.6M tweets employing German-English code-switching. In our second method, we continue pretraining of a neural language model on this corpus and classify tokens based on embeddings from this language model. Our systems establish SoTA on our new corpus and an existing German-English code-switching benchmark. In particular, we systematically study code-switching for language-ambiguous words which can only be resolved in context, and morphologically mixed words consisting of both English and German morphemes. We distribute both corpora and systems to the research community.
- 2023.calcs-1.1
+ 2023.calcs-1.1
sterner-teufel-2023-tongueswitcher
- 10.18653/v1/2023.calcs-1.1
Towards Real-World Streaming Speech Translation for Code-Switched Speech
@@ -43,9 +41,8 @@
AashishAgarwalUniversität Duisburg-Essen
14-22
Code-switching (CS), i.e. mixing different languages in a single sentence, is a common phenomenon in communication and can be challenging in many Natural Language Processing (NLP) settings. Previous studies on CS speech have shown promising results for end-to-end speech translation (ST), but have been limited to offline scenarios and to translation to one of the languages present in the source (monolingual transcription). In this paper, we focus on two essential yet unexplored areas for real-world CS speech translation: streaming settings, and translation to a third language (i.e., a language not included in the source). To this end, we extend the Fisher and Miami test and validation datasets to include new targets in Spanish and German. Using this data, we train a model for both offline and streaming ST and we establish baseline results for the two settings mentioned earlier.
- 2023.calcs-1.2
+ 2023.calcs-1.2
alastruey-etal-2023-towards
- 10.18653/v1/2023.calcs-1.2
Language Preference for Expression of Sentiment for Nepali-English Bilingual Speakers on Social Media
@@ -53,9 +50,8 @@
KazutakaShimada
23-32
Nepali-English code-switching (CS) has been a growing phenomenon in Nepalese society, especially in social media. The code-switching text can be leveraged to understand the socio-linguistic behaviours of the multilingual speakers. Existing studies have attempted to identify the language preference of the multilingual speakers for expressing different emotions using text in different language pairs. In this work, we aim to study the language preference of multilingual Nepali-English CS speakers while expressing sentiment in social media. We create a novel dataset for sentiment analysis using the public Nepali-English code-switched comments in YouTube. After performing the statistical study on the dataset, we find that the proportion of use of Nepali language is higher in negative comments when compared with positive comments, hence concluding the preference for using native language while expressing negative sentiment. Machine learning and transformer-based models are used as the baseline models for the dataset for sentiment classification. The dataset is released publicly.
- 2023.calcs-1.3
+ 2023.calcs-1.3
pahari-shimada-2023-language
- 10.18653/v1/2023.calcs-1.3
Text-Derived Language Identity Incorporation for End-to-End Code-Switching Speech Recognition
@@ -63,9 +59,8 @@
HaizhouLi
33-42
Recognizing code-switching (CS) speech often presents challenges for an automatic speech recognition system (ASR) due to limited linguistic context in short monolingual segments, resulting in language confusion. To mitigate this issue, language identity (LID) is often integrated into the speech recognition system to provide additional linguistic context. However, previous works predominantly focus on extracting language identity from speech signals. We introduce a novel approach to learn language identity from pure text data via a dedicated language identity-language model. Besides, we explore two strategies: LID state fusion and language posterior biasing, to integrate the text-derived language identities into the end-to-end ASR system. By incorporating hypothesized language identities, our ASR system gains crucial contextual cues, effectively capturing language transitions and patterns within code-switched utterances. We conduct speech recognition experiments on the SEAME corpus and demonstrate the effectiveness of our proposed methods. Our results reveal significantly improved transcriptions in code-switching scenarios, underscoring the potential of text-derived LID in enhancing code-switching speech recognition.
- 2023.calcs-1.4
+ 2023.calcs-1.4
wang-li-2023-text
- 10.18653/v1/2023.calcs-1.4
Prompting Multilingual Large Language Models to Generate Code-Mixed Texts: The Case of South East Asian Languages
@@ -84,12 +79,11 @@
LongPhan
RowenaGarcia
ThamarSolorio
- AlhamAji
+ Alham FikriAji
43-63
- While code-mixing is a common linguistic practice in many parts of the world, collecting high-quality and low-cost code-mixed data remains a challenge for natural language processing (NLP) research. The recent proliferation of Large Language Models (LLMs) compels one to ask: how capable are these systems in generating code-mixed data? In this paper, we explore prompting multilingual LLMs in a zero-shot manner to generate code-mixed data for seven languages in South East Asia (SEA), namely Indonesian, Malay, Chinese, Tagalog, Vietnamese, Tamil, and Singlish. We find that publicly available multilingual instruction-tuned models such as BLOOMZ and Flan-T5-XXL are incapable of producing texts with phrases or clauses from different languages. ChatGPT exhibits inconsistent capabilities in generating code-mixed texts, wherein its performance varies depending on the prompt template and language pairing. For instance, ChatGPT generates fluent and natural Singlish texts (an English-based creole spoken in Singapore), but for English-Tamil language pair, the system mostly produces grammatically incorrect or semantically meaningless utterances. Furthermore, it may erroneously introduce languages not specified in the prompt. Based on our investigation, existing multilingual LLMs exhibit a wide range of proficiency in code-mixed data generation for SEA languages. As such, we advise against using LLMs in this context without extensive human checks.
- 2023.calcs-1.5
+ While code-mixing is a common linguistic practice in many parts of the world, collecting high-quality and low-cost code-mixed data remains a challenge for natural language processing (NLP) research. The recent proliferation of Large Language Models (LLMs) compels one to ask: how capable are these systems in generating code-mixed data? In this paper, we explore prompting multilingual LLMs in a zero-shot manner to generate code-mixed data for seven languages in South East Asia (SEA), namely Indonesian, Malay, Chinese, Tagalog, Vietnamese, Tamil, and Singlish. We find that publicly available multilingual instruction-tuned models such as BLOOMZ and Flan-T5-XXL are incapable of producing texts with phrases or clauses from different languages. ChatGPT exhibits inconsistent capabilities in generating code-mixed texts, wherein its performance varies depending on the prompt template and language pairing. For instance, ChatGPT generates fluent and natural Singlish texts (an English-based creole spoken in Singapore), but for English-Tamil language pair, the system mostly produces grammatically incorrect or semantically meaningless utterances. Furthermore, it may erroneously introduce languages not specified in the prompt. Based on our investigation, existing multilingual LLMs exhibit a wide range of proficiency in code-mixed data generation for SEA languages. As such, we advise against using LLMs in this context without extensive human checks.
+ 2023.calcs-1.5
yong-etal-2023-prompting
- 10.18653/v1/2023.calcs-1.5
CONFLATOR: Incorporating Switching Point based Rotatory Positional Encodings for Code-Mixed Language Modeling
@@ -102,12 +96,9 @@
AmanChadha
AmitavaDas
64-73
- The mixing of two or more languages is called Code-Mixing (CM). CM is a social norm in multilingual societies. Neural Language Models (NLMs) like transformers have been effective on many NLP tasks. However, NLM for CM is an under-explored area. Though transformers are capable and powerful, they cannot always encode positional information since they are non-recurrent. Therefore, to enrich word information and incorporate positional information, positional encoding is defined. We hypothesize that Switching Points (SPs), i.e., junctions in the text where the language switches (L1 -> L2 or L2 -> L1), pose a challenge for CM Language Models (LMs), and hence give special emphasis to SPs in the modeling process. We experiment with several positional encoding mechanisms and show that rotatory positional encodings along with switching point information yield the best results.
-
-We introduce CONFLATOR: a neural language modeling approach for code-mixed languages. CONFLATOR tries to learn to emphasize switching points using smarter positional encoding, both at unigram and bigram levels. CONFLATOR outperforms the state-of-the-art on two tasks based on code-mixed Hindi and English (Hinglish): (i) sentiment analysis and (ii) machine translation.
- 2023.calcs-1.6
+ The mixing of two or more languages is called Code-Mixing (CM). CM is a social norm in multilingual societies. Neural Language Models (NLMs) like transformers have been effective on many NLP tasks. However, NLM for CM is an under-explored area. Though transformers are capable and powerful, they cannot always encode positional information since they are non-recurrent. Therefore, to enrich word information and incorporate positional information, positional encoding is defined. We hypothesize that Switching Points (SPs), i.e., junctions in the text where the language switches (L1 -> L2 or L2 -> L1), pose a challenge for CM Language Models (LMs), and hence give special emphasis to SPs in the modeling process. We experiment with several positional encoding mechanisms and show that rotatory positional encodings along with switching point information yield the best results. We introduce CONFLATOR: a neural language modeling approach for code-mixed languages. CONFLATOR tries to learn to emphasize switching points using smarter positional encoding, both at unigram and bigram levels. CONFLATOR outperforms the state-of-the-art on two tasks based on code-mixed Hindi and English (Hinglish): (i) sentiment analysis and (ii) machine translation.
+ 2023.calcs-1.6
mohammed-etal-2023-conflator
- 10.18653/v1/2023.calcs-1.6
Unified Model for Code-Switching Speech Recognition and Language Identification Based on Concatenated Tokenizer
@@ -116,9 +107,8 @@ We introduce CONFLATOR: a neural language modeling approach for code-mixed langu
BorisGinsburg
74-82
Code-Switching (CS) multilingual Automatic Speech Recognition (ASR) models can transcribe speech containing two or more alternating languages during a conversation. This paper proposes (1) a new method for creating code-switching ASR datasets from purely monolingual data sources, and (2) a novel Concatenated Tokenizer that enables ASR models to generate language ID for each emitted text token while reusing existing monolingual tokenizers. The efficacy of these approaches for building CS ASR models is demonstrated for two language pairs, English-Hindi and English-Spanish, where we achieve new state-of-the-art results on the Miami Bangor CS evaluation corpus. In addition to competitive ASR performance, the proposed Concatenated Tokenizer models are highly effective for spoken language identification, achieving 98%+ accuracy on the out-of-distribution FLEURS dataset.
- 2023.calcs-1.7
+ 2023.calcs-1.7
dhawan-etal-2023-unified
- 10.18653/v1/2023.calcs-1.7
Multilingual self-supervised speech representations improve the speech recognition of low-resource African languages with codeswitching
@@ -127,9 +117,8 @@ We introduce CONFLATOR: a neural language modeling approach for code-mixed langu
DanJurafsky
83-88
While many speakers of low-resource languages regularly code-switch between their languages and other regional languages or English, datasets of codeswitched speech are too small to train bespoke acoustic models from scratch or do language model rescoring. Here we propose finetuning self-supervised speech representations such as wav2vec 2.0 XLSR to recognize code-switched data. We find that finetuning self-supervised multilingual representations and augmenting them with n-gram language models trained from transcripts reduces absolute word error rates by up to 20% compared to baselines of hybrid models trained from scratch on code-switched data. Our findings suggest that in circumstances with limited training data finetuning self-supervised representations is a better performing and viable solution.
- 2023.calcs-1.8
+ 2023.calcs-1.8
ogunremi-etal-2023-multilingual
- 10.18653/v1/2023.calcs-1.8
diff --git a/data/xml/2023.ccl.xml b/data/xml/2023.ccl.xml
index 2b6b5a16f0..7ae56b94d6 100644
--- a/data/xml/2023.ccl.xml
+++ b/data/xml/2023.ccl.xml
@@ -1626,7 +1626,7 @@
Studying Language Processing in the Human Brain with Speech and Language Models
- ZhangChao
+ ZhangChao
ThwaitesAndrew
WingfieldCai
17–23
diff --git a/data/xml/2023.cl.xml b/data/xml/2023.cl.xml
index 198b403465..820954d873 100644
--- a/data/xml/2023.cl.xml
+++ b/data/xml/2023.cl.xml
@@ -303,4 +303,85 @@
tait-etal-2023-obituary
+
+
+ Computational Linguistics, Volume 49, Issue 4 - December 2023
+ MIT Press
+ Cambridge, MA
+ December
+ 2023
+ cl
+ 49
+ 4
+
+
+ My Tenure as the Editor-in-Chief of Computational Linguistics
+ Hwee TouNg
+ 10.1162/coli_e_00505
+ Time flies and it has been close to five and a half years since I became the editor-in-chief of Computational Linguistics on 15 July 2018. In this editorial, I will describe the changes that I have introduced at the journal, and highlight the achievements and challenges of the journal.
+ 773–775
+ 2023.cl-4.1
+ ng-2023-tenure
+
+
+ Measuring Attribution in Natural Language Generation Models
+ HannahRashkin
+ VitalyNikolaev
+ MatthewLamm
+ LoraAroyo
+ MichaelCollins
+ DipanjanDas
+ SlavPetrov
+ Gaurav SinghTomar
+ IuliaTurc
+ DavidReitter
+ 10.1162/coli_a_00486
+ Large neural models have brought a new challenge to natural language generation (NLG): It has become imperative to ensure the safety and reliability of the output of models that generate freely. To this end, we present an evaluation framework, Attributable to Identified Sources (AIS), stipulating that NLG output pertaining to the external world is to be verified against an independent, provided source. We define AIS and a two-stage annotation pipeline for allowing annotators to evaluate model output according to annotation guidelines. We successfully validate this approach on generation datasets spanning three tasks (two conversational QA datasets, a summarization dataset, and a table-to-text dataset). We provide full annotation guidelines in the appendices and publicly release the annotated data at https://github.com/google-research-datasets/AIS.
+ 777–840
+ 2023.cl-4.2
+ rashkin-etal-2023-measuring
+
+
+ Generation and Polynomial Parsing of Graph Languages with Non-Structural Reentrancies
+ JohannaBjörklund
+ FrankDrewes
+ AnnaJonsson
+ 10.1162/coli_a_00488
+ Graph-based semantic representations are popular in natural language processing, where it is often convenient to model linguistic concepts as nodes and relations as edges between them. Several attempts have been made to find a generative device that is sufficiently powerful to describe languages of semantic graphs, while at the same time allowing efficient parsing. We contribute to this line of work by introducing graph extension grammar, a variant of the contextual hyperedge replacement grammars proposed by Hoffmann et al. Contextual hyperedge replacement can generate graphs with non-structural reentrancies, a type of node-sharing that is very common in formalisms such as abstract meaning representation, but that context-free types of graph grammars cannot model. To provide our formalism with a way to place reentrancies in a linguistically meaningful way, we endow rules with logical formulas in counting monadic second-order logic. We then present a parsing algorithm and show as our main result that this algorithm runs in polynomial time on graph languages generated by a subclass of our grammars, the so-called local graph extension grammars.
+ 841–882
+ 2023.cl-4.3
+ bjorklund-etal-2023-generation
+
+
+ Capturing Fine-Grained Regional Differences in Language Use through Voting Precinct Embeddings
+ AlexRosenfeld
+ LarsHinrichs
+ 10.1162/coli_a_00487
+ Linguistic variation across a region of interest can be captured by partitioning the region into areas and using social media data to train embeddings that represent language use in those areas. Recent work has focused on larger areas, such as cities or counties, to ensure that enough social media data is available in each area, but larger areas have a limited ability to find fine-grained distinctions, such as intracity differences in language use. We demonstrate that it is possible to embed smaller areas, which can provide higher resolution analyses of language variation. We embed voting precincts, which are tiny, evenly sized political divisions for the administration of elections. The issue with modeling language use in small areas is that the data becomes incredibly sparse, with many areas having scant social media data. We propose a novel embedding approach that alternates training with smoothing, which mitigates these sparsity issues. We focus on linguistic variation across Texas as it is relatively understudied. We develop two novel quantitative evaluations that measure how well the embeddings can be used to capture linguistic variation. The first evaluation measures how well a model can map a dialect given terms specific to that dialect. The second evaluation measures how well a model can map preference of lexical variants. These evaluations show how embedding models could be used directly by sociolinguists and measure how much sociolinguistic information is contained within the embeddings. We complement this second evaluation with a methodology for using embeddings as a kind of genetic code where we identify “genes” that correspond to a sociological variable and connect those “genes” to a linguistic phenomenon thereby connecting sociological phenomena to linguistic ones. Finally, we explore approaches for inferring isoglosses using embeddings.
+ 883–942
+ 2023.cl-4.4
+ rosenfeld-hinrichs-2023-capturing
+
+
+ Languages Through the Looking Glass of BPE Compression
+ XimenaGutierrez-Vasques
+ ChristianBentz
+ TanjaSamardžić
+ 10.1162/coli_a_00489
+ Byte-pair encoding (BPE) is widely used in NLP for performing subword tokenization. It uncovers redundant patterns for compressing the data, and hence alleviates the sparsity problem in downstream applications. Subwords discovered during the first merge operations tend to have the most substantial impact on the compression of texts. However, the structural underpinnings of this effect have not been analyzed cross-linguistically. We conduct in-depth analyses across 47 typologically diverse languages and three parallel corpora, and thereby show that the types of recurrent patterns that have the strongest impact on compression are an indicator of morphological typology. For languages with richer inflectional morphology there is a preference for highly productive subwords on the early merges, while for languages with less inflectional morphology, idiosyncratic subwords are more prominent. Both types of patterns contribute to efficient compression. Counter to the common perception that BPE subwords are not linguistically relevant, we find patterns across languages that resemble those described in traditional typology. We thus propose a novel way to characterize languages according to their BPE subword properties, inspired by the notion of morphological productivity in linguistics. This allows us to have language vectors that encode typological knowledge induced from raw text. Our approach is easily applicable to a wider range of languages and texts, as it does not require annotated data or any external linguistic knowledge. We discuss its potential contributions to quantitative typology and multilingual NLP.
+ 943–1001
+ 2023.cl-4.5
+ gutierrez-vasques-etal-2023-languages
+
+
+ Language Embeddings Sometimes Contain Typological Generalizations
+ RobertÖstling
+ MurathanKurfalı
+ 10.1162/coli_a_00491
+ To what extent can neural network models learn generalizations about language structure, and how do we find out what they have learned? We explore these questions by training neural models for a range of natural language processing tasks on a massively multilingual dataset of Bible translations in 1,295 languages. The learned language representations are then compared to existing typological databases as well as to a novel set of quantitative syntactic and morphological features obtained through annotation projection. We conclude that some generalizations are surprisingly close to traditional features from linguistic typology, but that most of our models, as well as those of previous work, do not appear to have made linguistically meaningful generalizations. Careful attention to details in the evaluation turns out to be essential to avoid false positives. Furthermore, to encourage continued work in this field, we release several resources covering most or all of the languages in our data: (1) multiple sets of language representations, (2) multilingual word embeddings, (3) projected and predicted syntactic and morphological features, (4) software to provide linguistically sound evaluations of language representations.
+ 1003–1051
+ 2023.cl-4.6
+ ostling-kurfali-2023-language
+
+
diff --git a/data/xml/2023.emnlp.xml b/data/xml/2023.emnlp.xml
index 0a8f9eaa89..76bc5bf299 100644
--- a/data/xml/2023.emnlp.xml
+++ b/data/xml/2023.emnlp.xml
@@ -603,7 +603,7 @@
LLM-powered Data Augmentation for Enhanced Cross-lingual Performance
ChenxiWhitehouse
MonojitChoudhury
- AlhamAji
+ Alham FikriAji
671-686
This paper explores the potential of leveraging Large Language Models (LLMs) for data augmentation in multilingual commonsense reasoning datasets where the available training data is extremely limited. To achieve this, we utilise several LLMs, namely Dolly-v2, StableVicuna, ChatGPT, and GPT-4, to augment three datasets: XCOPA, XWinograd, and XStoryCloze. Subsequently, we evaluate the effectiveness of fine-tuning smaller multilingual models, mBERT and XLMR, using the synthesised data. We compare the performance of training with data generated in English and target languages, as well as translated English-generated data, revealing the overall advantages of incorporating data generated by LLMs, e.g. a notable 13.4 accuracy score improvement for the best case. Furthermore, we conduct a human evaluation by asking native speakers to assess the naturalness and logical coherence of the generated examples across different languages. The results of the evaluation indicate that LLMs such as ChatGPT and GPT-4 excel at producing natural and coherent text in most languages, however, they struggle to generate meaningful text in certain languages like Tamil. We also observe that ChatGPT falls short in generating plausible alternatives compared to the original dataset, whereas examples from GPT-4 exhibit competitive logical consistency.
2023.emnlp-main.44
@@ -3940,7 +3940,7 @@
Towards Building More Robust NER datasets: An Empirical Study on NER Dataset Bias from a Dataset Difficulty View
RuotianMa
- XiaoleiWang
+ XiaoleiWang
XinZhou
QiZhang
XuanjingHuang
@@ -7032,10 +7032,12 @@
RyanCotterell
8069-8086
Studying language models (LMs) in terms of well-understood formalisms allows us to precisely characterize their abilities and limitations. Previous work has investigated the expressive power of recurrent neural network (RNN) LMs in terms of their capacity to recognize unweighted formal languages. However, LMs do not describe unweighted formal languages—rather, they define probability distributions over strings. In this work, we study what classes of such probability distributions RNN LMs can represent, which allows us to make more direct statements about their capabilities. We show that simple RNNs are equivalent to a subclass of probabilistic finite-state automata, and can thus model a strict subset of probability distributions expressible by finite-state models. Furthermore, we study the space complexity of representing finite-state LMs with RNNs. We show that, to represent an arbitrary deterministic finite-state LM with N states over an alphabet \Sigma, an RNN requires \Omega\left(N |\Sigma|\right) neurons. These results present a first step towards characterizing the classes of distributions RNN LMs can represent and thus help us understand their capabilities and limitations.
- 2023.emnlp-main.502
+ 2023.emnlp-main.502
svete-cotterell-2023-recurrent
10.18653/v1/2023.emnlp-main.502
+
+ Fixes Equation 3 and Figure 4.
Revisiting Source Context in Nearest Neighbor Machine Translation
@@ -8681,7 +8683,7 @@
Rethinking the Evaluation for Conversational Recommendation in the Era of Large Language Models
- XiaoleiWang
+ XiaoleiWang
XinyuTang
XinZhao
JingyuanWang
@@ -10792,7 +10794,7 @@
SamuelCahyawijaya
Jan Christian BlaiseCruz
GentaWinata
- AlhamAji
+ Alham FikriAji
12567-12582
Multilingual Large Language Models (LLMs) have recently shown great capabilities in a wide range of tasks, exhibiting state-of-the-art performance through zero-shot or few-shot prompting methods. While there have been extensive studies on their abilities in monolingual tasks, the investigation of their potential in the context of code-switching (CSW), the practice of alternating languages within an utterance, remains relatively uncharted. In this paper, we provide a comprehensive empirical analysis of various multilingual LLMs, benchmarking their performance across four tasks: sentiment analysis, machine translation, summarization and word-level language identification. Our results indicate that despite multilingual LLMs exhibiting promising outcomes in certain tasks using zero or few-shot prompting, they still underperform in comparison to fine-tuned models of much smaller scales. We argue that current “multilingualism” in LLMs does not inherently imply proficiency with code-switching texts, calling for future research to bridge this discrepancy.
2023.emnlp-main.774
@@ -12199,7 +12201,7 @@
FahimFaisal
AlissaOstapenko
GentaWinata
- AlhamAji
+ Alham FikriAji
SamuelCahyawijaya
YuliaTsvetkov
AntoniosAnastasopoulos
@@ -12893,7 +12895,7 @@
Is ChatGPT Good at Search? Investigating Large Language Models as Re-Ranking Agents
- WeiweiSun
+ WeiweiSun
LingyongYan
XinyuMa
ShuaiqiangWang
@@ -14100,7 +14102,7 @@
Not all Fake News is Written: A Dataset and Analysis of Misleading Video Headlines
- YooSung
+ Yoo YeonSung
JordanBoyd-Graber
NaeemulHassan
16241-16258
diff --git a/data/xml/2023.findings.xml b/data/xml/2023.findings.xml
index e7974149b6..d753431a58 100644
--- a/data/xml/2023.findings.xml
+++ b/data/xml/2023.findings.xml
@@ -2081,7 +2081,7 @@
Generative Knowledge Selection for Knowledge-Grounded Dialogues
- WeiweiSunShandong University
+ WeiweiSunShandong University
PengjieRenShandong University
ZhaochunRenShandong University
2077-2088
@@ -4468,7 +4468,7 @@
Speaking Multiple Languages Affects the Moral Bias of Language Models
- KatharinaHaemmerlCenter for Information and Language Processing, LMU
+ KatharinaHämmerlCenter for Information and Language Processing, LMU
BjoernDeiserothTU Darmstadt, Aleph Alpha
PatrickSchramowskiTU Darmstadt
JindřichLibovickýCharles University
@@ -4999,6 +4999,7 @@
10.18653/v1/2023.findings-acl.173
Author name correction.
+
X-RiSAWOZ: High-Quality End-to-End Multilingual Dialogue Datasets and Few-shot Agents
@@ -8513,7 +8514,7 @@
Exploring Anisotropy and Outliers in Multilingual Language Models for Cross-Lingual Semantic Sentence Similarity
- KatharinaHaemmerlCenter for Information and Language Processing, LMU
+ KatharinaHämmerlCenter for Information and Language Processing, LMU
AlinaFastowskiCenter for Information and Language Processing, LMU Munich
JindřichLibovickýCharles University
AlexanderFraserLudwig-Maximilians-Universität München
@@ -27763,7 +27764,7 @@
DiQAD: A Benchmark Dataset for Open-domain Dialogue Quality Assessment
YukunZhao
LingyongYan
- WeiweiSun
+ WeiweiSun
ChongMeng
ShuaiqiangWang
ZhicongCheng
diff --git a/data/xml/2023.ijcnlp.xml b/data/xml/2023.ijcnlp.xml
index caae68176a..a8a2da7114 100644
--- a/data/xml/2023.ijcnlp.xml
+++ b/data/xml/2023.ijcnlp.xml
@@ -1384,7 +1384,7 @@
10.18653/v1/2023.ijcnlp-tutorials.6
-
+
Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics: System Demonstrations
SriparnaSaha
@@ -1394,10 +1394,9 @@
November
2023
ijcnlp
- aacl
- 2023.ijcnlp-demo.0
+ 2023.ijcnlp-demo.0
ijcnlp-2023-international-joint-natural-language-processing
@@ -1492,6 +1491,20 @@
moon-etal-2023-wamp
10.18653/v1/2023.ijcnlp-demo.8
+
+ ERNIE-Music: Text-to-Waveform Music Generation with Diffusion Models
+ PengfeiZhu
+ ChaoPang
+ YekunChai
+ LeiLi
+ ShuohuanWang
+ YuSun
+ HaoTian
+ HuaWu
+ 86–95
+ 2023.ijcnlp-demo.9
+ zhu-etal-2023-ernie
+
diff --git a/data/xml/2023.inlg.xml b/data/xml/2023.inlg.xml
index d5bdf65766..79da3ed210 100644
--- a/data/xml/2023.inlg.xml
+++ b/data/xml/2023.inlg.xml
@@ -428,10 +428,12 @@
DimitraGkatzia
443–448
Gàidhlig (Scottish Gaelic; gd) is spoken by about 57k people in Scotland, but remains an under-resourced language with respect to natural language processing in general and natural language generation (NLG) in particular. To address this gap, we developed the first datasets for Scottish Gaelic NLG, collecting both conversational and summarisation data in a single setting. Our task setup involves dialogues between a pair of speakers discussing museum exhibits, grounding the conversation in images and texts. Then, both interlocutors summarise the dialogue resulting in a secondary dialogue summarisation dataset. This paper presents the dialogue and summarisation corpora, as well as the software used for data collection. The corpus consists of 43 conversations (13.7k words) and 61 summaries (2.0k words), and will be released along with the data collection interface.
- 2023.inlg-main.34
+ 2023.inlg-main.34
2023.inlg-main.34.Supplementary_Attachment.pdf
howcroft-etal-2023-building
10.18653/v1/2023.inlg-main.34
+
+ This version corrects descriptive corpus statistics, because some conversations, summaries, and participants were erroneously excluded.
Generating Multiple Questions from Presentation Transcripts: A Pilot Study on Earnings Conference Calls
diff --git a/data/xml/2023.iwslt.xml b/data/xml/2023.iwslt.xml
index e693e656b6..de22ca75ec 100644
--- a/data/xml/2023.iwslt.xml
+++ b/data/xml/2023.iwslt.xml
@@ -367,7 +367,7 @@
SoumiMaitiML researcher
WilliamChenCarnegie Mellon University
XinjianLiCarnegie Mellon University
- YifanPengCarnegie Mellon University
+ YifanPengCarnegie Mellon University
SiddhantAroraStudent at Carnegie Mellon University
ShinjiWatanabeCarnegie Mellon University
235-240
diff --git a/data/xml/2023.latechclfl.xml b/data/xml/2023.latechclfl.xml
index c3a75d732c..6798865ba6 100644
--- a/data/xml/2023.latechclfl.xml
+++ b/data/xml/2023.latechclfl.xml
@@ -138,7 +138,7 @@
What do Humor Classifiers Learn? An Attempt to Explain Humor Recognition Models
- MarcioInácioUniversity of Coimbra
+ MarcioLima InácioUniversity of Coimbra
GabrielaWick-pedroUniversidade Federal de São Carlos
HugoGoncalo OliveiraCISUC, DEI, University of Coimbra
88-98
diff --git a/data/xml/2023.matching.xml b/data/xml/2023.matching.xml
index b4293301f1..cb8322962a 100644
--- a/data/xml/2023.matching.xml
+++ b/data/xml/2023.matching.xml
@@ -93,7 +93,7 @@
Knowledge-Augmented Language Model Prompting for Zero-Shot Knowledge Graph Question Answering
JinheonBaek
- AlhamAji
+ Alham FikriAji
AmirSaffari
70-98
Large Language Models (LLMs) are capable of performing zero-shot closed-book question answering tasks, based on their internal knowledge stored in parameters during pre-training. However, such internalized knowledge might be insufficient and incorrect, which could lead LLMs to generate factually wrong answers. Furthermore, fine-tuning LLMs to update their knowledge is expensive. To this end, we propose to augment the knowledge directly in the input of LLMs. Specifically, we first retrieve the relevant facts to the input question from the knowledge graph based on semantic similarities between the question and its associated facts. After that, we prepend the retrieved facts to the input question in the form of the prompt, which is then forwarded to LLMs to generate the answer. Our framework, Knowledge-Augmented language model PromptING (KAPING), requires no model training, thus completely zero-shot. We validate the performance of our KAPING framework on the knowledge graph question answering task, which aims to answer the user’s question based on facts over a knowledge graph, on which ours outperforms relevant zero-shot baselines by up to 48% on average, across multiple LLMs of various sizes.
diff --git a/data/xml/2023.nllp.xml b/data/xml/2023.nllp.xml
index 5eaf9b1650..17dd782c65 100644
--- a/data/xml/2023.nllp.xml
+++ b/data/xml/2023.nllp.xml
@@ -7,7 +7,7 @@
CatalinaGoanta
IliasChalkidis
LeslieBarrett
- Gerasimos (Jerry)Spanakis
+ GerasimosSpanakis
NikolaosAletras
Association for Computational Linguistics
Singapore
diff --git a/data/xml/2023.sicon.xml b/data/xml/2023.sicon.xml
index 826c31c62e..f3e9bf5f3a 100644
--- a/data/xml/2023.sicon.xml
+++ b/data/xml/2023.sicon.xml
@@ -71,7 +71,7 @@
BCause: Reducing group bias and promoting cohesive discussion in online deliberation processes through a simple and engaging online deliberation tool
LucasAnastasiou
- AnnaDe LibboNA
+ AnnaDe Liddo
39-49
Facilitating healthy online deliberation in terms of sensemaking and collaboration of discussion participants proves extremely challenging due to a number of known negative effects of online communication on social media platforms. We start from concerns and aspirations about the use of existing online discussion systems as distilled in previous literature, and combine them with lessons learned on design and engineering practices from our research team, to inform the design of an easy-to-use tool (BCause.app) that enables higher quality discussions than traditional social media. We describe the design of this tool, highlighting the main interaction features that distinguish it from common social media, namely: i. the low-cost argumentation structuring of the conversations with direct replies; ii. and the distinctive use of reflective feedback rather than appreciative-only feedback. We then present the results of a controlled A/B experiment in which we show that the presence of argumentative and cognitive reflective discussion elements produces better social interaction with less polarization and promotes a more cohesive discussion than common social media-like interactions.
2023.sicon-1.5
diff --git a/data/xml/2023.swisstext.xml b/data/xml/2023.swisstext.xml
index 3baa071256..84e50e44d7 100644
--- a/data/xml/2023.swisstext.xml
+++ b/data/xml/2023.swisstext.xml
@@ -17,7 +17,7 @@
swisstext
- 2023.swisstext-1.0
+ 2023.swisstext-1.0
swisstext-2023-edition
diff --git a/data/xml/2023.tacl.xml b/data/xml/2023.tacl.xml
index ca41be3c16..5154e6037f 100644
--- a/data/xml/2023.tacl.xml
+++ b/data/xml/2023.tacl.xml
@@ -1091,5 +1091,251 @@
sherborne-etal-2023-optimal
+
+ Testing the Predictions of Surprisal Theory in 11 Languages
+ Ethan G.Wilcox
+ TiagoPimentel
+ ClaraMeister
+ RyanCotterell
+ Roger P.Levy
+ 10.1162/tacl_a_00612
+ Surprisal theory posits that less-predictable words should take more time to process, with word predictability quantified as surprisal, i.e., negative log probability in context. While evidence supporting the predictions of surprisal theory has been replicated widely, much of it has focused on a very narrow slice of data: native English speakers reading English texts. Indeed, no comprehensive multilingual analysis exists. We address this gap in the current literature by investigating the relationship between surprisal and reading times in eleven different languages, distributed across five language families. Deriving estimates from language models trained on monolingual and multilingual corpora, we test three predictions associated with surprisal theory: (i) whether surprisal is predictive of reading times, (ii) whether expected surprisal, i.e., contextual entropy, is predictive of reading times, and (iii) whether the linking function between surprisal and reading times is linear. We find that all three predictions are borne out crosslinguistically. By focusing on a more diverse set of languages, we argue that these results offer the most robust link to date between information theory and incremental language processing across languages.
+ 1451–1470
+ 2023.tacl-1.82
+ wilcox-etal-2023-testing
+
+
+ Shared Lexical Items as Triggers of Code Switching
+ ShulyWintner
+ SafaaShehadi
+ YuliZeira
+ DoreenOsmelak
+ YuvalNov
+ 10.1162/tacl_a_00613
+ Why do bilingual speakers code-switch (mix their two languages)? Among the several theories that attempt to explain this natural and ubiquitous phenomenon, the triggering hypothesis relates code-switching to the presence of lexical triggers, specifically cognates and proper names, adjacent to the switch point. We provide a fuller, more nuanced and refined exploration of the triggering hypothesis, based on five large datasets in three language pairs, reflecting both spoken and written bilingual interactions. Our results show that words that are assumed to reside in a mental lexicon shared by both languages indeed trigger code-switching, that the tendency to switch depends on the distance of the trigger from the switch point and on whether the trigger precedes or succeeds the switch, but not on the etymology of the trigger words. We thus provide strong, robust, evidence-based confirmation to several hypotheses on the relationships between lexical triggers and code-switching.
+ 1471–1484
+ 2023.tacl-1.83
+ wintner-etal-2023-shared
+
+
+ Learning More from Mixed Emotions: A Label Refinement Method for Emotion Recognition in Conversations
+ JintaoWen
+ GengTu
+ RuiLi
+ DazhiJiang
+ WenhuaZhu
+ 10.1162/tacl_a_00614
+ One-hot labels are commonly employed as ground truth in Emotion Recognition in Conversations (ERC). However, this approach may not fully encompass all the emotions conveyed in a single utterance, leading to suboptimal performance. Regrettably, current ERC datasets lack comprehensive emotionally distributed labels. To address this issue, we propose the Emotion Label Refinement (EmoLR) method, which utilizes context- and speaker-sensitive information to infer mixed emotional labels. EmoLR comprises an Emotion Predictor (EP) module and a Label Refinement (LR) module. The EP module recognizes emotions and provides context/speaker states for the LR module. Subsequently, the LR module calculates the similarity between these states and ground-truth labels, generating a refined label distribution (RLD). The RLD captures a more comprehensive range of emotions than the original one-hot labels. These refined labels are then used for model training in place of the one-hot labels. Experimental results on three public conversational datasets demonstrate that our EmoLR achieves state-of-the-art performance.
+ 1485–1499
+ 2023.tacl-1.84
+ wen-etal-2023-learning
+
+
+ Hallucinations in Large Multilingual Translation Models
+ Nuno M.Guerreiro
+ Duarte M.Alves
+ JonasWaldendorf
+ BarryHaddow
+ AlexandraBirch
+ PierreColombo
+ André F. T.Martins
+ 10.1162/tacl_a_00615
+ Hallucinated translations can severely undermine trust and raise safety issues when machine translation systems are deployed in the wild. Previous research on the topic focused on small bilingual models trained on high-resource languages, leaving a gap in our understanding of hallucinations in multilingual models across diverse translation scenarios. In this work, we fill this gap by conducting a comprehensive analysis—over 100 language pairs across various resource levels and going beyond English-centric directions—on both the M2M neural machine translation (NMT) models and GPT large language models (LLMs). Among several insights, we highlight that models struggle with hallucinations primarily in low-resource directions and when translating out of English, where, critically, they may reveal toxic patterns that can be traced back to the training data. We also find that LLMs produce qualitatively different hallucinations to those of NMT models. Finally, we show that hallucinations are hard to reverse by merely scaling models trained with the same data. However, employing more diverse models, trained on different data or with different procedures, as fallback systems can improve translation quality and virtually eliminate certain pathologies.
+ 1500–1517
+ 2023.tacl-1.85
+ guerreiro-etal-2023-hallucinations
+
+
+ PaniniQA: Enhancing Patient Education Through Interactive Question Answering
+ PengshanCai
+ ZonghaiYao
+ FeiLiu
+ DakuoWang
+ MeghanReilly
+ HuixueZhou
+ LingxiLi
+ YiCao
+ AlokKapoor
+ AdarshaBajracharya
+ DanBerlowitz
+ HongYu
+ 10.1162/tacl_a_00616
+ A patient portal allows discharged patients to access their personalized discharge instructions in electronic health records (EHRs). However, many patients have difficulty understanding or memorizing their discharge instructions (Zhao et al., 2017). In this paper, we present PaniniQA, a patient-centric interactive question answering system designed to help patients understand their discharge instructions. PaniniQA first identifies important clinical content from patients’ discharge instructions and then formulates patient-specific educational questions. In addition, PaniniQA is also equipped with answer verification functionality to provide timely feedback to correct patients’ misunderstandings. Our comprehensive automatic & human evaluation results demonstrate our PaniniQA is capable of improving patients’ mastery of their medical instructions through effective interactions.
+ 1518–1536
+ 2023.tacl-1.86
+ cai-etal-2023-paniniqa
+
+
+ Discover, Explain, Improve: An Automatic Slice Detection Benchmark for Natural Language Processing
+ WenyueHua
+ LifengJin
+ LinfengSong
+ HaitaoMi
+ YongfengZhang
+ DongYu
+ 10.1162/tacl_a_00617
+ Pretrained natural language processing (NLP) models have achieved high overall performance, but they still make systematic errors. Instead of manual error analysis, research on slice detection models (SDMs), which automatically identify underperforming groups of datapoints, has caught escalated attention in Computer Vision for both understanding model behaviors and providing insights for future model training and designing. However, little research on SDMs and quantitative evaluation of their effectiveness have been conducted on NLP tasks. Our paper fills the gap by proposing a benchmark named “Discover, Explain, Improve (DEIm)” for classification NLP tasks along with a new SDM Edisa. Edisa discovers coherent and underperforming groups of datapoints; DEIm then unites them under human-understandable concepts and provides comprehensive evaluation tasks and corresponding quantitative metrics. The evaluation in DEIm shows that Edisa can accurately select error-prone datapoints with informative semantic features that summarize error patterns. Detecting difficult datapoints directly boosts model performance without tuning any original model parameters, showing that discovered slices are actionable for users.
+ 1537–1552
+ 2023.tacl-1.87
+ hua-etal-2023-discover
+
+
+ Pre-train, Prompt, and Recommendation: A Comprehensive Survey of Language Modeling Paradigm Adaptations in Recommender Systems
+ PengLiu
+ LemeiZhang
+ Jon AtleGulla
+ 10.1162/tacl_a_00619
+ The emergence of Pre-trained Language Models (PLMs) has achieved tremendous success in the field of Natural Language Processing (NLP) by learning universal representations on large corpora in a self-supervised manner. The pre-trained models and the learned representations can be beneficial to a series of downstream NLP tasks. This training paradigm has recently been adapted to the recommendation domain and is considered a promising approach by both academia and industry. In this paper, we systematically investigate how to extract and transfer knowledge from pre-trained models learned by different PLM-related training paradigms to improve recommendation performance from various perspectives, such as generality, sparsity, efficiency and effectiveness. Specifically, we propose a comprehensive taxonomy to divide existing PLM-based recommender systems w.r.t. their training strategies and objectives. Then, we analyze and summarize the connection between PLM-based training paradigms and different input data types for recommender systems. Finally, we elaborate on open issues and future research directions in this vibrant field.
+ 1553–1571
+ 2023.tacl-1.88
+ liu-etal-2023-pre
+
+
+ An Efficient Self-Supervised Cross-View Training For Sentence Embedding
+ PeeratLimkonchotiwat
+ WuttikornPonwitayarat
+ LalitaLowphansirikul
+ CanUdomcharoenchaikit
+ EkapolChuangsuwanich
+ SaranaNutanong
+ 10.1162/tacl_a_00620
+ Self-supervised sentence representation learning is the task of constructing an embedding space for sentences without relying on human annotation efforts. One straightforward approach is to finetune a pretrained language model (PLM) with a representation learning method such as contrastive learning. While this approach achieves impressive performance on larger PLMs, the performance rapidly degrades as the number of parameters decreases. In this paper, we propose a framework called Self-supervised Cross-View Training (SCT) to narrow the performance gap between large and small PLMs. To evaluate the effectiveness of SCT, we compare it to 5 baseline and state-of-the-art competitors on seven Semantic Textual Similarity (STS) benchmarks using 5 PLMs with the number of parameters ranging from 4M to 340M. The experimental results show that SCT outperforms the competitors for PLMs with less than 100M parameters in 18 of 21 cases.
+ 1572–1587
+ 2023.tacl-1.89
+ limkonchotiwat-etal-2023-efficient
+
+
+ General then Personal: Decoupling and Pre-training for Personalized Headline Generation
+ Yun-ZhuSong
+ Yi-SyuanChen
+ LuWang
+ Hong-HanShuai
+ 10.1162/tacl_a_00621
+ Personalized Headline Generation aims to generate unique headlines tailored to users’ browsing history. In this task, understanding user preferences from click history and incorporating them into headline generation pose challenges. Existing approaches typically rely on predefined styles as control codes, but personal style lacks explicit definition or enumeration, making it difficult to leverage traditional techniques. To tackle these challenges, we propose General Then Personal (GTP), a novel framework comprising user modeling, headline generation, and customization. We train the framework using tailored designs that emphasize two central ideas: (a) task decoupling and (b) model pre-training. With the decoupling mechanism separating the task into generation and customization, two mechanisms, i.e., information self-boosting and mask user modeling, are further introduced to facilitate the training and text control. Additionally, we introduce a new evaluation metric to address existing limitations. Extensive experiments conducted on the PENS dataset, considering both zero-shot and few-shot scenarios, demonstrate that GTP outperforms state-of-the-art methods. Furthermore, ablation studies and analysis emphasize the significance of decoupling and pre-training. Finally, the human evaluation validates the effectiveness of our approaches.
+ 1588–1607
+ 2023.tacl-1.90
+ song-etal-2023-general
+
+
+ Removing Backdoors in Pre-trained Models by Regularized Continual Pre-training
+ BiruZhu
+ GanquCui
+ YangyiChen
+ YujiaQin
+ LifanYuan
+ ChongFu
+ YangdongDeng
+ ZhiyuanLiu
+ MaosongSun
+ MingGu
+ 10.1162/tacl_a_00622
+ Recent research has revealed that pre-trained models (PTMs) are vulnerable to backdoor attacks before the fine-tuning stage. The attackers can implant transferable task-agnostic backdoors in PTMs, and control model outputs on any downstream task, which poses severe security threats to all downstream applications. Existing backdoor-removal defenses focus on task-specific classification models and they are not suitable for defending PTMs against task-agnostic backdoor attacks. To this end, we propose the first task-agnostic backdoor removal method for PTMs. Based on the selective activation phenomenon in backdoored PTMs, we design a simple and effective backdoor eraser, which continually pre-trains the backdoored PTMs with a regularization term in an end-to-end approach. The regularization term removes backdoor functionalities from PTMs while the continual pre-training maintains the normal functionalities of PTMs. We conduct extensive experiments on pre-trained models across different modalities and architectures. The experimental results show that our method can effectively remove backdoors inside PTMs and preserve benign functionalities of PTMs with a few downstream-task-irrelevant auxiliary data, e.g., unlabeled plain texts. The average attack success rate on three downstream datasets is reduced from 99.88% to 8.10% after our defense on the backdoored BERT. The code is publicly available at https://github.com/thunlp/RECIPE.
+ 1608–1623
+ 2023.tacl-1.91
+ zhu-etal-2023-removing
+
+
+ Bridging the Gap: A Survey on Integrating (Human) Feedback for Natural Language Generation
+ PatrickFernandes
+ AmanMadaan
+ EmmyLiu
+ AntónioFarinhas
+ Pedro HenriqueMartins
+ AmandaBertsch
+ José G. C.de Souza
+ ShuyanZhou
+ TongshuangWu
+ GrahamNeubig
+ André F. T.Martins
+ 10.1162/tacl_a_00626
+ Natural language generation has witnessed significant advancements due to the training of large language models on vast internet-scale datasets. Despite these advancements, there exists a critical challenge: These models can inadvertently generate content that is toxic, inaccurate, and unhelpful, and existing automatic evaluation metrics often fall short of identifying these shortcomings. As models become more capable, human feedback is an invaluable signal for evaluating and improving models. This survey aims to provide an overview of recent research that has leveraged human feedback to improve natural language generation. First, we introduce a taxonomy distilled from existing research to categorize and organize the varied forms of feedback. Next, we discuss how feedback can be described by its format and objective, and cover the two approaches proposed to use feedback (either for training or decoding): directly using feedback or training feedback models. We also discuss existing datasets for human-feedback data collection, and concerns surrounding feedback collection. Finally, we provide an overview of the nascent field of AI feedback, which uses large language models to make judgments based on a set of principles and minimize the need for human intervention. We also release a website of this survey at feedback-gap-survey.info.
+ 1643–1668
+ 2023.tacl-1.92
+ fernandes-etal-2023-bridging
+
+
+ AfriSpeech-200: Pan-African Accented Speech Dataset for Clinical and General Domain ASR
+ TobiOlatunji
+ TejumadeAfonja
+ AdityaYadavalli
+ Chris ChinenyeEmezue
+ SahibSingh
+ Bonaventure F. P.Dossou
+ JoanneOsuchukwu
+ SalomeyOsei
+ Atnafu LambeboTonja
+ NaomeEtori
+ ClintonMbataku
+ 10.1162/tacl_a_00627
+ Africa has a very poor doctor-to-patient ratio. At very busy clinics, doctors could see 30+ patients per day—a heavy patient burden compared with developed countries—but productivity tools such as clinical automatic speech recognition (ASR) are lacking for these overworked clinicians. However, clinical ASR is mature, even ubiquitous, in developed nations, and clinician-reported performance of commercial clinical ASR systems is generally satisfactory. Furthermore, the recent performance of general domain ASR is approaching human accuracy. However, several gaps exist. Several publications have highlighted racial bias with speech-to-text algorithms, and performance on minority accents lags significantly. To our knowledge, there is no publicly available research or benchmark on accented African clinical ASR, and speech data is non-existent for the majority of African accents. We release AfriSpeech, 200 hours of Pan-African English speech (67,577 clips from 2,463 unique speakers across 120 indigenous accents from 13 countries) for clinical and general domain ASR, together with a benchmark test set and publicly available pre-trained models with SOTA performance on the AfriSpeech benchmark.
+ 1669–1685
+ 2023.tacl-1.93
+ olatunji-etal-2023-afrispeech
+
+
+ MissModal: Increasing Robustness to Missing Modality in Multimodal Sentiment Analysis
+ RonghaoLin
+ HaifengHu
+ 10.1162/tacl_a_00628
+ When applying multimodal machine learning in downstream inference, both joint and coordinated multimodal representations rely on the complete presence of modalities as in training. However, modal-incomplete data, where certain modalities are missing, greatly reduces performance in Multimodal Sentiment Analysis (MSA) due to varying input forms and semantic information deficiencies. This limits the applicability of the predominant MSA methods in the real world, where the completeness of multimodal data is uncertain and variable. The generation-based methods attempt to generate the missing modality, yet they require complex hierarchical architecture with huge computational costs and struggle with the representation gaps across different modalities. Diversely, we propose a novel representation learning approach named MissModal, devoted to increasing robustness to missing modality in a classification approach. Specifically, we adopt constraints with geometric contrastive loss, distribution distance loss, and sentiment semantic loss to align the representations of modal-missing and modal-complete data, without impacting the sentiment inference for the complete modalities. Furthermore, we do not demand any changes in the multimodal fusion stage, highlighting the generality of our method in other multimodal learning systems. Extensive experiments demonstrate that the proposed method achieves superior performance with minimal computational costs in various missing-modality scenarios (flexibility), including severely missing modality (efficiency) on two public MSA datasets.
+ 1686–1702
+ 2023.tacl-1.94
+ lin-hu-2023-missmodal
+
+
+ Speak, Read and Prompt: High-Fidelity Text-to-Speech with Minimal Supervision
+ EugeneKharitonov
+ DamienVincent
+ ZalánBorsos
+ RaphaëlMarinier
+ SertanGirgin
+ OlivierPietquin
+ MattSharifi
+ MarcoTagliasacchi
+ NeilZeghidour
+ 10.1162/tacl_a_00618
+ We introduce SPEAR-TTS, a multi-speaker text-to-speech (TTS) system that can be trained with minimal supervision. By combining two types of discrete speech representations, we cast TTS as a composition of two sequence-to-sequence tasks: from text to high-level semantic tokens (akin to “reading”) and from semantic tokens to low-level acoustic tokens (“speaking”). Decoupling these two tasks enables training of the “speaking” module using abundant audio-only data, and unlocks the highly efficient combination of pretraining and backtranslation to reduce the need for parallel data when training the “reading” component. To control the speaker identity, we adopt example prompting, which allows SPEAR-TTS to generalize to unseen speakers using only a short sample of 3 seconds, without any explicit speaker representation or speaker labels. Our experiments demonstrate that SPEAR-TTS achieves a character error rate that is competitive with state-of-the-art methods using only 15 minutes of parallel data, while matching ground-truth speech in naturalness and acoustic quality.
+ 1703–1718
+ 2023.tacl-1.95
+ kharitonov-etal-2023-speak
+
+
+ ReCOGS: How Incidental Details of a Logical Form Overshadow an Evaluation of Semantic Interpretation
+ ZhengxuanWu
+ Christopher D.Manning
+ ChristopherPotts
+ 10.1162/tacl_a_00623
+ Compositional generalization benchmarks for semantic parsing seek to assess whether models can accurately compute meanings for novel sentences, but operationalize this in terms of logical form (LF) prediction. This raises the concern that semantically irrelevant details of the chosen LFs could shape model performance. We argue that this concern is realized for the COGS benchmark (Kim and Linzen, 2020). COGS poses generalization splits that appear impossible for present-day models, which could be taken as an indictment of those models. However, we show that the negative results trace to incidental features of COGS LFs. Converting these LFs to semantically equivalent ones and factoring out capabilities unrelated to semantic interpretation, we find that even baseline models get traction. A recent variable-free translation of COGS LFs suggests similar conclusions, but we observe this format is not semantically equivalent; it is incapable of accurately representing some COGS meanings. These findings inform our proposal for ReCOGS, a modified version of COGS that comes closer to assessing the target semantic capabilities while remaining very challenging. Overall, our results reaffirm the importance of compositional generalization and careful benchmark task design.
+ 1719–1733
+ 2023.tacl-1.96
+ wu-etal-2023-recogs
+
+
+ Data-driven Parsing Evaluation for Child-Parent Interactions
+ ZoeyLiu
+ EmilyPrud’hommeaux
+ 10.1162/tacl_a_00624
+ We present a syntactic dependency treebank for naturalistic child and child-directed spoken English. Our annotations largely follow the guidelines of the Universal Dependencies project (UD [Zeman et al., 2022]), with detailed extensions to lexical and syntactic structures unique to spontaneous spoken language, as opposed to written texts or prepared speech. Compared to existing UD-style spoken treebanks and other dependency corpora of child-parent interactions specifically, our dataset is much larger (44,744 utterances; 233,907 words) and contains data from 10 children covering a wide age range (18–66 months). We conduct thorough dependency parser evaluations using both graph-based and transition-based parsers, trained on three different types of out-of-domain written texts: news, tweets, and learner data. Out-of-domain parsers demonstrate reasonable performance for both child and parent data. In addition, parser performance for child data increases along children’s developmental paths, especially between 18 and 48 months, and gradually approaches the performance for parent data. These results are further validated with in-domain training.
+ 1734–1753
+ 2023.tacl-1.97
+ liu-prudhommeaux-2023-data
+
+
+ QAmeleon: Multilingual QA with Only 5 Examples
+ PriyankaAgrawal
+ ChrisAlberti
+ FantineHuot
+ JoshuaMaynez
+ JiMa
+ SebastianRuder
+ KuzmanGanchev
+ DipanjanDas
+ MirellaLapata
+ 10.1162/tacl_a_00625
+ The availability of large, high-quality datasets has been a major driver of recent progress in question answering (QA). Such annotated datasets, however, are difficult and costly to collect, and rarely exist in languages other than English, rendering QA technology inaccessible to underrepresented languages. An alternative to building large monolingual training datasets is to leverage pre-trained language models (PLMs) under a few-shot learning setting. Our approach, QAmeleon, uses a PLM to automatically generate multilingual data upon which QA models are fine-tuned, thus avoiding costly annotation. Prompt tuning the PLM with only five examples per language delivers accuracy superior to translation-based baselines; it bridges nearly 60% of the gap between an English-only baseline and a fully-supervised upper bound fine-tuned on almost 50,000 hand-labeled examples; and consistently leads to improvements compared to directly fine-tuning a QA model on labeled examples in low-resource settings. Experiments on the TyDiqa-GoldP and MLQA benchmarks show that few-shot prompt tuning for data synthesis scales across languages and is a viable alternative to large-scale annotation.
+ 1754–1771
+ 2023.tacl-1.98
+ agrawal-etal-2023-qameleon
+
diff --git a/data/xml/2023.wmt.xml b/data/xml/2023.wmt.xml
index 8f952c0205..31b7929943 100644
--- a/data/xml/2023.wmt.xml
+++ b/data/xml/2023.wmt.xml
@@ -1193,7 +1193,7 @@
NICT-AI4B’s Submission to the Indic MT Shared Task in WMT 2023
RajDabre
JayGala
- PranjalChitale
+ Pranjal A.Chitale
941–949
In this paper, we (Team NICT-AI4B) describe our MT systems that we submit to the Indic MT task in WMT 2023. Our primary system consists of 3 stages: Joint denoising and MT training using officially approved monolingual and parallel corpora, backtranslation and, MT training on original and backtranslated parallel corpora. We observe that backtranslation leads to substantial improvements in translation quality up to 4 BLEU points. We also develop 2 contrastive systems on unconstrained settings, where the first system involves fine-tuning of IndicTrans2 DA models on official parallel corpora and seed data used in AI4Bharat et al. (2023), and the second system involves a system combination of the primary and the aforementioned system. Overall, we manage to obtain high-quality translation systems for the 4 low-resource North-East Indian languages of focus.
2023.wmt-1.88
diff --git a/data/xml/2024.bucc.xml b/data/xml/2024.bucc.xml
new file mode 100644
index 0000000000..728839447f
--- /dev/null
+++ b/data/xml/2024.bucc.xml
@@ -0,0 +1,166 @@
+
+
+
+
+ Proceedings of the 17th Workshop on Building and Using Comparable Corpora (BUCC) @ LREC-COLING 2024
+ PierreZweigenbaum
+ ReinhardRapp
+ SergeSharoff
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.bucc-1
+ bucc
+
+
+ 2024.bucc-1.0
+ bucc-2024-building
+
+
+ On a Novel Application of Wasserstein-Procrustes for Unsupervised Cross-Lingual Alignment of Embeddings
+ GuillemRamírez
+ RumenDangovski
+ PreslavNakov
+ MarinSoljacic
+ 1–11
+ 2024.bucc-1.1
+ ramirez-etal-2024-novel
+
+
+ Modeling Diachronic Change in English Scientific Writing over 300+ Years with Transformer-based Language Model Surprisal
+ JuliusSteuer
+ Marie-PaulineKrielke
+ StefanFischer
+ StefaniaDegaetano-Ortlieb
+ MariusMosbach
+ DietrichKlakow
+ 12–23
+ 2024.bucc-1.2
+ steuer-etal-2024-modeling
+
+
+ PORTULAN ExtraGLUE Datasets and Models: Kick-starting a Benchmark for the Neural Processing of Portuguese
+ Tomás FreitasOsório
+ BernardoLeite
+ HenriqueLopes Cardoso
+ LuísGomes
+ JoãoRodrigues
+ RodrigoSantos
+ AntónioBranco
+ 24–34
+ 2024.bucc-1.3
+ osorio-etal-2024-portulan
+
+
+ Invited Talk: The Way Towards Massively Multilingual Language Models
+ FrançoisYvon
+ 35
+ 2024.bucc-1.4
+ yvon-2024-invited
+
+
+ Exploring the Necessity of Visual Modality in Multimodal Machine Translation using Authentic Datasets
+ ZiLong
+ ZhenHaoTang
+ XianghuaFu
+ JianChen
+ ShilongHou
+ JinzeLyu
+ 36–50
+ 2024.bucc-1.5
+ long-etal-2024-exploring
+
+
+ Exploring the Potential of Large Language Models in Adaptive Machine Translation for Generic Text and Subtitles
+ AbdelhadiSoudi
+ MohamedHannani
+ KristofVan Laerhoven
+ EleftheriosAvramidis
+ 51–58
+ 2024.bucc-1.6
+ soudi-etal-2024-exploring
+
+
+ INCLURE: a Dataset and Toolkit for Inclusive French Translation
+ PaulLerner
+ CyrilGrouin
+ 59–68
+ 2024.bucc-1.7
+ lerner-grouin-2024-inclure
+
+
+ BnPC: A Gold Standard Corpus for Paraphrase Detection in Bangla, and its Evaluation
+ SouravSaha
+ Zeshan AhmedNobin
+ Mufassir AhmadChowdhury
+ Md. Shakirul Hasan KhanMobin
+ Mohammad RuhulAmin
+ SudiptaKar
+ 69–84
+ 2024.bucc-1.8
+ saha-etal-2024-bnpc
+
+
+ Creating Clustered Comparable Corpora from Wikipedia with Different Fuzziness Levels and Language Representativity
+ AnnaLaskina
+ EricGaussier
+ GaelleCalvary
+ 85–93
+ 2024.bucc-1.9
+ laskina-etal-2024-creating
+
+
+ EuReCo: Not Building and Yet Using Federated Comparable Corpora for Cross-Linguistic Research
+ MarcKupietz
+ PiotrBanski
+ NilsDiewald
+ BeataTrawinski
+ AndreasWitt
+ 94–103
+ 2024.bucc-1.10
+ kupietz-etal-2024-eureco
+
+
+ Building Annotated Parallel Corpora Using the ATIS Dataset: Two UD-style treebanks in English and Turkish
+ NeslihanCesur
+ AslıKuzgun
+ MehmetKose
+ Olcay TanerYıldız
+ 104–110
+ 2024.bucc-1.11
+ cesur-etal-2024-building
+
+
+ Bootstrapping the Annotation of UD Learner Treebanks
+ AriannaMasciolini
+ 111–117
+ 2024.bucc-1.12
+ masciolini-2024-bootstrapping
+
+
+ SweDiagnostics: A Diagnostics Natural Language Inference Dataset for Swedish
+ FelixMorger
+ 118–124
+ 2024.bucc-1.13
+ morger-2024-swediagnostics
+
+
+ Multiple Discourse Relations in English TED Talks and Their Translation into Lithuanian, Portuguese and Turkish
+ DenizZeyrek
+ GiedrėValūnaitė Oleškevičienė
+ AmaliaMendes
+ 125–134
+ 2024.bucc-1.14
+ zeyrek-etal-2024-multiple
+
+
+ mini-CIEP+ : A Shareable Parallel Corpus of Prose
+ AnnemarieVerkerk
+ LuigiTalamo
+ 135–143
+ 2024.bucc-1.15
+ verkerk-talamo-2024-mini
+
+
+
diff --git a/data/xml/2024.caldpseudo.xml b/data/xml/2024.caldpseudo.xml
index 4c66234bd2..111ce2f1e0 100644
--- a/data/xml/2024.caldpseudo.xml
+++ b/data/xml/2024.caldpseudo.xml
@@ -30,6 +30,7 @@
Missed recognition of named entities while de-identifying clinical narratives poses a critical challenge in protecting patient-sensitive health information. Mitigating name recognition errors is essential to minimize risk of patient re-identification. In this paper, we emphasize the need for stratified sampling and enhanced contextual considerations concerning Name Tokens using a fine-tuned Longformer BERT model for clinical text de-identification. We introduce a Hidden in Plain Sight (HIPS) Markov-based replacement technique for names to mask name recognition misses, revealing a significant reduction in name leakage rates. Our experimental results underscore the impact on addressing name recognition challenges in BERT-based de-identification systems for heightened privacy protection in electronic health records.
2024.caldpseudo-1.1
simancek-vydiswaran-2024-handling
+
Assessing Authenticity and Anonymity of Synthetic User-generated Content in the Medical Domain
@@ -42,6 +43,7 @@
Since medical text cannot be shared easily due to privacy concerns, synthetic data bears much potential for natural language processing applications. In the context of social media and user-generated messages about drug intake and adverse drug effects, this work presents different methods to examine the authenticity of synthetic text. We conclude that the generated tweets are untraceable and show enough authenticity from the medical point of view to be used as a replacement for a real Twitter corpus. However, original data might still be the preferred choice as they contain much more diversity.
2024.caldpseudo-1.2
nishiyama-etal-2024-assessing
+
Automatic Detection and Labelling of Personal Data in Case Reports from the ECHR in Spanish: Evaluation of Two Different Annotation Approaches
@@ -52,6 +54,7 @@
In this paper we evaluate two annotation approaches for automatic detection and labelling of personal information in legal texts in relation to the ambiguity of the labels and the homogeneity of the annotations. For this purpose, we built a corpus of 44 case reports from the European Court of Human Rights in Spanish language and we annotated it following two different annotation approaches: automatic projection of the annotations of an existing English corpus, and manual annotation with our reinterpretation of their guidelines. Moreover, we employ Flair on a Named Entity Recognition task to compare its performance in the two annotation schemes.
2024.caldpseudo-1.3
sierro-etal-2024-automatic
+
PSILENCE: A Pseudonymization Tool for International Law
@@ -61,6 +64,7 @@
Since the announcement of the GDPR, the pseudonymization of legal documents has become a high-priority task in many legal organizations. This means that for making public a document, it is necessary to redact the identity of certain entities, such as witnesses. In this work, we present the first results obtained by PSILENCE, a pseudonymization tool created for redacting semi-automatically international arbitration documents in English. PSILENCE has been built using a Named Entity Recognition (NER) system, along with a Coreference Resolution system. These systems allow us to find the people that we need to redact in a clustered way, but also to propose the same pseudonym throughout one document. This last aspect makes it easier to read and comprehend a redacted legal document. Different experiments were done on four different datasets, one of which was legal, and the results are promising, reaching a Macro F-score of up to 0.72 on the legal dataset.
2024.caldpseudo-1.4
cabrera-diego-gheewala-2024-psilence
+
Deidentifying a Norwegian Clinical Corpus - an Effort to Create a Privacy-preserving Norwegian Large Clinical Language Model
@@ -74,6 +78,7 @@
The study discusses the methods and challenges of deidentifying and pseudonymizing Norwegian clinical text for research purposes. The results of the NorDeid tool for deidentification and pseudonymization on different types of protected health information were evaluated and discussed, as well as the extension of its functionality with regular expressions to identify specific types of sensitive information. The research used a clinical corpus of adult patients treated in a gastro-surgical department in Norway, which contains approximately nine million clinical notes. The study also highlights the challenges posed by the unique language and clinical terminology of Norway and emphasizes the importance of protecting privacy and the need for customized approaches to meet legal and research requirements.
2024.caldpseudo-1.5
ngo-etal-2024-deidentifying
+
Extending Off-the-shelf NER Systems to Personal Information Detection in Dialogues with a Virtual Agent: Findings from a Real-Life Use Case
@@ -85,6 +90,7 @@
We present the findings and results of our pseudonymisation system, which has been developed for a real-life use-case involving users and an informative chatbot in the context of the COVID-19 pandemic. Message exchanges between the two involve the former group providing information about themselves and their residential area, which could easily allow for their re-identification. We create a modular pipeline to detect PIIs and perform basic deidentification such that the data can be stored while mitigating any privacy concerns. The use-case presents several challenging aspects, the most difficult of which is the logistic challenge of not being able to directly view or access the data due to the very privacy issues we aim to resolve. Nevertheless, our system achieves a high recall of 0.99, correctly identifying almost all instances of personal data. However, this comes at the expense of precision, which only reaches 0.64. We describe the sensitive information identification in detail, explaining the design principles behind our decisions. We additionally highlight the particular challenges we’ve encountered.
2024.caldpseudo-1.6
mina-etal-2024-extending
+
Detecting Personal Identifiable Information in Swedish Learner Essays
@@ -97,6 +103,7 @@
Linguistic data can — and often does — contain PII (Personal Identifiable Information). Both from a legal and ethical standpoint, the sharing of such data is not permissible. According to the GDPR, pseudonymization, i.e. the replacement of sensitive information with surrogates, is an acceptable strategy for privacy preservation. While research has been conducted on the detection and replacement of sensitive data in Swedish medical data using Large Language Models (LLMs), it is unclear whether these models handle PII in less structured and more thematically varied texts equally well. In this paper, we present and discuss the performance of an LLM-based PII-detection system for Swedish learner essays.
2024.caldpseudo-1.7
szawerna-etal-2024-detecting
+
Data Anonymization for Privacy-Preserving Large Language Model Fine-Tuning on Call Transcripts
@@ -112,6 +119,7 @@
Large language models in public-facing industrial applications must accurately process data for the domain in which they are deployed, but they must not leak sensitive or confidential information when used. We present a process for anonymizing training data, a framework for quantitatively and qualitatively assessing the effectiveness of this process, and an assessment of the effectiveness of models fine-tuned on anonymized data in comparison with commercially available LLM APIs.
2024.caldpseudo-1.8
gardiner-etal-2024-data
+
When Is a Name Sensitive? Eponyms in Clinical Text and Implications for De-Identification
@@ -123,6 +131,7 @@
Clinical data, in the form of electronic health records, are rich resources that can be tapped using natural language processing. At the same time, they contain very sensitive information that must be protected. One strategy is to remove or obscure data using automatic de-identification. However, the detection of sensitive data can yield false positives. This is especially true for tokens that are similar in form to sensitive entities, such as eponyms. These names tend to refer to medical procedures or diagnoses rather than specific persons. Previous research has shown that automatic de-identification systems often misclassify eponyms as names, leading to a loss of valuable medical information. In this study, we estimate the prevalence of eponyms in a real Swedish clinical corpus. Furthermore, we demonstrate that modern transformer-based de-identification systems are more accurate in distinguishing between names and eponyms than previous approaches.
2024.caldpseudo-1.9
vakili-etal-2024-name
+
Did the Names I Used within My Essay Affect My Score? Diagnosing Name Biases in Automated Essay Scoring
@@ -135,6 +144,7 @@
Automated essay scoring (AES) of second-language learner essays is a high-stakes task as it can affect the job and educational opportunities a student may have access to. Thus, it becomes imperative to make sure that the essays are graded based on the students’ language proficiency as opposed to other reasons, such as personal names used in the text of the essay. Moreover, most of the research data for AES tends to contain personal identifiable information. Because of that, pseudonymization becomes an important tool to make sure that this data can be freely shared. Thus, our systems should not grade students based on which given names were used in the text of the essay, both for fairness and for privacy reasons. In this paper we explore how given names affect the CEFR level classification of essays of second language learners of Swedish. We use essays containing just one personal name and substitute it for names from lists of given names from four different ethnic origins, namely Swedish, Finnish, Anglo-American, and Arabic. We find that changing the names within the essays has no apparent effect on the classification task, regardless of whether a feature-based or a transformer-based model is used.
2024.caldpseudo-1.10
munoz-sanchez-etal-2024-names
+
diff --git a/data/xml/2024.case.xml b/data/xml/2024.case.xml
index 7a928fde53..7d260a74fc 100644
--- a/data/xml/2024.case.xml
+++ b/data/xml/2024.case.xml
@@ -29,6 +29,7 @@
2024.case-1.1
2024.case-1.1.SupplementaryMaterial.txt
fellman-etal-2024-future
+
Fine-Tuning Language Models on Dutch Protest Event Tweets
@@ -54,6 +55,7 @@
2024.case-1.3
2024.case-1.3.SupplementaryMaterial.txt
bakker-etal-2024-timeline
+
Leveraging Approximate Pattern Matching with BERT for Event Detection
@@ -63,6 +65,7 @@
2024.case-1.4
2024.case-1.4.SupplementaryMaterial.txt
tanev-2024-leveraging
+
Socio-political Events of Conflict and Unrest: A Survey of Available Datasets
@@ -75,6 +78,7 @@
2024.case-1.5
2024.case-1.5.SupplementaryMaterial.txt
olsen-etal-2024-socio
+
Evaluating ChatGPT’s Ability to Detect Hate Speech in Turkish Tweets
@@ -85,6 +89,7 @@
2024.case-1.6
2024.case-1.6.SupplementaryMaterial.txt
dehghan-yanikoglu-2024-evaluating
+
YYama@Multimodal Hate Speech Event Detection 2024: Simpler Prompts, Better Results - Enhancing Zero-shot Detection with a Large Multimodal Model
@@ -94,6 +99,7 @@
2024.case-1.7
2024.case-1.7.SupplementaryMaterial.txt
yamagishi-2024-yyama
+
RACAI at ClimateActivism 2024: Improving Detection of Hate Speech by Extending LLM Predictions with Handcrafted Features
@@ -103,6 +109,7 @@
2024.case-1.8
2024.case-1.8.SupplementaryMaterial.txt
pais-2024-racai
+
CLTL@Multimodal Hate Speech Event Detection 2024: The Winning Approach to Detecting Multimodal Hate Speech and Its Targets
@@ -113,6 +120,7 @@
2024.case-1.9
2024.case-1.9.SupplementaryMaterial.txt
wang-markov-2024-cltl
+
HAMiSoN-Generative at ClimateActivism 2024: Stance Detection using generative large language models
@@ -123,6 +131,7 @@
2024.case-1.10
2024.case-1.10.SupplementaryMaterial.txt
fraile-hernandez-penas-2024-hamison
+
JRC at ClimateActivism 2024: Lexicon-based Detection of Hate Speech
@@ -142,6 +151,7 @@
2024.case-1.12
2024.case-1.12.SupplementaryMaterial.txt
rodriguez-garcia-centeno-2024-hamison
+
NLPDame at ClimateActivism 2024: Mistral Sequence Classification with PEFT for Hate Speech, Targets and Stance Event Detection
@@ -151,6 +161,7 @@
2024.case-1.13
2024.case-1.13.SupplementaryMaterial.txt
christodoulou-2024-nlpdame
+
AAST-NLP at ClimateActivism 2024: Ensemble-Based Climate Activism Stance and Hate Speech Detection : Leveraging Pretrained Language Models
@@ -161,6 +172,7 @@
2024.case-1.14
2024.case-1.14.SupplementaryMaterial.txt
el-sayed-nasr-2024-aast
+
ARC-NLP at ClimateActivism 2024: Stance and Hate Speech Detection by Generative and Encoder Models Optimized with Tweet-Specific Elements
@@ -172,6 +184,7 @@
2024.case-1.15
2024.case-1.15.SupplementaryMaterial.txt
kaya-etal-2024-arc
+
HAMiSoN-Ensemble at ClimateActivism 2024: Ensemble of RoBERTa, Llama 2, and Multi-task for Stance Detection
@@ -184,6 +197,7 @@
2024.case-1.16
2024.case-1.16.SupplementaryMaterial.txt
rodriguez-garcia-etal-2024-hamison
+
MasonPerplexity at Multimodal Hate Speech Event Detection 2024: Hate Speech and Target Detection Using Transformer Ensembles
@@ -198,6 +212,7 @@
2024.case-1.17
2024.case-1.17.SupplementaryMaterial.txt
ganguly-etal-2024-masonperplexity
+
MasonPerplexity at ClimateActivism 2024: Integrating Advanced Ensemble Techniques and Data Augmentation for Climate Activism Stance and Hate Event Identification
@@ -211,6 +226,7 @@
2024.case-1.18
2024.case-1.18.SupplementaryMaterial.txt
bin-emran-etal-2024-masonperplexity
+
AAST-NLP at Multimodal Hate Speech Event Detection 2024 : A Multimodal Approach for Classification of Text-Embedded Images Based on CLIP and BERT-Based Models.
@@ -221,6 +237,7 @@
2024.case-1.19
2024.case-1.19.SupplementaryMaterial.txt
el-sayed-nasr-2024-aast-nlp
+
CUET_Binary_Hackers at ClimateActivism 2024: A Comprehensive Evaluation and Superior Performance of Transformer-Based Models in Hate Speech Event Detection and Stance Classification for Climate Activism
@@ -232,6 +249,7 @@
2024.case-1.20
2024.case-1.20.SupplementaryMaterial.txt
farsi-etal-2024-cuet
+
HAMiSoN-baselines at ClimateActivism 2024: A Study on the Use of External Data for Hate Speech and Stance Detection
@@ -252,6 +270,7 @@
2024.case-1.22
2024.case-1.22.SupplementaryMaterial.txt
narayan-biswal-2024-z
+
Bryndza at ClimateActivism 2024: Stance, Target and Hate Event Detection via Retrieval-Augmented GPT-4 and LLaMA
@@ -266,6 +285,7 @@
2024.case-1.23
2024.case-1.23.SupplementaryMaterial.txt
suppa-etal-2024-bryndza
+
IUST at ClimateActivism 2024: Towards Optimal Stance Detection: A Systematic Study of Architectural Choices and Data Cleaning Techniques
@@ -276,6 +296,7 @@
2024.case-1.24
2024.case-1.24.SupplementaryMaterial.txt
mahmoudi-eetemadi-2024-iust
+
VRLLab at HSD-2Lang 2024: Turkish Hate Speech Detection Online with TurkishBERTweet
@@ -286,6 +307,7 @@
2024.case-1.25
2024.case-1.25.SupplementaryMaterial.txt
najafi-varol-2024-vrllab
+
Transformers at HSD-2Lang 2024: Hate Speech Detection in Arabic and Turkish Tweets Using BERT Based Architectures
@@ -296,6 +318,7 @@
2024.case-1.26
2024.case-1.26.SupplementaryMaterial.txt
singhal-bedi-2024-transformers
+
ReBERT at HSD-2Lang 2024: Fine-Tuning BERT with AdamW for Hate Speech Detection in Arabic and Turkish
@@ -307,6 +330,7 @@
2024.case-1.27
2024.case-1.27.SupplementaryMaterial.txt
yagci-etal-2024-rebert
+
DetectiveReDASers at HSD-2Lang 2024: A New Pooling Strategy with Cross-lingual Augmentation and Ensembling for Hate Speech Detection in Low-resource Languages
@@ -318,6 +342,7 @@
2024.case-1.28
2024.case-1.28.SupplementaryMaterial.txt
qachfar-etal-2024-detectiveredasers
+
Detecting Hate Speech in Turkish Print Media: A Corpus and A Hybrid Approach with Target-oriented Linguistic Knowledge
@@ -333,6 +358,7 @@
2024.case-1.29
2024.case-1.29.SupplementaryMaterial.txt
uludogan-etal-2024-detecting
+
Team Curie at HSD-2Lang 2024: Hate Speech Detection in Turkish and Arabic Tweets using BERT-based models
@@ -361,6 +387,7 @@
2024.case-1.31
2024.case-1.31.SupplementaryMaterial.txt
thapa-etal-2024-extended
+
Overview of the Hate Speech Detection in Turkish and Arabic Tweets (HSD-2Lang) Shared Task at CASE 2024
@@ -375,6 +402,7 @@
2024.case-1.32
2024.case-1.32.SupplementaryMaterial.txt
uludogan-etal-2024-overview
+
Stance and Hate Event Detection in Tweets Related to Climate Activism - Shared Task at CASE 2024
@@ -392,6 +420,7 @@
2024.case-1.33
2024.case-1.33.SupplementaryMaterial.txt
thapa-etal-2024-stance
+
A Concise Report of the 7th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Text
diff --git a/data/xml/2024.cawl.xml b/data/xml/2024.cawl.xml
new file mode 100644
index 0000000000..17a00f737b
--- /dev/null
+++ b/data/xml/2024.cawl.xml
@@ -0,0 +1,99 @@
+
+
+
+
+ Proceedings of the Second Workshop on Computation and Written Language (CAWL) @ LREC-COLING 2024
+ KyleGorman
+ EmilyPrud'hommeaux
+ BrianRoark
+ RichardSproat
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.cawl-1
+ cawl
+ ws
+
+
+ 2024.cawl-1.0
+ cawl-2024-computation
+
+
+ ParsText: A Digraphic Corpus for Tajik-Farsi Transliteration
+ RayyanMerchant
+ KevinTang
+ 1–7
+ Despite speaking dialects of the same language, Persian speakers from Tajikistan cannot read Persian texts from Iran and Afghanistan. This is due to the fact that Tajik Persian is written in the Tajik-Cyrillic script, while Iranian and Afghan Persian are written in the Perso-Arabic script. As the formal registers of these dialects all maintain high levels of mutual intelligibility with each other, machine transliteration has been proposed as a more practical and appropriate solution than machine translation. Unfortunately, Persian texts written in both scripts are much more common in print in Tajikistan than online. This paper introduces a novel corpus meant to remedy that gap: ParsText. ParsText contains 2,813 Persian sentences written in both Tajik-Cyrillic and Perso-Arabic manually collected from blog pages and news articles online. This paper presents the need for such a corpus, previous and related work, data collection and alignment procedures, corpus statistics, and discusses directions for future work.
+ 2024.cawl-1.1
+ merchant-tang-2024-parstext
+
+
+ A Joint Approach for Automatic Analysis of Reading and Writing Errors
+ WiekeHarmsen
+ CatiaCucchiarini
+ Roelandvan Hout
+ HelmerStrik
+ 8–17
+ Analyzing the errors that children make on their way to becoming fluent readers and writers can provide invaluable scientific insights into the processes that underlie literacy acquisition. To this end, we present in this paper an extension of a previously developed spelling error detection and classification algorithm for Dutch, so that reading errors can also be automatically detected from their phonetic transcription. The strength of this algorithm lies in its ability to detect errors at Phoneme-Corresponding Unit (PCU) level, where a PCU is a sequence of letters corresponding to one phoneme. We validated this algorithm and found good agreement between manual and automatic reading error classifications. We also used the algorithm to analyze written words by second graders and phonetic transcriptions of read words by first graders. With respect to the writing data, we found that the PCUs ‘ei’, ‘eu’, ‘g’, ‘ij’ and ‘ch’ were most frequently written incorrectly; for the reading data, these were the PCUs ‘v’, ‘ui’, ‘ng’, ‘a’ and ‘g’. This study presents a first attempt at developing a joint method for detecting reading and writing errors. In future research this algorithm can be used to analyze corpora containing reading and writing data from the same children.
+ 2024.cawl-1.2
+ 2024.cawl-1.2.OptionalSupplementaryMaterial.zip
+ harmsen-etal-2024-joint
+
+
+ Tool for Constructing a Large-Scale Corpus of Code Comments and Other Source Code Annotations
+ LunaPeck
+ SusanBrown
+ 18–22
+ The sublanguage of source code annotations—explanatory natural language writing that accompanies programming source code—is little-studied in linguistics. To facilitate research into this domain, we have developed a program prototype that can extract code comments and changelogs (i.e. commit messages) from public, open-source code repositories, with automatic tokenization and part-of-speech tagging on the extracted text. The program can also automatically detect and discard “commented-out” source code in data from Python repositories, to prevent it from polluting the corpus, demonstrating that such sanitization is likely feasible for other programming languages as well. With the current tool, we have produced a 6-million word corpus of English-language comments extracted from three different programming languages: Python, C, and C++.
+ 2024.cawl-1.3
+ peck-brown-2024-tool
+
+
+ Tokenization via Language Modeling: the Role of Preceding Text
+ RastislavHronsky
+ EmmanuelKeuleers
+ 23–35
+ While language models benefit immensely from their capacity to model large context (i.e., sequence of preceding tokens), the role of context is unclear in text tokenization, which is, in many cases, language model-driven to begin with. In this paper, we explore the role of context in three different writing systems, using three different text tokenization strategies (word-based, Morfessor, and BPE). In the first experiment, we examined how the size of context used for predicting the next token affects the ranking of the segmentation strategies in terms of language model surprisal. This effect was highly writing-system-specific: minimal in the case of English, and rank-reversing due to increased context size and token granularity in the case of Turkish and Chinese. In the second experiment, we examined how context alters segmentation hypotheses when using language models to identify word boundaries. In this case, the effect was subtle: using context-aware, rather than context-free, segment scores improved boundary recognition accuracy by up to 0.5%, once baseline effects were exploited.
+ 2024.cawl-1.4
+ hronsky-keuleers-2024-tokenization
+
+
+ Abbreviation Across the World’s Languages and Scripts
+ KyleGorman
+ BrianRoark
+ 36–42
+ Detailed taxonomies for non-standard words, including abbreviations, have been developed for speech and language processing, though mostly with reference to English. In this paper, we examine abbreviation formation strategies in a diverse sample of more than 50 languages, dialects and scripts. The resulting taxonomy—and data about which strategies are attested in which languages—provides key information needed to create multilingual systems for abbreviation expansion, an essential component for speech processing and text understanding.
+ 2024.cawl-1.5
+ gorman-roark-2024-abbreviation
+
+
+ Now You See Me, Now You Don’t: ‘Poverty of the Stimulus’ Problems and Arbitrary Correspondences in End-to-End Speech Models
+ Daanvan Esch
+ 43–52
+ End-to-end models for speech recognition and speech synthesis have many benefits, but we argue they also face a unique set of challenges not encountered in conventional multi-stage hybrid systems, which relied on the explicit injection of linguistic knowledge through resources such as phonemic dictionaries and verbalization grammars. These challenges include handling words with unusual grapheme-to-phoneme correspondences, converting between written forms like ‘12’ and spoken forms such as ‘twelve’, and contextual disambiguation of homophones or homographs. We describe the mitigation strategies that have been used for these problems in end-to-end systems, either implicitly or explicitly, and call out that the most commonly used mitigation techniques are likely incompatible with newly emerging approaches that use minimal amounts of supervised audio training data. We review best-of-both-world approaches that allow the use of end-to-end models combined with traditional linguistic resources, which we show are increasingly straightforward to create at scale, and close with an optimistic outlook for bringing speech technologies to many more languages by combining these strands of research.
+ 2024.cawl-1.6
+ van-esch-2024-now
+
+
+ Towards Fast Cognate Alignment on Imbalanced Data
+ LoganBorn
+ M. WillisMonroe
+ KathrynKelley
+ AnoopSarkar
+ 53–58
+ Cognate alignment models purport to enable decipherment, but their speed and need for clean data can make them unsuitable for realistic decipherment problems. We seek to draw attention to these shortcomings in the hopes that future work may avoid them, and we outline two techniques which begin to overcome the described problems.
+ 2024.cawl-1.7
+ born-etal-2024-towards
+
+
+ Simplified Chinese Character Distance Based on Ideographic Description Sequences
+ YixiaWang
+ EmmanuelKeuleers
+ 59–66
+ Character encoding systems have long overlooked the internal structure of characters. Ideographic Description Sequences, which explicitly represent spatial relations between character components, are a potential solution to this problem. In this paper, we illustrate the utility of Ideographic Description Sequences in computing edit distance and finding orthographic neighbors for Simplified Chinese characters. In addition, we explore the possibility of using Ideographic Description Sequences to encode spatial relations between components in other scripts.
+ 2024.cawl-1.8
+ wang-keuleers-2024-simplified
+
+
+
diff --git a/data/xml/2024.cl.xml b/data/xml/2024.cl.xml
new file mode 100644
index 0000000000..1ef12c5465
--- /dev/null
+++ b/data/xml/2024.cl.xml
@@ -0,0 +1,127 @@
+
+
+
+
+ Computational Linguistics, Volume 50, Issue 1 - March 2024
+ MIT Press
+ Cambridge, MA
+ March
+ 2024
+ cl
+ 50
+ 1
+
+
+ My Big, Fat 50-Year Journey
+ MarthaPalmer
+ 10.1162/coli_a_00499
+ My most heartfelt thanks to ACL for this tremendous honor. I’m completely thrilled. I cannot tell you how surprised I was when I got Iryna’s email. It is amazing that my first ACL conference since 2019 in Florence includes this award. What a wonderful way to be back with all of my friends and family here at ACL. I’m going to tell you about my big fat 50-year journey. What have I been doing for the last 50 years? Well, finding meaning, quite literally in words. Or in other words, exploring how computational lexical semantics can support natural language understanding. This is going to be quick. Hold onto your hats, here we go.
+ 1–24
+ 2024.cl-1.1
+ palmer-2024-big
+
+
+ Rethinking the Exploitation of Monolingual Data for Low-Resource Neural Machine Translation
+ JianhuiPang
+ BaosongYang*
+ Derek FaiWong*
+ YuWan
+ DayihengLiu
+ Lidia SamChao
+ JunXie
+ 10.1162/coli_a_00496
+ The utilization of monolingual data has been shown to be a promising strategy for addressing low-resource machine translation problems. Previous studies have demonstrated the effectiveness of techniques such as back-translation and self-supervised objectives, including masked language modeling, causal language modeling, and denoising autoencoding, in improving the performance of machine translation models. However, the manner in which these methods contribute to the success of machine translation tasks and how they can be effectively combined remains an under-researched area. In this study, we carry out a systematic investigation of the effects of these techniques on linguistic properties through the use of probing tasks, including source language comprehension, bilingual word alignment, and translation fluency. We further evaluate the impact of pre-training, back-translation, and multi-task learning on bitexts of varying sizes. Our findings inform the design of more effective pipelines for leveraging monolingual data in extremely low-resource and low-resource machine translation tasks. Experiment results show consistent performance gains in seven translation directions, which provide further support for our conclusions and understanding of the role of monolingual data in machine translation.
+ 25–47
+ 2024.cl-1.2
+ pang-etal-2024-rethinking
+
+
+ How Is a “Kitchen Chair” like a “Farm Horse”? Exploring the Representation of Noun-Noun Compound Semantics in Transformer-based Language Models
+ MarkOrmerod
+ Jesús Martínezdel Rincón
+ BarryDevereux
+ 10.1162/coli_a_00495
+ Despite the success of Transformer-based language models in a wide variety of natural language processing tasks, our understanding of how these models process a given input in order to represent task-relevant information remains incomplete. In this work, we focus on semantic composition and examine how Transformer-based language models represent semantic information related to the meaning of English noun-noun compounds. We probe Transformer-based language models for their knowledge of the thematic relations that link the head nouns and modifier words of compounds (e.g., KITCHEN CHAIR: a chair located in a kitchen). Firstly, using a dataset featuring groups of compounds with shared lexical or semantic features, we find that token representations of six Transformer-based language models distinguish between pairs of compounds based on whether they use the same thematic relation. Secondly, we utilize fine-grained vector representations of compound semantics derived from human annotations, and find that token vectors from several models elicit a strong signal of the semantic relations used in the compounds. In a novel “compositional probe” setting, where we compare the semantic relation signal in mean-pooled token vectors of compounds to mean-pooled token vectors when the two constituent words appear in separate sentences, we find that the Transformer-based language models that best represent the semantics of noun-noun compounds also do so substantially better than in the control condition where the two constituent words are processed separately. Overall, our results shed light on the ability of Transformer-based language models to support compositional semantic processes in representing the meaning of noun-noun compounds.
+ 49–81
+ 2024.cl-1.3
+ ormerod-etal-2024-kitchen
+
+
+ Universal Generation for Optimality Theory Is PSPACE-Complete
+ SophieHao
+ 10.1162/coli_a_00494
+ This article shows that the universal generation problem for Optimality Theory (OT) is PSPACE-complete. While prior work has shown that universal generation is at least NP-hard and at most EXPSPACE-hard, our results place universal generation in between those two classes, assuming that NP ≠ PSPACE. We additionally show that when the number of constraints is bounded in advance, universal generation is at least NL-hard and at most NP^NP-hard. Our proofs rely on a close connection between OT and the intersection non-emptiness problem for finite automata, which is PSPACE-complete in general and NL-complete when the number of automata is bounded. Our analysis shows that constraint interaction is the main contributor to the complexity of OT: The ability to factor transformations into simple, interacting constraints allows OT to furnish compact descriptions of intricate phonological phenomena.
+ 83–117
+ 2024.cl-1.4
+ hao-2024-universal
+
+
+ Analyzing Semantic Faithfulness of Language Models via Input Intervention on Question Answering
+ AkshayChaturvedi
+ SwarnadeepBhar
+ SoumadeepSaha
+ UtpalGarain
+ NicholasAsher
+ 10.1162/coli_a_00493
+ Transformer-based language models have been shown to be highly effective for several NLP tasks. In this article, we consider three transformer models, BERT, RoBERTa, and XLNet, in both small and large versions, and investigate how faithful their representations are with respect to the semantic content of texts. We formalize a notion of semantic faithfulness, in which the semantic content of a text should causally figure in a model’s inferences in question answering. We then test this notion by observing a model’s behavior on answering questions about a story after performing two novel semantic interventions—deletion intervention and negation intervention. While transformer models achieve high performance on standard question answering tasks, we show that they fail to be semantically faithful once we perform these interventions for a significant number of cases (∼ 50% for deletion intervention, and ∼ 20% drop in accuracy for negation intervention). We then propose an intervention-based training regime that can mitigate the undesirable effects for deletion intervention by a significant margin (from ∼ 50% to ∼ 6%). We analyze the inner-workings of the models to better understand the effectiveness of intervention-based training for deletion intervention. But we show that this training does not attenuate other aspects of semantic unfaithfulness such as the models’ inability to deal with negation intervention or to capture the predicate–argument structure of texts. We also test InstructGPT, via prompting, for its ability to handle the two interventions and to capture predicate–argument structure. While InstructGPT models do achieve very high performance on predicate–argument structure task, they fail to respond adequately to our deletion and negation interventions.
+ 119–155
+ 2024.cl-1.5
+ chaturvedi-etal-2024-analyzing
+
+
+ On the Role of Morphological Information for Contextual Lemmatization
+ OliaToporkov
+ RodrigoAgerri
+ 10.1162/coli_a_00497
+ Lemmatization is a natural language processing (NLP) task that consists of producing, from a given inflected word, its canonical form or lemma. Lemmatization is one of the basic tasks that facilitate downstream NLP applications, and is of particular importance for highly inflected languages. Given that the process to obtain a lemma from an inflected word can be explained by looking at its morphosyntactic category, including fine-grained morphosyntactic information to train contextual lemmatizers has become common practice, without considering whether that is the optimum in terms of downstream performance. In order to address this issue, in this article we empirically investigate the role of morphological information to develop contextual lemmatizers in six languages within a varied spectrum of morphological complexity: Basque, Turkish, Russian, Czech, Spanish, and English. Furthermore, and unlike the vast majority of previous work, we also evaluate lemmatizers in out-of-domain settings, which constitutes, after all, their most common application. The results of our study are rather surprising. It turns out that providing lemmatizers with fine-grained morphological features during training is not that beneficial, not even for agglutinative languages. In fact, modern contextual word representations seem to implicitly encode enough morphological information to obtain competitive contextual lemmatizers without seeing any explicit morphological signal. Moreover, our experiments suggest that the best lemmatizers out-of-domain are those using simple UPOS tags or those trained without morphology and, lastly, that current evaluation practices for lemmatization are not adequate to clearly discriminate between models.
+ 157–191
+ 2024.cl-1.6
+ toporkov-agerri-2024-role
+
+
+ Stance Detection with Explanations
+ Rudra RanajeeSaha
+ Laks V. S.Lakshmanan
+ Raymond T.Ng
+ 10.1162/coli_a_00501
+ Identification of stance has recently gained a lot of attention with the extreme growth of fake news and filter bubbles. Over the last decade, many feature-based and deep-learning approaches have been proposed to solve stance detection. However, almost none of the existing works focus on providing a meaningful explanation for their prediction. In this work, we study stance detection with an emphasis on generating explanations for the predicted stance by capturing the pivotal argumentative structure embedded in a document. We propose to build a stance tree that utilizes rhetorical parsing to construct an evidence tree and to use Dempster Shafer Theory to aggregate the evidence. Human studies show that our unsupervised technique of generating stance explanations outperforms the SOTA extractive summarization method in terms of informativeness, non-redundancy, coverage, and overall quality. Furthermore, experiments show that our explanation-based stance prediction excels or matches the performance of the SOTA model on various benchmark datasets.
+ 193–235
+ 2024.cl-1.7
+ saha-etal-2024-stance
+
+
+ Can Large Language Models Transform Computational Social Science?
+ CalebZiems
+ WilliamHeld
+ OmarShaikh
+ JiaaoChen
+ ZhehaoZhang
+ DiyiYang
+ 10.1162/coli_a_00502
+ Large language models (LLMs) are capable of successfully performing many language processing tasks zero-shot (without training data). If zero-shot LLMs can also reliably classify and explain social phenomena like persuasiveness and political ideology, then LLMs could augment the computational social science (CSS) pipeline in important ways. This work provides a road map for using LLMs as CSS tools. Towards this end, we contribute a set of prompting best practices and an extensive evaluation pipeline to measure the zero-shot performance of 13 language models on 25 representative English CSS benchmarks. On taxonomic labeling tasks (classification), LLMs fail to outperform the best fine-tuned models but still achieve fair levels of agreement with humans. On free-form coding tasks (generation), LLMs produce explanations that often exceed the quality of crowdworkers’ gold references. We conclude that the performance of today’s LLMs can augment the CSS research pipeline in two ways: (1) serving as zero-shot data annotators on human annotation teams, and (2) bootstrapping challenging creative generation tasks (e.g., explaining the underlying attributes of a text). In summary, LLMs are poised to meaningfully participate in social science analysis in partnership with humans.
+ 237–291
+ 2024.cl-1.8
+ ziems-etal-2024-large
+
+
+ Language Model Behavior: A Comprehensive Survey
+ Tyler A.Chang
+ Benjamin K.Bergen
+ 10.1162/coli_a_00492
+ Transformer language models have received widespread public attention, yet their generated text is often surprising even to NLP researchers. In this survey, we discuss over 250 recent studies of English language model behavior before task-specific fine-tuning. Language models possess basic capabilities in syntax, semantics, pragmatics, world knowledge, and reasoning, but these capabilities are sensitive to specific inputs and surface features. Despite dramatic increases in generated text quality as models scale to hundreds of billions of parameters, the models are still prone to unfactual responses, commonsense errors, memorized text, and social biases. Many of these weaknesses can be framed as over-generalizations or under-generalizations of learned patterns in text. We synthesize recent results to highlight what is currently known about large language model capabilities, thus providing a resource for applied work and for research in adjacent fields that use language models.
+ 293–350
+ 2024.cl-1.9
+ chang-bergen-2024-language
+
+
+ Polysemy—Evidence from Linguistics, Behavioral Science, and Contextualized Language Models
+ JanoschHaber
+ MassimoPoesio
+ 10.1162/coli_a_00500
+ Polysemy is the type of lexical ambiguity where a word has multiple distinct but related interpretations. In the past decade, it has been the subject of a great many studies across multiple disciplines including linguistics, psychology, neuroscience, and computational linguistics, which have made it increasingly clear that the complexity of polysemy precludes simple, universal answers, especially concerning the representation and processing of polysemous words. But fuelled by the growing availability of large, crowdsourced datasets providing substantial empirical evidence; improved behavioral methodology; and the development of contextualized language models capable of encoding the fine-grained meaning of a word within a given context, the literature on polysemy recently has developed more complex theoretical analyses. In this survey we discuss these recent contributions to the investigation of polysemy against the backdrop of a long legacy of research across multiple decades and disciplines. Our aim is to bring together different perspectives to achieve a more complete picture of the heterogeneity and complexity of the phenomenon of polysemy. Specifically, we highlight evidence supporting a range of hybrid models of the mental processing of polysemes. These hybrid models combine elements from different previous theoretical approaches to explain patterns and idiosyncrasies in the processing of polysemes that the best known models so far have failed to account for.
Our literature review finds that (i) traditional analyses of polysemy can be limited in their generalizability by loose definitions and selective materials; (ii) linguistic tests provide useful evidence on individual cases, but fail to capture the full range of factors involved in the processing of polysemous sense extensions; and (iii) recent behavioral (psycho) linguistics studies, large-scale annotation efforts, and investigations leveraging contextualized language models provide accumulating evidence suggesting that polysemous sense similarity covers a wide spectrum between identity of sense and homonymy-like unrelatedness of meaning. We hope that the interdisciplinary account of polysemy provided in this survey inspires further fundamental research on the nature of polysemy and better equips applied research to deal with the complexity surrounding the phenomenon, for example, by enabling the development of benchmarks and testing paradigms for large language models informed by a greater portion of the rich evidence on the phenomenon currently available.
+ 351–417
+ 2024.cl-1.10
+ haber-poesio-2024-polysemy
+
+
+
diff --git a/data/xml/2024.cl4health.xml b/data/xml/2024.cl4health.xml
new file mode 100644
index 0000000000..a80a42db68
--- /dev/null
+++ b/data/xml/2024.cl4health.xml
@@ -0,0 +1,414 @@
+
+
+
+
+ Proceedings of the First Workshop on Patient-Oriented Language Processing (CL4Health) @ LREC-COLING 2024
+ DinaDemner-Fushman
+ SophiaAnaniadou
+ PaulThompson
+ BrianOndov
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.cl4health-1
+ cl4health
+ ws
+
+
+ 2024.cl4health-1.0
+ cl4health-2024-patient
+
+
+ Improving Sign Language Production in the Healthcare Domain Using UMLS and Multi-task Learning
+ Jonathan DavidMutal
+ RaphaelRubino
+ PierretteBouillon
+ BastienDavid
+ JohannaGerlach
+ IreneStrasly
+ 1–7
+ This paper presents a study on Swiss-French sign language production in the medical domain. In emergency care settings, a lack of clear communication can interfere with accurate delivery of health-related services. For patients communicating with sign language, equal access to healthcare remains an issue. While previous work has explored producing sign language gloss from a source text, we propose to extend this approach to produce a multichannel sign language output given a written French input. Furthermore, we extend our approach with a multi-task framework allowing us to include the Unified Medical Language System (UMLS) in our model. Results show that the introduction of UMLS in the training data improves model accuracy by 13.64 points.
+ 2024.cl4health-1.1
+ mutal-etal-2024-improving
+
+
+ It’s Difficult to Be Neutral – Human and LLM-based Sentiment Annotation of Patient Comments
+ PetterMæhlum
+ DavidSamuel
+ Rebecka MariaNorman
+ ElmaJelin
+ Øyvind AndresenBjertnæs
+ LiljaØvrelid
+ ErikVelldal
+ 8–19
+ Sentiment analysis is an important tool for aggregating patient voices, in order to provide targeted improvements in healthcare services. A prerequisite for this is the availability of in-domain data annotated for sentiment. This article documents an effort to add sentiment annotations to free-text comments in patient surveys collected by the Norwegian Institute of Public Health (NIPH). However, annotation can be a time-consuming and resource-intensive process, particularly when it requires domain expertise. We therefore also evaluate a possible alternative to human annotation, using large language models (LLMs) as annotators. We perform an extensive evaluation of the approach for two openly available pretrained LLMs for Norwegian, experimenting with different configurations of prompts and in-context learning, comparing their performance to human annotators. We find that even for zero-shot runs, models perform well above the baseline for binary sentiment, but still cannot compete with human annotators on the full dataset.
+ 2024.cl4health-1.2
+ maehlum-etal-2024-difficult
+
+
+ Simulating Diverse Patient Populations Using Patient Vignettes and Large Language Models
+ DanielReichenpfader
+ KerstinDenecke
+ 20–25
+ Ensuring equitable access to digital therapeutics (DTx) is essential to avoid healthcare inequalities in an era of increasing digitization. This requires DTx to be tested with users from diverse populations, which is often not realistic due to time and resource constraints. In this paper, we propose the use of large language models (LLMs) to simulate diverse patients. Specifically, we manually create a patient vignette that characterizes a specific population group. Variations of this vignette are used for role-prompting a commercial LLM, GPT-4, instructing the LLM to take on the role described in the patient vignette and act accordingly. We investigate if the LLM stays in its given role. To do this, we simulate a medical anamnesis interview with the role-prompted LLM and analyze its responses for compliance, coherence, correctness, containment, and clarification. Our results show that GPT-4 generates compliant, coherent and clinically valid responses, including information that is not explicitly stated in the provided patient vignette.
+ 2024.cl4health-1.3
+ reichenpfader-denecke-2024-simulating
+
+
+ Annotating Emotions in Acquired Brain Injury Patients’ Narratives
+ SaloméKlein
+ AmaliaTodirascu
+ HélèneVassiliadou
+ MarieKuppelin
+ JoffreyBecart
+ ThalassioBriand
+ ClaraCoridon
+ FrancineGerhard-Krait
+ JoéLaroche
+ JeanUlrich
+ AgataKrasny-Pacini
+ 26–36
+ In this article, we aim to measure the patients’ progress in recognizing and naming emotions by capturing a variety of phenomena that express emotion in discourse. To do so, we introduce an emotion annotation scheme adapted for Acquired Brain Injury (ABI) patients’ narratives. We draw on recent research outcomes in line with linguistic and psychological theories of emotion in the development of French resources for Natural Language Processing (NLP). From this perspective and following Battistelli et al. (2022) guidelines, our protocol considers several means of expressing emotions, including prototypical expressions as well as implicit means. Its originality lies in the methodology adopted for its creation, as we combined, adapted, and tested several previous annotation schemes to create a tool tailored to our spoken clinical French corpus and its unique characteristics and challenges.
+ 2024.cl4health-1.4
+ klein-etal-2024-annotating
+
+
+ Structuring Clinical Notes of Italian ST-elevation Myocardial Infarction Patients
+ VittorioTorri
+ SaraMazzucato
+ StefanoDalmiani
+ UmbertoParadossi
+ ClaudioPassino
+ SaraMoccia
+ SilvestroMicera
+ FrancescaIeva
+ 37–43
+ In recent years, it has become common for patients to get full access to their Electronic Health Records (EHRs), thanks to the advancements in the EHR systems of many healthcare providers. While this access empowers patients and doctors with comprehensive and real-time health information, it also introduces new challenges, in particular due to the unstructured nature of much of the information within EHRs. To address this, we propose a pipeline to structure clinical notes, providing patients with a clear and concise overview of their health data and its longitudinal evolution, and allowing clinicians to focus more on patient care during consultations. In this paper, we present preliminary results on extracting structured information from anamneses of patients diagnosed with ST-Elevation Myocardial Infarction at an Italian hospital. Our pipeline exploits text classification models to extract relevant clinical variables, comparing rule-based, recurrent neural network, and BERT-based models. While various approaches for Italian data have utilized ontologies or knowledge graphs, our work represents the first attempt to develop this type of pipeline. The results for the extraction of most variables are satisfactory (f1-score > 0.80), with the exception of the rarest values of certain variables, for which we propose future research directions.
+ 2024.cl4health-1.5
+ torri-etal-2024-structuring
+
+
+ Towards AI-supported Health Communication in Plain Language: Evaluating Intralingual Machine Translation of Medical Texts
+ SilvanaDeilen
+ EkaterinaLapshinova-Koltunski
+ SergioHernández Garrido
+ ChristianeMaaß
+ JulianHörner
+ VanessaTheel
+ SophieZiemer
+ 44–53
+ In this paper, we describe the results of a study on the evaluation of intralingual machine translation. The study focuses on machine translations of medical texts into Plain German. The automatically simplified texts were compared with manually simplified texts (i.e., simplified by human experts) as well as with the underlying, unsimplified source texts. We analyse the quality of the translations based on different criteria, such as correctness, readability, and syntactic complexity. The study revealed that the machine translations were easier to read than the source texts, but contained a higher number of complex syntactic relations than the human translations. Furthermore, we identified various types of mistakes. These included not only grammatical mistakes but also content-related mistakes that resulted, for example, from mistranslations of grammatical structures, ambiguous words or numbers, omissions of relevant prefixes or negation, and incorrect explanations of technical terms.
+ 2024.cl4health-1.6
+ deilen-etal-2024-towards
+
+
+ Large Language Models as Drug Information Providers for Patients
+ LucaGiordano
+ Maria Piadi Buono
+ 54–63
+ Recently, significant interest has arisen in the application of Large Language Models (LLMs) in medical settings to enhance various aspects of healthcare. In particular, the application of such models to improve knowledge access for both clinicians and patients seems very promising but is still far from perfect. In this paper, we present a preliminary evaluation of LLMs as drug information providers to support patients in drug administration. We focus on posology, namely dosage quantity and prescription, contraindications, and adverse drug reactions, and run an experiment on the Italian language to assess both the trustworthiness of the outputs and their readability. The results show that different types of errors affect the LLM answers. In some cases, the model does not recognize the drug name, due to the presence of synonymous words, or it provides untrustworthy information, caused by intrinsic hallucinations. Overall, the complexity of the language is lower, and this could contribute to making medical information more accessible to lay people.
+ 2024.cl4health-1.7
+ giordano-di-buono-2024-large
+
+
+ Towards Generation of Personalised Health Intervention Messages
+ ClaraWan Ching Ho
+ VolhaPetukhova
+ 64–72
+ Self-care is essential in managing chronic diseases when patients cannot always be monitored by medical staff. It fills a gap by providing patients with advice on improving their condition in day-to-day practice. However, the effectiveness of self-interventions in encouraging healthy behaviour is limited, as they are often delivered in the same manner to all patients regardless of their demographics, personality, and individual preferences. In this paper, we propose strategies to generate personalized health intervention messages, departing from assumptions made by theories of social cognition and learning, planned behaviour, and information processing. The main task is then defined as a personalised argument generation task. Specifically, an existing well-performing Natural Language Generation (NLG) pipeline model is extended to modulate linguistic features by ranking generated texts based on individuals’ predicted preferences for persuasive messages. Results show that the model is capable of generating diverse intervention messages while preserving the original intended meaning. The modulated interventions were judged by human evaluators to be more understandable and to maintain the same level of convincingness as human-written texts. However, the generated personalised interventions did not show significant improvements in the power to change health-related attitudes and/or behaviour compared to their non-personalised counterparts. We attribute this to the fact that the human data collected for the model’s training was rather limited in size and variation.
+ 2024.cl4health-1.8
+ wan-ching-ho-petukhova-2024-towards
+
+
+ Analysing Emotions in Cancer Narratives: A Corpus-Driven Approach
+ Daisy MonikaLal
+ PaulRayson
+ Sheila A.Payne
+ YufengLiu
+ 73–83
+ Cancer not only affects a patient’s physical health, but it can also elicit a wide spectrum of intense emotions in patients, friends, and family members. People with cancer and their carers (family member, partner, or friend) are increasingly turning to the web for information and support. Despite the expansion of sentiment analysis in the context of social media and healthcare, there is relatively less research on patient narratives, which are longer, more complex texts, and difficult to assess. In this exploratory work, we examine how patients and carers express their feelings about various aspects of cancer (treatments and stages). The objective of this paper is to illustrate with examples the nature of language in the clinical domain, as well as the complexities of language when performing automatic sentiment and emotion analysis. We perform a linguistic analysis of a corpus of cancer narratives collected from Reddit. We examine the performance of five state-of-the-art models (T5, DistilBERT, Roberta, RobertaGo, and NRCLex) to see how well they match with human comparisons separated by linguistic and medical background. The corpus yielded several surprising results that could be useful to sentiment analysis NLP experts. The linguistic issues encountered were classified into four categories: statements expressing a variety of emotions, ambiguous or conflicting statements with contradictory emotions, statements requiring additional context, and statements in which sentiment and emotions can be inferred but are not explicitly mentioned.
+ 2024.cl4health-1.9
+ lal-etal-2024-analysing
+
+
+ Study of Medical Text Reading and Comprehension through Eye-Tracking Fixations
+ OksanaIvchenko
+ NataliaGrabar
+ 84–92
+ Reading plays a crucial role in cognitive processes, acting as the primary way in which people access and assimilate information. However, the ability to effectively comprehend and understand text is significantly influenced by various factors related to people and text types. We propose to study the reading easiness and comprehension of texts through the eye-tracking technology, which tracks gaze and records eye movement during reading. We concentrate on the study of eye-tracking measures related to fixations (average duration of fixations and number of fixations). The experiments are performed on several types of texts (clinical cases, encyclopedia articles related to the medical area, general-language texts, and simplified clinical cases). Eye-tracking measures are analysed quantitatively and qualitatively to draw the reading patterns and analyse how the reading differs across the text types.
+ 2024.cl4health-1.10
+ ivchenko-grabar-2024-study
+
+
+ A Neuro-Symbolic Approach to Monitoring Salt Content in Food
+ AnujaTayal
+ BarbaraDi Eugenio
+ DevikaSalunke
+ Andrew D.Boyd
+ Carolyn A.Dickens
+ Eulalia P.Abril
+ OlgaGarcia-Bedoya
+ Paula G.Allen-Meares
+ 93–103
+ We propose a dialogue system that enables heart failure patients to inquire about salt content in foods and help them monitor and reduce salt intake. Addressing the lack of specific datasets for food-based salt content inquiries, we develop a template-based conversational dataset. The dataset is structured to ask clarification questions to identify food items and their salt content. Our findings indicate that while fine-tuning transformer-based models on the dataset yields limited performance, the integration of Neuro-Symbolic Rules significantly enhances the system’s performance. Our experiments show that by integrating neuro-symbolic rules, our system achieves an improvement in joint goal accuracy of over 20% across different data sizes compared to naively fine-tuning transformer-based models.
+ 2024.cl4health-1.11
+ tayal-etal-2024-neuro
+
+
+ On Simplification of Discharge Summaries in Serbian: Facing the Challenges
+ AnđelkaZečević
+ MilicaĆulafić
+ StefanStojković
+ 104–108
+ The simplified information page (SIP) is a simplified discharge summary created to mitigate health risks caused by low medical comprehension. One of the most critical aspects of medical comprehension concerns interpreting medication instructions such as proper dosing, frequency, and duration. In our work, we examine the capacities of mainstream Large Language Models (LLMs) such as ChatGPT and Gemini to generate SIP-like medication-oriented pages based on the provided discharge summaries. We are sharing the initial qualitative assessments of our study based on a small collection of discharge summaries in Serbian, pointing to noticed inaccuracies, unfaithful content, and language quality. Hopefully, these findings might be helpful in addressing the multilingual perspective of patient-oriented language.
+ 2024.cl4health-1.12
+ zecevic-etal-2024-simplification
+
+
+ Medical-FLAVORS: A Figurative Language and Vocabulary Open Repository for Spanish in the Medical Domain
+ LuciaPitarch
+ EmmaAngles-Herrero
+ YufengLiu
+ Daisy MonikaLal
+ JorgeGracia
+ PaulRayson
+ JudithRietjens
+ 109–114
+ Metaphors shape the way we think by enabling the expression of one concept in terms of another one. For instance, cancer can be understood as a place from which one can go in and out, as a journey that one can traverse, or as a battle. Giving patients awareness of the way they refer to cancer and different narratives in which they can reframe it has been proven to be a key aspect when experiencing the disease. In this work, we propose a preliminary identification and representation of Spanish cancer metaphors using MIP (Metaphor Identification Procedure) and MetaNet. The created resource is the first openly available dataset for medical metaphors in Spanish. Thus, in the future, we expect to use it as the gold standard in automatic metaphor processing tasks, which will also serve to further populate the resource and understand how cancer is experienced and narrated.
+ 2024.cl4health-1.13
+ pitarch-etal-2024-medical
+
+
+ Generating Synthetic Documents with Clinical Keywords: A Privacy-Sensitive Methodology
+ SimonMeoni
+ ÉricDe la Clergerie
+ ThéoRyffel
+ 115–123
+ Electronic Health Records store valuable patient-staff interaction data. These notes, often unstructured to save healthcare personnel time, can be challenging to analyze manually. Proprietary online Large Language Models have demonstrated impressive results in analyzing EHR notes. However, Clinical NLP faces unique challenges due to the sensitive and specialized nature of the data. Sending patient information via external APIs poses privacy risks, and hospitals require customized NLP systems to align with their unique practices. To address these challenges, it is crucial to develop customized LLMs trained on specific datasets. To this end, we propose generating synthetic training data using keywords extracted without confidential information. Furthermore, we introduce a reward mechanism that iteratively refines the quality of synthetic documents. This involves scoring synthetic candidates against real clinical reports using a semantic textual similarity score and performing an alignment step to align the model with its best-scored utterances.
+ 2024.cl4health-1.14
+ meoni-etal-2024-generating
+
+
+ Building Certified Medical Chatbots: Overcoming Unstructured Data Limitations with Modular RAG
+ LeonardoSanna
+ PatrizioBellan
+ SimoneMagnolini
+ MarinaSegala
+ SabaGhanbari Haez
+ MonicaConsolandi
+ MauroDragoni
+ 124–130
+ Creating a certified conversational agent poses several issues. The need to manage fine-grained information delivery and the necessity to provide reliable medical information requires a notable effort, especially in dataset preparation. In this paper, we investigate the challenges of building a certified medical chatbot in Italian that provides information about pregnancy and early childhood. We show some negative initial results regarding the possibility of creating a certified conversational agent within the RASA framework starting from unstructured data. Finally, we propose a modular RAG model to implement a Large Language Model in a certified context, overcoming data limitations and enabling data collection on actual conversations.
+ 2024.cl4health-1.15
+ sanna-etal-2024-building
+
+
+ Towards Using Automatically Enhanced Knowledge Graphs to Aid Temporal Relation Extraction
+ TimotejKnez
+ SlavkoŽitnik
+ 131–136
+ Temporal relation extraction in medical document analysis is crucial for understanding patient histories and treatment outcomes. This paper introduces a novel approach leveraging a bimodal model integrating textual content and a knowledge graph, to enhance temporal relation extraction. The paper presents ongoing research in constructing an optimal knowledge graph by augmenting PrimeKG with dynamically expanded information using a language model-generated knowledge graph, and further personalize the information with patient-specific graphs tailored for relation prediction. The pipeline for constructing this enriched knowledge graph is detailed, aiming to improve the capabilities of temporal relation extraction models. The preliminary results show that adding a simple knowledge graph to the temporal relation extraction model can significantly increase the performance, achieving new state-of-the-art results. While the research in using enhanced knowledge graphs is still ongoing, this paper lays the groundwork for leveraging common knowledge to advance temporal relation extraction in medical contexts. This approach holds promise for enhancing the understanding of patient histories and treatment outcomes, potentially leading to improved healthcare decision-making and patient care.
+ 2024.cl4health-1.16
+ knez-zitnik-2024-towards
+
+
+ Experiments in Automated Generation of Discharge Summaries in Italian
+ LorenzoRuinelli
+ AmosColombo
+ MathildeRochat
+ Sotirios GeorgiosPopeskou
+ AndreaFranchini
+ SandraMitrović
+ Oscar WilliamLithgow
+ JosephCornelius
+ FabioRinaldi
+ 137–144
+ Hospital discharge letters are a fundamental component of patient management, as they provide the crucial information needed for patient post-hospital care. However, their creation is very demanding and resource intensive, as it requires consultation of several reports documenting the patient’s journey throughout their hospital stay. Given the increasing pressures on doctors’ time, tools that can draft a reasonable discharge summary, to be then reviewed and finalized by the experts, would be welcome. In this paper we present a comparative study exploring the possibility of automatic generation of discharge summaries within the context of a hospital in an Italian-speaking region and discuss quantitative and qualitative results. Despite some shortcomings, the obtained results show that a generic generative system such as ChatGPT is capable of producing discharge summaries which are relatively close to the human-generated ones, even in Italian.
+ 2024.cl4health-1.17
+ ruinelli-etal-2024-experiments
+
+
+ Evaluating LLMs for Temporal Entity Extraction from Pediatric Clinical Text in Rare Diseases Context
+ Judith JeyafreedaAndrew
+ MarcVincent
+ AnitaBurgun
+ NicolasGarcelon
+ 145–152
+ The aim of this work is to extract temporal entities from patients’ EHRs from a pediatric hospital specialising in rare diseases, thus allowing us to create a patient timeline relative to diagnosis. We aim to evaluate NLP tools and Large Language Models (LLMs) to test their application in the field of clinical studies, where data is limited and sensitive. We present a short annotation guideline for temporal entity identification. We then use the tool EDS-NLP, the language model CamemBERT-with-Dates, and the LLM Vicuna to extract temporal entities. We perform experiments using three different prompting techniques on the LLM Vicuna to evaluate the model thoroughly. We use a small dataset of 50 EHRs describing the evolution of rare diseases in patients to perform our experiments. We show that among the different methods to prompt an LLM, a decomposed prompting structure on Vicuna produces the best results for temporal entity recognition. The LLM learns from examples in the prompt, and decomposing one prompt into several prompts allows the model to avoid confusion between the different entity types. Identifying the temporal entities in EHRs helps to build the timeline of a patient and to learn the evolution of a disease. This is specifically important in the case of rare diseases due to the limited number of available examples. In this paper, we show that this can be made possible with the use of language models and LLMs in a secure environment, thus preserving the privacy of the patient.
+ 2024.cl4health-1.18
+ andrew-etal-2024-evaluating
+
+
+ Generating Distributable Surrogate Corpus for Medical Multi-label Classification
+ SeijiShimizu
+ ShuntaroYada
+ ShokoWakamiya
+ EijiAramaki
+ 153–162
+ In medical and social media domains, annotated corpora are often hard to distribute due to copyright and privacy issues. To overcome this situation, we propose a new method to generate a surrogate corpus for a downstream task by using a text generation model. We chose a medical multi-label classification task, MedWeb, in which patient-generated short messages express multiple symptoms. We first fine-tuned text generation models with different prompting designs on the original corpus to obtain synthetic versions of that corpus. To assess the viability of the generated corpora for the downstream task, we compared the performance of multi-label classification models trained either on the original or the surrogate corpora. The results and the error analysis showed the difficulty of generating a surrogate corpus in multi-label settings, suggesting that text generation under complex conditions is not trivial. On the other hand, our experiment demonstrates that the corpus generated with sentinel-based prompting is comparatively viable in a single-label (multiclass) classification setting.
+ 2024.cl4health-1.19
+ shimizu-etal-2024-generating
+
+
+ CliniRes: Publicly Available Mapping of Clinical Lexical Resources
+ ElenaZotova
+ MontseCuadros
+ GermanRigau
+ 163–172
+ This paper presents a human-readable resource for mapping identifiers from various clinical knowledge bases. This resource is a version of the UMLS Metathesaurus enriched with WordNet 3.0 and 3.1 synsets, Wikidata items with their clinical identifiers, a SNOMED CT to ICD-10 mapping, and Spanish ICD-10 code descriptions. The main goal of the presented resource is to provide semantic interoperability across clinical concepts from various knowledge bases and to facilitate its integration into mapping tools. As a side effect, the mapping enriches already annotated medical corpora for entity recognition or entity linking tasks with new labels. We experiment with the entity linking task, using a corpus annotated both manually and with the mapping method, and demonstrate that a semi-automatic way of annotation may be used to create new labels. The resource is available in English and Spanish, although all languages of UMLS may be extracted. The new lexical resource is publicly available.
+ 2024.cl4health-1.20
+ zotova-etal-2024-clinires
+
+
+ MedDialog-FR: A French Version of the MedDialog Corpus for Multi-label Classification and Response Generation Related to Women’s Intimate Health
+ XingyuLiu
+ VincentSegonne
+ AidanMannion
+ DidierSchwab
+ LorraineGoeuriot
+ FrançoisPortet
+ 173–183
+ This article presents MedDialog-FR, a large, publicly available corpus of French medical conversations. Motivated by the lack of French dialogue corpora for data-driven dialogue systems and the paucity of available information related to women’s intimate health, we introduce an annotated corpus of question-and-answer dialogues between a real patient and a real doctor concerning women’s intimate health. The corpus is composed of about 20,000 dialogues automatically translated from the English version of MedDialog-EN. The corpus test set is composed of 1,400 dialogues that have been manually post-edited and annotated with 22 categories from the UMLS ontology. We also fine-tuned state-of-the-art reference models to automatically perform multi-label classification and response generation to give an initial performance benchmark and highlight the difficulty of the tasks.
+ 2024.cl4health-1.21
+ liu-etal-2024-meddialog
+
+
+ Exploring the Suitability of Transformer Models to Analyse Mental Health Peer Support Forum Data for a Realist Evaluation
+ MatthewCoole
+ PaulRayson
+ ZoeGlossop
+ FionaLobban
+ PaulMarshall
+ JohnVidler
+ 184–188
+ Mental health peer support forums have become widely used in recent years. The emerging mental health crisis and the COVID-19 pandemic have meant that finding a place online for support and advice when dealing with mental health issues is more critical than ever. The need to examine, understand and find ways to improve the support provided by mental health forums is vital in the current climate. As part of this, we present our initial explorations in using modern transformer models to detect four key concepts (connectedness, lived experience, empathy and gratitude), which we believe are essential to understanding how people use mental health forums and will serve as a basis for testing more expansive realist theories about mental health forums in the future. As part of this work, we also replicate previously published results on empathy utilising an existing annotated dataset and test the other concepts on our manually annotated dataset of mental health forum posts. These results serve as a basis for future research examining peer support forums.
+ 2024.cl4health-1.22
+ coole-etal-2024-exploring
+
+
+ Revisiting the MIMIC-IV Benchmark: Experiments Using Language Models for Electronic Health Records
+ JesusLovon-Melgarejo
+ ThouriaBen-Haddi
+ JulesDi Scala
+ Jose G.Moreno
+ LyndaTamine
+ 189–196
+ The lack of standardized evaluation benchmarks in the medical domain for text inputs can be a barrier to widely adopting and leveraging the potential of natural language models for health-related downstream tasks. This paper revisits an openly available MIMIC-IV benchmark for electronic health records (EHRs) to address this issue. First, we integrate the MIMIC-IV data within the Hugging Face datasets library to allow easy sharing and use of this collection. Second, we investigate the application of templates to convert EHR tabular data to text. Experiments using fine-tuned and zero-shot LLMs on the patient mortality prediction task show that fine-tuned text-based models are competitive with robust tabular classifiers. In contrast, zero-shot LLMs struggle to leverage EHR representations. This study underlines the potential of text-based approaches in the medical field and highlights areas for further improvement.
+ 2024.cl4health-1.23
+ lovon-melgarejo-etal-2024-revisiting
+
+
+ Unraveling Clinical Insights: A Lightweight and Interpretable Approach for Multimodal and Multilingual Knowledge Integration
+ KanimozhiUma
+ Marie-FrancineMoens
+ 197–203
+ In recent years, the analysis of clinical texts has evolved significantly, driven by the emergence of BERT-based language models such as PubMedBERT and ClinicalBERT, which have been tailored to the (bio)medical domain and rely on extensive archives of medical documents. While they boast high accuracy, their lack of interpretability and language transfer limitations restrict their clinical utility. To address this, we propose a new, lightweight graph-based embedding method designed specifically for radiology reports. This approach considers the report’s structure and content, connecting medical terms through the multilingual SNOMED Clinical Terms knowledge base. The resulting graph embedding reveals intricate relationships among clinical terms, enhancing both clinician comprehension and clinical accuracy without the need for large pre-training datasets. Demonstrating the versatility of our method, we apply this embedding to two tasks: disease and image classification in X-ray reports. In disease classification, our model competes effectively with BERT-based approaches, yet it is significantly smaller and requires less training data. Additionally, in image classification, we illustrate the efficacy of the graph embedding by leveraging cross-modal knowledge transfer, highlighting its applicability across diverse languages.
+ 2024.cl4health-1.24
+ uma-moens-2024-unraveling
+
+
+ Automated Question-Answer Generation for Evaluating RAG-based Chatbots
+ Juan JoséGonzález Torres
+ Mihai BogdanBîndilă
+ SebastiaanHofstee
+ DanielSzondy
+ Quang-HungNguyen
+ ShenghuiWang
+ GwennEnglebienne
+ 204–214
+ In this research, we propose a framework to generate human-like question-answer pairs with long or factoid answers automatically and, based on them, automatically evaluate the quality of Retrieval-Augmented Generation (RAG). Our framework can also create datasets that assess hallucination levels of Large Language Models (LLMs) by simulating unanswerable questions. We then apply the framework to create a dataset of question-answer (QA) pairs based on more than 1,000 leaflets about the medical and administrative procedures of a hospital. The dataset was evaluated by hospital specialists, who confirmed that more than 50% of the QA pairs are applicable. Finally, we show that our framework can be used to evaluate LLM performance, using Llama-2-13B fine-tuned in Dutch (Vanroy, 2023) with the generated dataset, and that the method appears promising for testing models on unanswerable and factoid questions.
+ 2024.cl4health-1.25
+ gonzalez-torres-etal-2024-automated
+
+
+ Speech Accommodation in Health-Care Interactions: Evidence Using a Mixed-Reality Platform
+ RoseBaker
+ Susan C.Bobb
+ Dai’ShaDowson
+ ElishaEanes
+ MakyahMcNeill
+ HannahRagsdale
+ AudreyEaves
+ Joseph G.Lee
+ KathrinRothermich
+ 215–219
+ Many people in the US use more than one language at home, yet English remains the dominant (L1) language in US society, which can complicate medical encounters. In this study we ask in what ways effective communication can be ensured in health care settings when speakers differ in language proficiency. One strategy people use is second language (L2) speech accommodation, which is characterized by slowed speech, less complex words, and clearer enunciation. We employ a mixed-reality platform called MURSION to document how a group of Physician Assistant students use speech accommodation during a healthcare encounter. MURSION is a computer-based virtual environment where participants interact with an Avatar controlled by a human interactor in a standardized environment. We record 5-minute interactions between the student and a high or low English proficiency Avatar. Our analyses evaluate lexical choices in L1-L2 interactions with SCOPE (South Carolina Psycholinguistic Metabase) and acoustic properties with PRAAT. Results show that clinical students use slower speech and high frequency words when speaking to a low proficiency virtual patient, indicating a sensitivity for the communicative needs of L2 English users. Speech accommodation results will contribute to communication training modules for clinicians to interact efficiently with linguistically diverse populations.
+ 2024.cl4health-1.26
+ baker-etal-2024-speech
+
+
+ Enhancing Consumer Health Question Reformulation: Chain-of-Thought Prompting Integrating Focus, Type, and User Knowledge Level
+ JooyeonLee
+ Luan HuyPham
+ ÖzlemUzuner
+ 220–228
+ In this paper, we explore consumer health question (CHQ) reformulation, focusing on enhancing the quality of question reformulation without considering interest shifts. Our study introduces the use of the NIH GARD website as a gold standard dataset for this specific task, emphasizing its relevance and applicability. Additionally, we developed other datasets consisting of related questions scraped from Google, Bing, and Yahoo. We augmented, evaluated and analyzed the various datasets, demonstrating that the reformulation task closely resembles the question entailment generation task. Our approach, which integrates the Focus and Type of consumer inquiries, represents a significant advancement in the field of question reformulation. We provide a comprehensive analysis of different methodologies, offering insights into the development of more effective and user-centric AI systems for consumer health support.
+ 2024.cl4health-1.27
+ lee-etal-2024-enhancing
+
+
+ Exploring the Challenges of Behaviour Change Language Classification: A Study on Semi-Supervised Learning and the Impact of Pseudo-Labelled Data
+ SelinaMeyer
+ MarcosFernandez-Pichel
+ DavidElsweiler
+ David E.Losada
+ 229–239
+ Automatic classification of behaviour change language can enhance conversational agents’ capabilities to adjust their behaviour based on users’ current situations and to encourage individuals to make positive changes. However, the lack of annotated language data of change-seekers hampers the performance of existing classifiers. In this study, we investigate the use of semi-supervised learning (SSL) to classify highly imbalanced texts around behaviour change. We assess the impact of including pseudo-labelled data from various sources and examine the balance between the amount of added pseudo-labelled data and the strictness of the inclusion criteria. Our findings indicate that while adding pseudo-labelled samples to the training data has limited classification impact, it does not significantly reduce performance regardless of the source of these new samples. This reinforces previous findings on the feasibility of applying classifiers trained on behaviour change language to diverse contexts.
+ 2024.cl4health-1.28
+ meyer-etal-2024-exploring
+
+
+ Development of a Benchmark Corpus for Medical Device Adverse Event Detection
+ SusmithaWunnava
+ David A.Harris
+ Florence T.Bourgeois
+ Timothy A.Miller
+ 240–245
+ The U.S. Food and Drug Administration (FDA) collects real-world adverse events, including device-associated deaths, injuries, and malfunctions, through passive reporting to the agency’s Manufacturer and User Facility Device Experience (MAUDE) database. However, this system’s full potential remains untapped given the extensive use of unstructured text in medical device adverse event reports and lack of FDA resources and expertise to properly analyze all available data. In this work, we focus on addressing this limitation through the development of an annotated benchmark corpus to support the design and development of state-of-the-art NLP approaches towards automatic extraction of device-related adverse event information from FDA Medical Device Adverse Event Reports. We develop a dataset of labeled medical device reports from a diverse set of high-risk device types that can be used for supervised machine learning. We develop annotation guidelines and manually annotate for nine entity types. The resulting dataset contains 935 annotated adverse event reports, containing 12,252 annotated spans across the nine entity types. The dataset developed in this work will be made publicly available upon publication.
+ 2024.cl4health-1.29
+ wunnava-etal-2024-development
+
+
+ Using BART to Automatically Generate Discharge Summaries from Swedish Clinical Text
+ NilsBerg
+ HerculesDalianis
+ 246–252
+ Documentation is a regular part of contemporary healthcare practices and one such documentation task is the creation of a discharge summary, which summarizes a care episode. However, to manually write discharge summaries is a time-consuming task, and research has shown that discharge summaries are often lacking in quality in various respects. To alleviate this problem, text summarization methods could be applied on text from electronic health records, such as patient notes, to automatically create a discharge summary. Previous research has been conducted on this topic on text in various languages and with various methods, but no such research has been conducted on Swedish text. In this paper, four datasets extracted from a Swedish clinical corpus were used to fine-tune four BART language models to perform the task of summarizing Swedish patient notes into a discharge summary. Out of these models, the best performing model was manually evaluated by a senior, now retired, nurse and clinical coder. The evaluation results show that the best performing model produces discharge summaries of overall low quality. This is possibly due to issues in the data extracted from the Health Bank research infrastructure, which warrants further work on this topic.
+ 2024.cl4health-1.30
+ berg-dalianis-2024-using
+
+
+ Biomedical Entity Linking for Dutch: Fine-tuning a Self-alignment BERT Model on an Automatically Generated Wikipedia Corpus
+ FonsHartendorp
+ TomSeinen
+ Erikvan Mulligen
+ SuzanVerberne
+ 253–263
+ Biomedical entity linking, a main component in automatic information extraction from health-related texts, plays a pivotal role in connecting textual entities (such as diseases, drugs and body parts mentioned by patients) to their corresponding concepts in a structured biomedical knowledge base. The task remains challenging despite recent developments in natural language processing. This report presents the first evaluated biomedical entity linking model for the Dutch language. We use MedRoBERTa.nl as base model and perform second-phase pretraining through self-alignment on a Dutch biomedical ontology extracted from the UMLS and Dutch SNOMED. We derive a corpus from Wikipedia of ontology-linked Dutch biomedical entities in context and fine-tune our model on this dataset. We evaluate our model on the Dutch portion of the Mantra GSC-corpus and achieve 54.7% classification accuracy and 69.8% 1-distance accuracy. We then perform a case study on a collection of unlabeled, patient-support forum data and show that our model is hampered by the limited quality of the preceding entity recognition step. Manual evaluation of a small sample indicates that of the correctly extracted entities, around 65% are linked to the correct concept in the ontology. Our results indicate that biomedical entity linking in a language other than English remains challenging, but our Dutch model can be used for high-level analysis of patient-generated text.
+ 2024.cl4health-1.31
+ hartendorp-etal-2024-biomedical
+
+
+ Unveiling Voices: Identification of Concerns in a Social Media Breast Cancer Cohort via Natural Language Processing
+ SwatiRajwal
+ Avinash KumarPandey
+ ZhishuoHan
+ AbeedSarker
+ 264–270
+ We leveraged a dataset of ∼1.5 million Twitter (now X) posts to develop a framework for analyzing breast cancer (BC) patients’ concerns and possible reasons for treatment discontinuation. Our primary objectives were threefold: (1) to curate and collect data from a BC cohort; (2) to identify topics related to uncertainty/concerns in BC-related posts; and (3) to conduct a sentiment intensity analysis of posts to identify and analyze negatively polarized posts. RoBERTa outperformed other models with a micro-averaged F1 score of 0.894 and a macro-averaged F1 score of 0.853 for (1). For (2), we used GPT-4 and BERTopic, and qualitatively analyzed posts under relevant topics. For (3), sentiment intensity analysis of posts followed by qualitative analyses shed light on potential reasons behind treatment discontinuation. Our work demonstrates the utility of social media mining to discover BC patient concerns. Information derived from the cohort data may help design strategies in the future for increasing treatment compliance.
+ 2024.cl4health-1.32
+ rajwal-etal-2024-unveiling
+
+
+ Intent Detection and Entity Extraction from Biomedical Literature
+ AnkanMullick
+ MukurGupta
+ PawanGoyal
+ 271–278
+ Biomedical queries have become increasingly prevalent in web searches, reflecting the growing interest in accessing biomedical literature. Despite recent research on large language models (LLMs) motivated by endeavors to attain generalized intelligence, their efficacy in replacing task- and domain-specific natural language understanding approaches remains questionable. In this paper, we address this question by conducting a comprehensive empirical evaluation of intent detection and named entity recognition (NER) tasks from biomedical text. We show that Supervised Fine-Tuned approaches are still relevant and more effective than general-purpose LLMs. Biomedical transformer models such as PubMedBERT can surpass ChatGPT on the NER task with only 5 supervised examples.
+ 2024.cl4health-1.33
+ mullick-etal-2024-intent
+
+
+
diff --git a/data/xml/2024.clpsych.xml b/data/xml/2024.clpsych.xml
index eb10dac684..5babfa07c2 100644
--- a/data/xml/2024.clpsych.xml
+++ b/data/xml/2024.clpsych.xml
@@ -46,6 +46,7 @@
Depression is a global concern suffered by millions of people, significantly impacting their thoughts and behavior. Over the years, heightened awareness, spurred by health campaigns and other initiatives, has driven the study of this disorder using data collected from social media platforms. In our research, we aim to gauge the severity of symptoms related to depression among social media users. The ultimate goal is to estimate the user’s responses to a well-known standardized psychological questionnaire, the Beck Depression Inventory-II (BDI). This is a 21-question multiple-choice self-report inventory that covers multiple topics about how the subject has been feeling. Mining users’ social media interactions and understanding psychological states represents a challenging goal. To that end, we present here an approach based on search and summarization that extracts multiple BDI-biased summaries from the thread of users’ publications. We also leverage a robust large language model to estimate the potential answer for each BDI item. Our method involves several steps. First, we employ a search strategy based on sentence similarity to obtain pertinent extracts related to each topic in the BDI questionnaire. Next, we compile summaries of the content of these groups of extracts. Last, we exploit ChatGPT to respond to the 21 BDI questions, using the summaries as contextual information in the prompt. Our model has undergone rigorous evaluation across various depression datasets, yielding encouraging results. The experimental report includes a comparison against an assessment done by expert humans and competes favorably with state-of-the-art methods.
2024.clpsych-1.2
aragon-etal-2024-delving
+
How Can Client Motivational Language Inform Psychotherapy Agents?
@@ -56,6 +57,7 @@
Within Motivational Interviewing (MI), client utterances are coded as for or against a certain behaviour change, along with commitment strength; this is essential to ensure therapists soften rather than persist in goal-related actions in the face of resistance. Prior works in MI agents have been scripted or semi-scripted, limiting users’ natural language expressions. With the aim of automating the MI interactions, we propose and explore the task of automated identification of client motivational language. Employing Large Language Models (LLMs), we compare in-context learning (ICL) and instruction fine-tuning (IFT) with varying training sizes for this identification task. Our experiments show that both approaches can learn under low-resourced settings. Our results demonstrate that IFT, though cheaper, is more stable to prompt choice, and yields better performance with more data. Given the detected motivation, we further present an approach to the analysis of therapists’ strategies for balancing building rapport with clients with advancing the treatment plan. A framework of MI agents is developed using insights from the data and the psychotherapy literature.
2024.clpsych-1.3
hoang-etal-2024-client
+
Linguistic markers of schizophrenia: a case study of Robert Walser
@@ -79,6 +81,7 @@
Therapist Self-Disclosure (TSD) within the context of psychotherapy entails the revelation of personal information by the therapist. The ongoing scholarly discourse surrounding the utility of TSD, spanning from the inception of psychotherapy to the present day, has underscored the need for greater specificity in conceptualizing TSD. This inquiry has yielded more refined classifications within the TSD domain, with a consensus emerging on the distinction between immediate and non-immediate TSD, each of which plays a distinct role in the therapeutic process. Despite this progress in the field of psychotherapy, the Natural Language Processing (NLP) domain currently lacks methodological solutions or explorations for such scenarios. This lacuna can be partly due to the difficulty of attaining publicly available clinical data. To address this gap, this paper presents an innovative NLP-based approach that formalizes TSD as an NLP task. The proposed methodology involves the creation of publicly available, expert-annotated test sets designed to simulate therapist utterances, and the employment of NLP techniques for evaluation purposes. By integrating insights from psychotherapy research with NLP methodologies, this study aims to catalyze advancements in both NLP and psychotherapy research.
2024.clpsych-1.5
shapira-alfi-yogev-2024-therapist
+
Ethical thematic and topic modelling analysis of sleep concerns in a social media derived suicidality dataset
@@ -89,6 +92,7 @@
Objective: A thematic and topic modelling analysis of sleep concerns in a social media derived, privacy-preserving, suicidality dataset. This forms the basis for an exploration of sleep as a potential computational linguistic signal in suicide prevention. Background: Suicidal ideation is a limited signal for suicide. Developments in computational linguistics and mental health datasets afford an opportunity to investigate additional signals and to consider the broader clinical ethical design implications. Methodology: A clinician-led integration of reflexive thematic analysis, with machine learning topic modelling (BERTopic), and the purposeful sampling of the University of Maryland Suicidality Dataset. Results: Sleep as a place of refuge and escape, revitalisation for exhaustion, and risk and vulnerability were generated as core themes in an initial thematic analysis of 546 posts. BERTopic analysing 21,876 sleep references in 16,791 posts facilitated the production of 40 topics that were clinically interpretable, relevant, and thematically aligned to a level that exceeded original expectations. Privacy and synthetic representative data, reproducibility, validity and stochastic variability of results, and a multi-signal formulation perspective, are highlighted as key research and clinical issues.
2024.clpsych-1.6
orr-etal-2024-ethical
+
Automatic Annotation of Dream Report’s Emotional Content with Large Language Models
@@ -102,6 +106,7 @@
In the field of dream research, the study of dream content typically relies on the analysis of verbal reports provided by dreamers upon awakening from their sleep. This task is classically performed through manual scoring provided by trained annotators, at a great time expense. While a consistent body of work suggests that natural language processing (NLP) tools can support the automatic analysis of dream reports, proposed methods lacked the ability to reason over a report’s full context and required extensive data pre-processing. Furthermore, in most cases, these methods were not validated against standard manual scoring approaches. In this work, we address these limitations by adopting large language models (LLMs) to study and replicate the manual annotation of dream reports, using a mixture of off-the-shelf and bespoke approaches, with a focus on references to reports’ emotions. Our results show that the off-the-shelf method achieves a low performance probably in light of inherent linguistic differences between reports collected in different (groups of) individuals. On the other hand, the proposed bespoke text classification method achieves a high performance, which is robust against potential biases. Overall, these observations indicate that our approach could find application in the analysis of large dream datasets and may favour reproducibility and comparability of results across studies.
2024.clpsych-1.7
bertolini-etal-2024-automatic
+
Explainable Depression Detection Using Large Language Models on Social Media Data
@@ -112,6 +117,7 @@
Due to the rapid growth of user interaction on different social media platforms, publicly available social media data has increased substantially. The sheer amount of data and level of personal information being shared on such platforms has made analyzing textual information to predict mental disorders such as depression a reliable preliminary step when it comes to psychometrics. In this study, we first proposed a system to search for texts that are related to depression symptoms from the Beck’s Depression Inventory (BDI) questionnaire, and to provide a ranking for further investigation in a second step. Then, in this second step, we address the even more challenging task of automatic depression level detection, using writings and voluntary answers provided by users on Reddit. Several Large Language Models (LLMs) were applied in experiments. Our proposed system based on LLMs can generate both predictions and explanations for each question. By combining two LLMs for different questions, we achieved better performance on three of four metrics compared to the state-of-the-art and remained competitive on the one remaining metric. In addition, our system is explainable on two levels: first, knowing the answers to the BDI questions provides clues about the possible symptoms that could lead to a clinical diagnosis of depression; second, our system can explain the predicted answer for each question.
2024.clpsych-1.8
wang-etal-2024-explainable
+
Analysing relevance of Discourse Structure for Improved Mental Health Estimation
@@ -122,6 +128,7 @@
Automated depression estimation has received significant research attention in recent years as a result of its growing impact on the global community. Within the context of studies based on patient-therapist interview transcripts, most researchers treat the dyadic discourse as a sequence of unstructured sentences, thus ignoring the discourse structure within the learning process. In this paper we propose Multi-view architectures that divide the input transcript into patient and therapist views based on sentence type in an attempt to utilize symmetric discourse structure for improved model performance. Experiments on the DAIC-WOZ dataset for the binary classification task within depression estimation show the advantages of the Multi-view architecture over sequential input representations. Our model also outperforms the current state-of-the-art results and provides new SOTA performance on the test set of the DAIC-WOZ dataset.
2024.clpsych-1.9
agarwal-etal-2024-analysing
+
Using Daily Language to Understand Drinking: Multi-Level Longitudinal Differential Language Analysis
@@ -139,6 +146,7 @@
Analyses for linking language with psychological factors or behaviors predominately treat linguistic features as a static set, working with a single document per person or aggregating across multiple posts (e.g. on social media) into a single set of features. This limits language to mostly shed light on between-person differences rather than changes in behavior within-person. Here, we collected a novel dataset of daily surveys where participants were asked to describe their experienced well-being and report the number of alcoholic beverages they had within the past 24 hours. Through this data, we first build a multi-level forecasting model that is able to capture within-person change and leverage both the psychological features of the person and daily well-being responses. Then, we propose a longitudinal version of differential language analysis that finds patterns associated with drinking more (e.g. social events) and less (e.g. task-oriented), as well as distinguishing patterns of heavy drinkers versus light drinkers.
2024.clpsych-1.10
matero-etal-2024-using
+
Prevalent Frequency of Emotional and Physical Symptoms in Social Anxiety using Zero Shot Classification: An Observational Study
@@ -148,6 +156,7 @@
Social anxiety represents a prevalent challenge in modern society, affecting individuals across personal and professional spheres. Left unaddressed, this condition can yield substantial negative consequences, impacting social interactions and performance. Further understanding its diverse physical and emotional symptoms becomes pivotal for comprehensive diagnosis and tailored therapeutic interventions. This study analyzes the prevalence and frequency of social anxiety symptoms taken from the Mayo Clinic, exploring diverse human experiences by utilizing a large Reddit dataset dedicated to this issue. Leveraging these platforms, the research aims to extract insights and examine a spectrum of physical and emotional symptoms linked to social anxiety disorder. Upholding ethical considerations, the study maintains strict user anonymity within the dataset. By employing a novel approach, the research utilizes BART-based multi-label zero-shot classification to identify and measure symptom prevalence and significance in the form of a probability score for each symptom under consideration. Results uncover distinctive patterns: “Trembling” emerges as a prevalent physical symptom, while emotional symptoms like “Fear of being judged negatively” exhibit high frequencies. These findings offer insights into the multifaceted nature of social anxiety, aiding clinical practices and interventions tailored to its diverse expressions.
2024.clpsych-1.11
rizwan-demsar-2024-prevalent
+
Comparing panic and anxiety on a dataset collected from social media
@@ -158,6 +167,7 @@
The recognition of mental health’s crucial significance has led to a growing interest in utilizing social media text data in current research trends. However, there remains a significant gap in the study of panic and anxiety on these platforms, despite their high prevalence and severe impact. In this paper, we address this gap by presenting a dataset consisting of 1,930 user posts from Quora and Reddit specifically focusing on panic and anxiety. Through a combination of lexical analysis, emotion detection, and writer attitude assessment, we explore the unique characteristics of each condition. To gain deeper insights, we employ a mental health-specific transformer model and a large language model for qualitative analysis. Our findings not only contribute to the understanding of digital discourse on anxiety and panic but also provide valuable resources for the broader research community. We make our dataset, methodologies, and code available to advance understanding and facilitate future studies.
2024.clpsych-1.12
mitrovic-etal-2024-comparing
+
Your Model Is Not Predicting Depression Well And That Is Why: A Case Study of PRIMATE Dataset
@@ -168,6 +178,7 @@
This paper addresses the quality of annotations in mental health datasets used for NLP-based depression level estimation from social media texts. While previous research relies on social media-based datasets annotated with binary categories, i.e. depressed or non-depressed, recent datasets such as D2S and PRIMATE aim for nuanced annotations using PHQ-9 symptoms. However, most of these datasets rely on crowd workers without the domain knowledge for annotation. Focusing on the PRIMATE dataset, our study reveals concerns regarding annotation validity, particularly for the lack of interest or pleasure symptom. Through reannotation by a mental health professional, we introduce finer labels and textual spans as evidence, identifying a notable number of false positives. Our refined annotations, to be released under a Data Use Agreement, offer a higher-quality test set for anhedonia detection. This study underscores the necessity of addressing annotation quality issues in mental health datasets, advocating for improved methodologies to enhance NLP model reliability in mental health assessments.
2024.clpsych-1.13
milintsevich-etal-2024-model
+
Detecting a Proxy for Potential Comorbid ADHD in People Reporting Anxiety Symptoms from Social Media Data
@@ -178,6 +189,7 @@
We present a novel task that can elucidate the connection between anxiety and ADHD; use Transformers to make progress toward solving a task that is not solvable by keyword-based classifiers; and discuss a method for visualization of our classifier illuminating the connection between anxiety and ADHD presentations. Up to approximately 50% of adults with ADHD may also have an anxiety disorder and approximately 30% of adults with anxiety may also have ADHD. Patients presenting with anxiety may be treated for anxiety without ADHD ever being considered, possibly affecting treatment. We show how data that bears on ADHD that is comorbid with anxiety can be obtained from social media data, and show that Transformers can be used to detect a proxy for possible comorbid ADHD in people with anxiety symptoms. We collected data from anxiety and ADHD online forums (subreddits). We identified posters who first started posting in the Anxiety subreddit and later started posting in the ADHD subreddit as well. We use this subset of the posters as a proxy for people who presented with anxiety symptoms and then became aware that they might have ADHD. We fine-tune a Transformer architecture-based classifier to classify people who started posting in the Anxiety subreddit and then started posting in the ADHD subreddit vs. people who posted in the Anxiety subreddit without later posting in the ADHD subreddit. We show that a Transformer architecture is capable of achieving reasonable results (76% correct for RoBERTa vs. under 60% correct for the best keyword-based model, both with 50% base rate).
2024.clpsych-1.14
lee-etal-2024-detecting
+
Overview of the CLPsych 2024 Shared Task: Leveraging Large Language Models to Identify Evidence of Suicidality Risk in Online Posts
@@ -204,6 +216,7 @@
This paper presents our approach to the CLPsych 2024 shared task: utilizing large language models (LLMs) for finding supporting evidence about an individual’s suicide risk level in Reddit posts. Our framework is constructed around an LLM with knowledge self-generation and output refinement. The knowledge self-generation process produces task-related knowledge which is generated by the LLM and leads to accurate risk predictions. The output refinement process, later, with the selected best set of LLM-generated knowledge, refines the outputs by prompting the LLM repeatedly with different knowledge instances interchangeably. We achieved highly competitive results comparing to the top-performance participants with our official recall of 93.5%, recall–precision harmonic-mean of 92.3%, and mean consistency of 96.1%.
2024.clpsych-1.16
tran-matsui-2024-team
+
Exploring Instructive Prompts for Large Language Models in the Extraction of Evidence for Supporting Assigned Suicidal Risk Levels
@@ -217,6 +230,7 @@
Monitoring and predicting the expression of suicidal risk in individuals’ social media posts is a central focus in clinical NLP. Yet, existing approaches frequently lack a crucial explainability component necessary for extracting evidence related to an individual’s mental health state. We describe the CSIRO Data61 team’s evidence extraction system submitted to the CLPsych 2024 shared task. The task aims to investigate the zero-shot capabilities of an open-source LLM in extracting evidence regarding an individual’s assigned suicide risk level from social media discourse. The results are assessed against ground truth evidence annotated by psychological experts, with an achieved recall-oriented BERTScore of 0.919. Our findings suggest that LLMs showcase strong feasibility in the extraction of information supporting the evaluation of suicidal risk in social media discourse. Opportunities for refinement exist, notably in crafting concise and effective instructions to guide the extraction process.
2024.clpsych-1.17
chen-etal-2024-exploring
+
Psychological Assessments with Large Language Models: A Privacy-Focused and Cost-Effective Approach
@@ -225,6 +239,7 @@
This study explores the use of Large Language Models (LLMs) to analyze text comments from Reddit users, aiming to achieve two primary objectives: firstly, to pinpoint critical excerpts that support a predefined psychological assessment of suicidal risk; and secondly, to summarize the material to substantiate the preassigned suicidal risk level. The work is circumscribed to the use of “open-source” LLMs that can be run locally, thereby enhancing data privacy. Furthermore, it prioritizes models with low computational requirements, making it accessible to both individuals and institutions operating on limited computing budgets. The implemented strategy only relies on a carefully crafted prompt and a grammar to guide the LLM’s text completion. Despite its simplicity, the evaluation metrics show outstanding results, making it a valuable privacy-focused and cost-effective approach. This work is part of the Computational Linguistics and Clinical Psychology (CLPsych) 2024 shared task.
2024.clpsych-1.18
blanco-cuaresma-2024-psychological
+
Incorporating Word Count Information into Depression Risk Summary Generation: INF@UoS CLPsych 2024 Submission
@@ -234,6 +249,7 @@
Large language model classifiers do not directly offer transparency: it is not clear why one class is chosen over another. In this work, summaries explaining the suicide risk level assigned using a fine-tuned mental-roberta-base model are generated from key phrases extracted using SHAP explainability using Mistral-7B. The training data for the classifier consists of all Reddit posts of a user in the University of Maryland Reddit Suicidality Dataset, Version 2, with their suicide risk labels along with selected features extracted from each post by the Linguistic Inquiry and Word Count (LIWC-22) tool. The resulting model is used to make predictions regarding risk on each post of the users in the evaluation set of the CLPsych 2024 shared task, with a SHAP explainer used to identify the phrases contributing to the top scoring, correct and severe risk categories. Some basic stoplisting is applied to the extracted phrases, along with length based filtering, and a locally run version of Mistral-7B-Instruct-v0.1 is used to create summaries from the highest value (based on SHAP) phrases.
2024.clpsych-1.19
preiss-chen-2024-incorporating
+
Extracting and Summarizing Evidence of Suicidal Ideation in Social Media Contents Using Large Language Models
@@ -245,6 +261,7 @@
This paper explores the use of Large Language Models (LLMs) in analyzing social media content for mental health monitoring, specifically focusing on detecting and summarizing evidence of suicidal ideation. We utilized LLMs Mixtral7bx8 and Tulu-2-DPO-70B, applying diverse prompting strategies for effective content extraction and summarization. Our methodology included detailed analysis through Few-shot and Zero-shot learning, evaluating the ability of Chain-of-Thought and Direct prompting strategies. The study achieved notable success in the CLPsych 2024 shared task (ranked top for the evidence extraction task and second for the summarization task), demonstrating the potential of LLMs in mental health interventions and setting a precedent for future research in digital mental health monitoring.
2024.clpsych-1.20
gyanendro-singh-etal-2024-extracting
+
Detecting Suicide Risk Patterns using Hierarchical Attention Networks with Large Language Models
@@ -255,6 +272,7 @@
Suicide has become a major public health and social concern in the world. This paper looks into a method that uses LLMs (Large Language Models) to extract the likely reason for a person to attempt suicide, through analysis of their social media text posts detailing the event; using this data we can extract the reason for the cause, such as the mental state, which can provide support for suicide prevention. This submission presents our approach for the CLPsych Shared Task 2024. Our model uses Hierarchical Attention Networks (HAN) and Llama2 for finding supporting evidence about an individual’s suicide risk level.
2024.clpsych-1.21
l-etal-2024-detecting
+
Using Large Language Models (LLMs) to Extract Evidence from Pre-Annotated Social Media Data
@@ -265,6 +283,7 @@
For numerous years, researchers have employed social media data to gain insights into users’ mental health. Nevertheless, the majority of investigations concentrate on categorizing users into those experiencing depression and those considered healthy, or on detection of suicidal thoughts. In this paper, we aim to extract evidence of a pre-assigned gold label. We used a suicidality dataset containing Reddit posts labeled with the suicide risk level. The task is to use Large Language Models (LLMs) to extract evidence from the post that justifies the given label. We used Meta Llama 7b and lexicons for solving the task and we achieved a precision of 0.96.
2024.clpsych-1.22
alhamed-etal-2024-using
+
XinHai@CLPsych 2024 Shared Task: Prompting Healthcare-oriented LLMs for Evidence Highlighting in Posts with Suicide Risk
@@ -276,6 +295,7 @@
In this article, we introduce a new method for analyzing and summarizing posts from r/SuicideWatch on Reddit, overcoming the limitations of current techniques in processing complex mental health discussions online. Existing methods often struggle to accurately identify and contextualize subtle expressions of mental health problems, leading to inadequate support and intervention strategies. Our approach combines the open-source Large Language Model (LLM), fine-tuned with health-oriented knowledge, to effectively process Reddit posts. We also design prompts that focus on suicide-related statements, extracting key statements, and generating concise summaries that capture the core aspects of the discussions. The preliminary results indicate that our method improves the understanding of online suicide-related posts compared to existing methodologies.
2024.clpsych-1.23
zhu-etal-2024-xinhai
+
A Dual-Prompting for Interpretable Mental Health Language Models
@@ -289,6 +309,7 @@
Despite the increasing demand for AI-based mental health monitoring tools, their practical utility for clinicians is limited by the lack of interpretability. The CLPsych 2024 Shared Task (Chim et al., 2024) aims to enhance the interpretability of Large Language Models (LLMs), particularly in mental health analysis, by providing evidence of suicidality through linguistic content. We propose a dual-prompting approach: (i) Knowledge-aware evidence extraction by leveraging the expert identity and a suicide dictionary with a mental health-specific LLM; and (ii) Evidence summarization by employing an LLM-based consistency evaluator. Comprehensive experiments demonstrate the effectiveness of combining domain-specific information, revealing performance improvements and the approach’s potential to aid clinicians in assessing mental state progression.
2024.clpsych-1.24
jeon-etal-2024-dual
+
Cheap Ways of Extracting Clinical Markers from Texts
@@ -299,6 +320,7 @@
This paper describes the Unibuc Archaeology team work for CLPsych’s 2024 Shared Task that involved finding evidence within the text supporting the assigned suicide risk level. Two types of evidence were required: highlights (extracting relevant spans within the text) and summaries (aggregating evidence into a synthesis). Our work focuses on evaluating Large Language Models (LLM) as opposed to an alternative method that is much more memory and resource efficient. The first approach employs an LLM that is used for generating the summaries and is guided to provide sequences of text indicating suicidal tendencies through a processing chain for highlights. The second approach involves implementing a good old-fashioned machine learning tf-idf with a logistic regression classifier, whose representative features we use to extract relevant highlights.
2024.clpsych-1.25
sandu-etal-2024-cheap
+
Utilizing Large Language Models to Identify Evidence of Suicidality Risk through Analysis of Emotionally Charged Posts
@@ -309,6 +331,7 @@
This paper presents our contribution to the CLPsych 2024 shared task, focusing on the use of open-source large language models (LLMs) for suicide risk assessment through the analysis of social media posts. We achieved first place (out of 15 participating teams) in the task of providing summarized evidence of a user’s suicide risk. Our approach is based on Retrieval Augmented Generation (RAG), where we retrieve the top-k (k=5) posts with the highest emotional charge and provide the level of three different negative emotions (sadness, fear, anger) for each post during the generation phase.
2024.clpsych-1.26
uluslu-etal-2024-utilizing
+
Integrating Supervised Extractive and Generative Language Models for Suicide Risk Evidence Summarization
@@ -318,6 +341,7 @@
We propose a method that integrates supervised extractive and generative language models for providing supporting evidence of suicide risk in the CLPsych 2024 shared task. Our approach comprises three steps. Initially, we construct a BERT-based model for estimating sentence-level suicide risk and negative sentiment. Next, we precisely identify high suicide risk sentences by emphasizing elevated probabilities of both suicide risk and negative sentiment. Finally, we integrate generative summaries using the MentaLLaMa framework and extractive summaries from identified high suicide risk sentences and a specialized dictionary of suicidal risk words. SophiaADS, our team, achieved 1st place for highlight extraction and ranked 10th for summary generation, both based on recall and consistency metrics, respectively.
2024.clpsych-1.27
tanaka-fukazawa-2024-integrating
+
Archetypes and Entropy: Theory-Driven Extraction of Evidence for Suicide Risk
@@ -341,6 +365,7 @@
varadarajan-etal-2024-archetypes
The sponsors were added in the Acknowledgement section since they were missed in the initial submission.
+
diff --git a/data/xml/2024.codi.xml b/data/xml/2024.codi.xml
index 82e0310b12..bafe63fbb4 100644
--- a/data/xml/2024.codi.xml
+++ b/data/xml/2024.codi.xml
@@ -29,6 +29,7 @@
Although diagrams are fundamental to Rhetorical Structure Theory, their interpretation has received little in-depth exploration. This paper presents an algorithmic approach to accessing the meaning of these diagrams. Three algorithms are presented. The first of these, called reenactment, recreates the abstract process whereby structures are created, following the dynamic of coherence development, starting from simple relational propositions, and combining these to form complex expressions which are in turn integrated to define the comprehensive discourse organization. The second algorithm, called composition, implements Marcu’s strong nuclearity assumption. It uses a simple inference mechanism to demonstrate the reducibility of complex structures to simple relational propositions. The third algorithm, called compress, picks up where Marcu’s assumption leaves off, providing a generalized fully scalable procedure for progressive reduction of relational propositions to their simplest accessible forms. These inferred reductions may then be recycled to produce RST diagrams of abridged texts. The algorithms described here are useful in positioning computational descriptions of rhetorical structures as discursive processes, allowing researchers to go beyond static diagrams and look into their formative and interpretative significance.
2024.codi-1.1
potter-2024-algorithmic
+
SciPara: A New Dataset for Investigating Paragraph Discourse Structure in Scientific Papers
@@ -41,6 +42,7 @@
Good scientific writing makes use of specific sentence and paragraph structures, providing a rich platform for discourse analysis and developing tools to enhance text readability. In this vein, we introduce SciPara, a novel dataset consisting of 981 scientific paragraphs annotated by experts in terms of sentence discourse types and topic information. On this dataset, we explored two tasks: 1) discourse category classification, which is to predict the discourse category of a sentence by using its paragraph and surrounding paragraphs as context, and 2) discourse sentence generation, which is to generate a sentence of a certain discourse category by using various contexts as input. We found that Pre-trained Language Models (PLMs) can accurately identify Topic Sentences in SciPara, but have difficulty distinguishing Concluding, Transition, and Supporting Sentences. The quality of the sentences generated by all investigated PLMs improved with amount of context, regardless of discourse category. However, not all contexts were equally influential. Contrary to common assumptions about well-crafted scientific paragraphs, our analysis revealed that paradoxically, paragraphs with complete discourse structures were less readable.
2024.codi-1.2
kiepura-etal-2024-scipara
+
Using Discourse Connectives to Test Genre Bias in Masked Language Models
@@ -53,6 +55,7 @@
This paper presents evidence for an effect of genre on the use of discourse connectives in argumentation. Drawing from discourse processing research on reasoning based structures, we use fill-mask computation to measure genre-induced expectations of argument realisation, and beta regression to model the probabilities of these realisations against a set of predictors. Contrasting fill-mask probabilities for the presence or absence of a discourse connective in baseline and finetuned language models reveals that genre introduces biases for the realisation of argument structure. These outcomes suggest that cross-domain discourse processing, but also argument mining, should take into account generalisations about specific features, such as connectives, and their probability related to the genre context.
2024.codi-1.3
dorgeloh-etal-2024-using
+
Projecting Annotations for Discourse Relations: Connective Identification for Low-Resource Languages
@@ -63,6 +66,7 @@
2024.codi-1.4
2024.codi-1.4.SupplementaryMaterial.zip
bourgonje-lin-2024-projecting
+
Experimenting with Discourse Segmentation of Taiwan Southern Min Spontaneous Speech
@@ -73,6 +77,7 @@
2024.codi-1.5
2024.codi-1.5.SupplementaryMaterial.tex
prevot-wang-2024-experimenting
+
Actor Identification in Discourse: A Challenge for LLMs?
@@ -84,6 +89,7 @@
2024.codi-1.6
2024.codi-1.6.SupplementaryMaterial.gz
baric-etal-2024-actor
+
Quantitative metrics to the CARS model in academic discourse in biology introductions
@@ -93,6 +99,7 @@
Writing research articles is crucial in any academic’s development and is thus an important component of the academic discourse. The Introduction section is often seen as a difficult task within the research article genre. This study presents two metrics of rhetorical moves in academic writing: step-n-grams and lengths of steps. While scholars agree that expert writers follow the general pattern described in the CARS model (Swales, 1990), this study complements previous studies with empirical quantitative data that highlight how writers progress from one rhetorical function to another in practice, based on 50 recent papers by expert writers. The discussion shows the significance of the results in relation to writing instructors and data-driven learning.
2024.codi-1.7
lam-nnamoko-2024-quantitative
+
Probing of pretrained multilingual models on the knowledge of discourse
@@ -103,6 +110,7 @@
2024.codi-1.8
2024.codi-1.8.SupplementaryMaterial.zip
godunova-voloshina-2024-probing
+
Feature-augmented model for multilingual discourse relation classification
@@ -114,6 +122,7 @@
2024.codi-1.9
2024.codi-1.9.SupplementaryMaterial.zip
metheniti-etal-2024-feature
+
Complex question generation using discourse-based data augmentation
@@ -125,6 +134,7 @@
2024.codi-1.10
2024.codi-1.10.SupplementaryMaterial.zip
jahangir-etal-2024-complex
+
Exploring Soft-Label Training for Implicit Discourse Relation Recognition
@@ -134,6 +144,7 @@
This paper proposes a classification model for single label implicit discourse relation recognition trained on soft-label distributions. It follows the PDTB 3.0 framework and it was trained and tested on the DiscoGeM corpus, where it achieves an F1-score of 51.38 on third-level sense classification of implicit discourse relations. We argue that training on soft-label distributions allows the model to better discern between more ambiguous discourse relations.
2024.codi-1.11
costa-kosseim-2024-exploring
+
The ARRAU 3.0 Corpus
@@ -147,6 +158,7 @@
2024.codi-1.12
2024.codi-1.12.SupplementaryMaterial.zip
poesio-etal-2024-arrau
+
Signals as Features: Predicting Error/Success in Rhetorical Structure Parsing
@@ -157,6 +169,7 @@
2024.codi-1.13
2024.codi-1.13.SupplementaryMaterial.zip
pastor-oostdijk-2024-signals
+
GroundHog: Dialogue Generation using Multi-Grained Linguistic Input
@@ -168,6 +181,7 @@
2024.codi-1.14
2024.codi-1.14.SupplementaryMaterial.zip
chernyavskiy-etal-2024-groundhog
+
Discourse Relation Prediction and Discourse Parsing in Dialogues with Minimal Supervision
@@ -179,6 +193,7 @@
Discourse analysis plays a crucial role in Natural Language Processing, with discourse relation prediction arguably being the most difficult task in discourse parsing. Previous studies have generally focused on explicit or implicit discourse relation classification in monologues, leaving dialogue an under-explored domain. Facing the data scarcity issue, we propose to leverage self-training strategies based on a Transformer backbone. Moreover, we design the first semi-supervised pipeline that sequentially predicts discourse structures and relations. Using 50 examples, our relation prediction module achieves 58.4 in accuracy on the STAC corpus, close to supervised state-of-the-art. Full parsing results show notable improvements compared to the supervised models both in-domain (gaming) and cross-domain (technical chat), with better stability.
2024.codi-1.15
li-etal-2024-discourse
+
With a Little Help from my (Linguistic) Friends: Topic segmentation of multi-party casual conversations
@@ -189,6 +204,7 @@
2024.codi-1.16
2024.codi-1.16.SupplementaryMaterial.zip
decker-amblard-2024-little
+
diff --git a/data/xml/2024.cogalex.xml b/data/xml/2024.cogalex.xml
new file mode 100644
index 0000000000..e312bd32a2
--- /dev/null
+++ b/data/xml/2024.cogalex.xml
@@ -0,0 +1,214 @@
+
+
+
+
+ Proceedings of the Workshop on Cognitive Aspects of the Lexicon @ LREC-COLING 2024
+ MichaelZock
+ EmmanueleChersoni
+ Yu-YinHsu
+ Simonde Deyne
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.cogalex-1
+ cogalex
+
+
+ 2024.cogalex-1.0
+ cogalex-2024-cognitive
+
+
+ CLAVELL - Cognitive Linguistic Annotation and Visualization Environment for Language Learning
+ WernerWiniwarter
+ 1–13
+ In this paper we introduce a novel sentence annotation based on radical construction grammar and Uniform Meaning Representation, which covers all levels of linguistic analysis, from interlinear morphemic glossing to PropBank rolesets, WordNet synsets, and Wikipedia page titles as concept identifiers. We visually enhance our annotation by using images to represent concepts, emojis for thematic roles, and color-coding for constructions. The meaning representation is embedded into the syntactic parse by aligning all concepts with the surface tokens in the sentence. The main motivation for developing this type of representation was its use in second language acquisition as part of a Web-based language learning environment. In entertaining and engaging annotation tasks language students assemble the representation step-by-step following a bottom-up strategy. Based on language exposure while performing these exercises, we populate personal idiolectal constructicons representing the students’ current status of second language comprehension. As first use case, we have implemented a solution for Japanese due to its soaring popularity in our language education program and the particular challenges involved with trying to master this language.
+ 2024.cogalex-1.1
+ winiwarter-2024-clavell
+
+
+ Individual Text Corpora Predict Openness, Interests, Knowledge and Level of Education
+ Markus J.Hofmann
+ Markus T.Jansen
+ ChristophWigbels
+ BennyBriesemeister
+ Arthur M.Jacobs
+ 14–25
+ Here we examine whether the personality dimension of openness to experience can be predicted from the individual google search history. By web scraping, individual text corpora (ICs) were generated from 214 participants with a mean number of 5 million word tokens. We trained word2vec models and used the similarities of each IC to label words, which were derived from a lexical approach of personality. These IC-label-word similarities were utilized as predictive features in neural models. For training and validation, we relied on 179 participants and held out a test sample of 35 participants. A grid search with varying number of predictive features, hidden units and boost factor was performed. As model selection criterion, we used R2 in the validation samples penalized by the absolute R2 difference between training and validation. The selected neural model explained 35% of the openness variance in the test sample, while an ensemble model with the same architecture often provided slightly more stable predictions for intellectual interests, knowledge in humanities and level of education. Finally, a learning curve analysis suggested that around 500 training participants are required for generalizable predictions. We discuss ICs as a complement or replacement of survey-based psychodiagnostics.
+ 2024.cogalex-1.2
+ hofmann-etal-2024-individual
+
+
+ An Empirical Study on Vague Deictic Temporal Adverbials
+ SvenjaKenneweg
+ Brendan BalcerakJackson
+ JoergDeigmoeller
+ JulianEggert
+ PhilippCimiano
+ 26–31
+ Temporal adverbial phrases such as recently and some time ago have a special function in communication and temporal cognition. These adverbials are deictic, in that their meaning is tied to their time of utterance; and they are vague, in that the time periods to which they apply are under-specified in comparison to expressions such as yesterday, which precisely indicates the day before the day of utterance. Despite their vagueness, conversational participants have a mental image of when events described using these adverbials take place. We present a study that aims to quantify this mental model in terms of fuzzy or graded membership. To achieve this, we investigated the four English temporal adverbials recently, just, some time ago and long time ago as applied to types of events with different durations and frequencies, by conducting surveys to measure how speakers judge the different adverbials to apply in different time ranges. Our results suggest that it is possible to represent the meanings of deictic vague temporal adverbials geometrically in terms of graded membership within a temporal conceptual space.
+ 2024.cogalex-1.3
+ kenneweg-etal-2024-empirical
+
+
+ Symbolic Learning of Rules for Semantic Relation Types Identification in French Genitive Postnominal Prepositional Phrases
+ HaniGuenoune
+ MathieuLafourcade
+ 32–41
+ We are interested in the semantic relations conveyed by polylexical entities in the postnominal prepositional noun phrases form “A de B” (A of B). After identifying a relevant set of semantic relations types, we proceed, using generative AI, to build a collection of phrases, for each semantic relation type identified. We propose an algorithm for creating rules that allow the selection of the relation between A and B in noun phrases of each type. These rules correspond to selecting from a knowledge base the appropriate neighborhood of a given term. For the phrase “désert d’Algérie” carrying the location relation, the term “désert” is identified as a geographical location, and “Algérie” as a country. These constraints are used to automatically learn a set of rules for selecting the location relation for this type of example. Rules are not exclusive as there may be instances that fall under multiple relations. In the phrase “portrait de sa mère - the portrait of his/her mother”, all of depiction, possession, and producer types are a possible match.
+ 2024.cogalex-1.4
+ guenoune-lafourcade-2024-symbolic
+
+
+ How Human-Like Are Word Associations in Generative Models? An Experiment in Slovene
+ ŠpelaVintar
+ MojcaBrglez
+ AlešŽagar
+ 42–48
+ Large language models (LLMs) show extraordinary performance in a broad range of cognitive tasks, yet their capability to reproduce human semantic similarity judgements remains disputed. We report an experiment in which we fine-tune two LLMs for Slovene, a monolingual SloT5 and a multilingual mT5, as well as an mT5 for English, to generate word associations. The models are fine-tuned on human word association norms created within the Small World of Words project, which recently started to collect data for Slovene. Since our aim was to explore differences between human and model-generated outputs, the model parameters were minimally adjusted to fit the association task. We perform automatic evaluation using a set of methods to measure the overlap and ranking, and in addition a subset of human and model-generated responses were manually classified into four categories (meaning-, position- and form-based, and erratic). Results show that human-machine overlap is very small, but that the models produce a similar distribution of association categories as humans.
+ 2024.cogalex-1.5
+ vintar-etal-2024-human
+
+
+ Idiom Complexity in Apple-Pie Order: The Disentanglement of Decomposability and Transparency
+ IrenePagliai
+ 49–55
+ Both decomposability and transparency investigate the interplay between literality and figurativity in idioms. For this reason, they have often been merged. This study argues that idiom decomposability and transparency are related but conceptually different constructs, thus advocating for their distinction. Leveraging a normed lexicon of Italian and English idioms, the respective effects of decomposability and transparency on idiom meaning recognition are explored via statistical modeling. Results show the two variables contribute differently to idiom meaning recognition in the two languages, while the absence of collinearity underscores their distinct contributions. Based on this empirical evidence, the study finally proposes FrameNet and MetaNet as computational tools for modeling idiom decomposability and transparency. This study thus not only substantiates the separation of idiom decomposability and transparency, but also sets a foundation for future interdisciplinary research to bridge the gap in idiom research between empirical psycholinguistics, cognitive linguistics and computational applications.
+ 2024.cogalex-1.6
+ pagliai-2024-idiom
+
+
+ What GPT-4 Knows about Aspectual Coercion: Focused on “Begin the Book”
+ SeohyunIm
+ ChungminLee
+ 56–67
+ This paper explores whether Pre-trained Large Language Models (PLLMs) like GPT-4 can grasp profound linguistic insights into language phenomena such as Aspectual Coercion through interaction with Microsoft’s Copilot, which integrates GPT-4. Firstly, we examined Copilot’s understanding of the co-occurrence constraints of the aspectual verb “begin” and the complex-type noun “book” using the classic illustration of Aspectual Coercion, “begin the book.” Secondly, we verified Copilot’s awareness of both the default interpretation of “begin the book” with no specific context and the contextually preferred interpretation. Ultimately, Copilot provided appropriate responses regarding potential interpretations of “begin the book” based on its distributional properties and context-dependent preferred interpretations. However, it did not furnish sophisticated explanations concerning these interpretations from a linguistic theoretical perspective. On the other hand, by offering diverse interpretations grounded in distributional properties, language models like GPT-4 demonstrated their potential contribution to the refinement of linguistic theories. Furthermore, we suggested the feasibility of employing Language Models to construct language resources associated with language phenomena including Aspectual Coercion.
+ 2024.cogalex-1.7
+ im-lee-2024-gpt
+
+
+ Can GPT-4 Recover Latent Semantic Relational Information from Word Associations? A Detailed Analysis of Agreement with Human-annotated Semantic Ontologies.
+ SimonDe Deyne
+ ChunhuaLiu
+ LeaFrermann
+ 68–78
+ Word associations, i.e., spontaneous responses to a cue word, provide not only a window into the human mental lexicon but have also been shown to be a repository of common-sense knowledge and can underpin efforts in lexicography and the construction of dictionaries. Especially the latter tasks require knowledge about the relations underlying the associations (e.g., Taxonomic vs. Situational); however, to date, there is neither an established ontology of relations nor an effective labelling paradigm. Here, we test GPT-4’s ability to infer semantic relations for human-produced word associations. We use four human-labelled data sets of word associations and semantic features, with differing relation inventories and various levels of annotator agreement. We directly prompt GPT-4 with detailed relation definitions without further fine-tuning or training. Our results show that while GPT-4 provided a good account of higher-level classifications (e.g. Taxonomic vs Situational), prompting instructions alone cannot obtain similar performance for detailed classifications (e.g. superordinate, subordinate or coordinate relations) despite high agreement among human annotators. This suggests that latent relations can at least be partially recovered from word associations and highlights ways in which LLMs could be improved and human annotation protocols could be adapted to reduce coding ambiguity.
+ 2024.cogalex-1.8
+ de-deyne-etal-2024-gpt
+
+
+ What’s in a Name? Electrophysiological Differences in Processing Proper Nouns in Mandarin Chinese
+ Bernard A. J.Jap
+ Yu-YinHsu
+ LaviniaSalicchi
+ Yu XiLi
+ 79–85
+ The current study examines how proper names and common nouns in Chinese are cognitively processed during sentence comprehension. EEG data was recorded when participants were presented with neutral contexts followed by either a proper name or a common noun. Proper names in Chinese often consist of characters that can function independently as words or be combined with other characters to form words, potentially benefiting from the semantic features carried by each character. Using cluster-based permutation tests, we found a larger N400 for common nouns when compared to proper names. Our results suggest that the semantics of characters do play a role in facilitating the processing of proper names. This is consistent with previous behavioral findings on noun processing in Chinese, indicating that common nouns require more cognitive resources to process than proper names. Moreover, our results suggest that proper names are processed differently in Chinese than in alphabetic languages.
+ 2024.cogalex-1.9
+ jap-etal-2024-whats
+
+
+ Cross-Linguistic Processing of Non-Compositional Expressions in Slavic Languages
+ IuliiaZaitova
+ IrinaStenger
+ Muhammad UmerButt
+ TaniaAvgustinova
+ 86–97
+ This study focuses on evaluating and predicting the intelligibility of non-compositional expressions within the context of five closely related Slavic languages: Belarusian, Bulgarian, Czech, Polish, and Ukrainian, as perceived by native speakers of Russian. Our investigation employs a web-based experiment where native Russian respondents take part in free-response and multiple-choice translation tasks. Based on the previous studies in mutual intelligibility and non-compositionality, we propose two predictive factors for reading comprehension of unknown but closely related languages: 1) linguistic distances, which include orthographic and phonological distances; 2) surprisal scores obtained from monolingual Language Models (LMs). Our primary objective is to explore the relationship of these two factors with the intelligibility scores and response times of our web-based experiment. Our findings reveal that, while intelligibility scores from the experimental tasks exhibit a stronger correlation with phonological distances, LM surprisal scores appear to be better predictors of the time participants invest in completing the translation tasks.
+ 2024.cogalex-1.10
+ zaitova-etal-2024-cross
+
+
+ Using Language Models to Unravel Semantic Development in Children’s Use of Perception Verbs
+ Bramvan Dijk
+ Max J.van Duijn
+ LiKloostra
+ MarcoSpruit
+ BarendBeekhuizen
+ 98–106
+ In this short paper we employ a Language Model (LM) to gain insight into how complex semantics of a Perception Verb (PV) emerge in children. Using a Dutch LM as representation of mature language use, we find that for all ages 1) the LM accurately predicts PV use in children’s freely-told narratives; 2) children’s PV use is close to mature use; 3) complex PV meanings with attentional and cognitive aspects can be found. Our approach illustrates how LMs can be meaningfully employed in studying language development, hence takes a constructive position in the debate on the relevance of LMs in this context.
+ 2024.cogalex-1.11
+ van-dijk-etal-2024-using
+
+
+ Representing Abstract Concepts with Images: An Investigation with Large Language Models
+ LudovicaCerini
+ AlessandroBondielli
+ AlessandroLenci
+ 107–113
+ Multimodal metaphorical interpretation of abstract concepts has always been a debated problem in many research fields, including cognitive linguistics and NLP. With the dramatic improvements of Large Language Models (LLMs) and the increasing attention toward multimodal Vision-Language Models (VLMs), there has been pronounced attention on the conceptualization of abstracts. Nevertheless, a systematic scientific investigation is still lacking. This work introduces a framework designed to shed light on the indirect grounding mechanisms that anchor the meaning of abstract concepts to concrete situations (e.g. ability - a person skating), following the idea that abstracts acquire meaning from embodied and situated simulation. We assessed human and LLMs performances by a situation generation task. Moreover, we assess the figurative richness of images depicting concrete scenarios, via a text-to-image retrieval task performed on LAION-400M.
+ 2024.cogalex-1.12
+ cerini-etal-2024-representing
+
+
+ Big-Five Backstage: A Dramatic Dataset for Characters Personality Traits & Gender Analysis
+ Vadim A.Porvatov
+ CarloStrapparava
+ MarinaTiuleneva
+ 114–119
+ This paper introduces a novel textual dataset comprising fictional characters’ lines with annotations based on their gender and Big-Five personality traits. Using psycholinguistic findings, we compared texts attributed to fictional characters and real people with respect to their genders and personality traits. Our results indicate that imagined personae mirror most of the language categories observed in real people while demonstrating them in a more expressive manner.
+ 2024.cogalex-1.13
+ porvatov-etal-2024-big
+
+
+ Interaction of Semantics and Morphology in Russian Word Vectors
+ YuliaZinova
+ Rubenvan de Vijver
+ AnastasiaYablokova
+ 120–128
+ In this paper we explore how morphological information can be extracted from fastText embeddings for Russian nouns. We investigate the negative effects of syncretism and propose ways of modifying the vectors that can help to find better representations for morphological functions and thus for out-of-vocabulary words. In particular, we look at the effect of analysing shift vectors instead of original vectors, discuss various possibilities of finding base forms to create shift vectors, and show that using only the high frequency data is beneficial when looking for structure with respect to the morphosyntactic functions in the embeddings.
+ 2024.cogalex-1.14
+ zinova-etal-2024-interaction
+
+
+ Listen, Repeat, Decide: Investigating Pronunciation Variation in Spoken Word Recognition among Russian Speakers
+ Vladislav IvanovichZubov
+ ElenaRiekhakaynen
+ 129–132
+ Variability is one of the important features of natural speech and a challenge for spoken word recognition models and automatic speech recognition systems. We conducted two preliminary experiments aimed at finding out whether native Russian speakers treat certain types of pronunciation variation differently when the variants are equally possible according to orthoepic norms. In the first experiment, the participants had to repeat the words with three different types of pronunciation variability. In the second experiment, we focused on the assessment of words with variable and only one standard stress. Our results support the hypothesis that listeners pay the most attention to words with variable stress, less to the variability of soft and hard consonants, and even less to the presence / absence of /j/. Assessing the correct pronunciation of words with variable stress takes significantly more time than assessing words which have only one correct pronunciation variant. These preliminary results show that pronunciation variants can provide new evidence on how a listener accesses the mental lexicon during natural speech processing and chooses among the variants stored in it.
+ 2024.cogalex-1.15
+ zubov-riekhakaynen-2024-listen
+
+
+ The Mental Lexicon of Communicative Fragments and Contours: The Remix N-gram Method
+ EmeseK. Molnár
+ AndreaDömötör
+ 133–139
+ The classical mental lexicon models represented the lexicon as a list of words. Usage-based models describe the mental lexicon more dynamically, but they do not capture the real-time operation of speech production. In the linguistic model of Boris Gasparov, the notions of communicative fragment and contour can provide a comprehensive description of the diversity of linguistic experience. Fragments and contours form larger linguistic structures than words and they are recognized as a whole unit by speakers through their communicative profile. Fragments are prefabricated units that can be added to or merged with each other during speech production. The contours serve as templates for the utterances by combining specific and abstract linguistic elements. Based on this theoretical framework, our tool applies remix n-grams (combination of word forms, lemmas and POS-tags) to identify similar linguistic structures in different texts that form the basic units of the mental lexicon.
+ 2024.cogalex-1.16
+ k-molnar-domotor-2024-mental
+
+
+ Three Studies on Predicting Word Concreteness with Embedding Vectors
+ Michael Flor
+ 140–150
+ Human-assigned concreteness ratings for words are commonly used in psycholinguistic and computational linguistic studies. Previous research has shown that such ratings can be modeled and extrapolated by using dense word-embedding representations. However, due to rater disagreement, considerable amounts of human ratings in published datasets are not reliable. We investigate how such unreliable data influences modeling of concreteness with word embeddings. Study 1 compares fourteen embedding models over three datasets of concreteness ratings, showing that most models achieve high correlations with human ratings, and exhibit low error rates on predictions. Study 2 investigates how exclusion of the less reliable ratings influences the modeling results. It indicates that improved results can be achieved when data is cleaned. Study 3 adds additional conditions over those of study 2 and indicates that the improved results hold only for the cleaned data, and that in the general case removing the less reliable data points is not useful.
+ 2024.cogalex-1.17
+ flor-2024-three
+
+
+ Combining Neo-Structuralist and Cognitive Approaches to Semantics to Build Wordnets for Ancient Languages: Challenges and Perspectives
+ Erica Biagetti
+ Martina Giuliani
+ Silvia Zampetta
+ Silvia Luraghi
+ Chiara Zanchi
+ 151–161
+ This paper addresses challenges encountered in constructing lexical databases, specifically WordNets, for three ancient Indo-European languages: Ancient Greek, Latin, and Sanskrit. The difficulties partly arise from adapting concepts and methodologies designed for modern languages to the construction of lexical resources for ancient ones. A further significant challenge arises from the goal of creating WordNets that not only adhere to a neo-structuralist relational view of meaning but also integrate Cognitive Semantics concepts, aiming for a more realistic representation of meaning. This integration is crucial for facilitating studies in diachronic semantics and lexicology, and representing meaning in such a nuanced manner becomes paramount when constructing language resources for theoretical research, rather than for applied tasks, as is the case with lexical resources for ancient languages. The paper delves into these challenges through a case study focused on the TEMPERATURE conceptual domain in the three languages. It outlines difficulties in distinguishing prototypical and non-prototypical senses, literal and non-literal ones, and, within non-literal meanings, between metaphorical and metonymic ones. Solutions adopted to address these challenges are presented, highlighting the necessity of achieving maximum granularity in meaning representation while maintaining a sustainable workflow for annotators.
+ 2024.cogalex-1.18
+ biagetti-etal-2024-combining
+
+
+ SensoryT5: Infusing Sensorimotor Norms into T5 for Enhanced Fine-grained Emotion Classification
+ Yuhan Xia
+ Qingqing Zhao
+ Yunfei Long
+ Ge Xu
+ Jia Wang
+ 162–174
+ Sensory perception and emotion classification have traditionally been considered separate domains. Yet, the significant influence of sensory experiences on emotional responses is undeniable. The natural language processing (NLP) community has often missed the opportunity to merge sensory knowledge with emotion classification. To address this gap, we propose SensoryT5, a neurocognitive approach that integrates sensory information into the T5 (Text-to-Text Transfer Transformer) model, designed specifically for fine-grained emotion classification. This methodology incorporates sensory cues into the T5’s attention mechanism, enabling a harmonious balance between contextual understanding and sensory awareness. The resulting model amplifies the richness of emotional representations. In rigorous tests across various detailed emotion classification datasets, SensoryT5 showcases improved performance, surpassing both the foundational T5 model and current state-of-the-art works. Notably, SensoryT5’s success signifies a pivotal change in the NLP domain, highlighting the potential influence of neurocognitive data in refining machine learning models’ emotional sensitivity.
+ 2024.cogalex-1.19
+ xia-etal-2024-sensoryt5
+
+
+
diff --git a/data/xml/2024.coling.xml b/data/xml/2024.coling.xml
new file mode 100644
index 0000000000..9f8310e478
--- /dev/null
+++ b/data/xml/2024.coling.xml
@@ -0,0 +1,51 @@
+
+
+
+
+ The 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
+ Torino, Italia
+ May, 2024
+
+
+ https://lrec-coling-2024.org
+
+
+ 2024.bucc-1
+ 2024.cawl-1
+ 2024.cl4health-1
+ 2024.cogalex-1
+ 2024.determit-1
+ 2024.delite-1
+ 2024.dlnld-1
+ 2024.dmr-1
+ 2024.ecnlp-1
+ 2024.eurali-1
+ 2024.finnlp-1
+ 2024.games-1
+ 2024.htres-1
+ 2024.humeval-1
+ 2024.isa-1
+ 2024.ldl-1
+ 2024.legal-1
+ 2024.lt4hala-1
+ 2024.mathnlp-1
+ 2024.mwe-1
+ 2024.neusymbridge-1
+ 2024.nlperspectives-1
+ 2024.osact-1
+ 2024.parlaclarin-1
+ 2024.politicalnlp-1
+ 2024.rail-1
+ 2024.rapid-1
+ 2024.readi-1
+ 2024.rfp-1
+ 2024.safety4convai-1
+ 2024.sigul-1
+ 2024.signlang-1
+ 2024.tdle-1
+ 2024.trac-1
+ 2024.unlp-1
+ 2024.wildre-1
+
+
+
diff --git a/data/xml/2024.computel.xml b/data/xml/2024.computel.xml
index 2aa634a05e..86b3353050 100644
--- a/data/xml/2024.computel.xml
+++ b/data/xml/2024.computel.xml
@@ -36,6 +36,7 @@
Blackfoot is challenging for English speaking instructors and learners to acquire because it exhibits unique pitch patterns. This study presents MeTILDA (Melodic Transcription in Language Documentation and Application) as a solution to teaching pitch patterns distinct from English. Specifically, we explore ways to improve data visualization through a visualized pronunciation teaching guide called Pitch Art. The working materials can be downloaded or stored in the cloud for further use and collaboration. These features aim to help teachers develop curricula for pronunciation learning, and to provide students with an interactive and integrative learning environment to better understand the Blackfoot language and its pronunciation.
2024.computel-1.1
chen-etal-2024-cloud
+
Technology and Language Revitalization: A Roadmap for the Mvskoke Language
@@ -45,6 +46,7 @@
2024.computel-1.2
2024.computel-1.2.SupplementaryMaterial.zip
mainzinger-2024-technology
+
Investigating the productivity of Passamaquoddy medials: A computational approach
@@ -83,6 +85,7 @@
Descriptive linguistics is a sub-field of linguistics that involves the collection and annotation of language resources to describe linguistic phenomena. The transcription of these resources is often described as a tedious task, and Automatic Speech Recognition (ASR) has frequently been employed to support this process. However, the typical research approach to ASR in documentary linguistics often only captures a subset of the field’s diverse reality. In this paper, we focus specifically on one type of data known as grammaticality judgment elicitation in the context of documenting Kréyòl Gwadloupéyen. We show that only a few minutes of speech are enough to fine-tune a model originally trained in French to transcribe segments in Kréyòl.
2024.computel-1.6
le-ferrand-prudhommeaux-2024-automatic
+
Fitting a Square Peg into a Round Hole: Creating a UniMorph dataset of Kanien’kéha Verbs
@@ -111,6 +114,7 @@
We investigate the performance of state-of-the-art neural ASR systems in transcribing audio recordings for Hupa, a critically endangered language of the Hoopa Valley Tribe. We also explore the impact on ASR performance when augmenting a small dataset of gold-standard high-quality transcriptions with a) a larger dataset with transcriptions of lower quality, and b) model-generated transcriptions in a self-training approach. An evaluation of both data augmentation approaches shows that the self-training approach is competitive, producing better WER scores than models trained with no additional data and not lagging far behind models trained with additional lower-quality manual transcriptions: the deterioration in WER score is just 4.85 points when all the additional data is used in experiments with the best performing system, Wav2Vec. These findings have encouraging implications for the use of ASR systems for transcription and language documentation efforts in the Hupa language.
2024.computel-1.9
venkateswaran-liu-2024-looking
+
Creating Digital Learning and Reference Resources for Southern Michif
@@ -136,6 +140,7 @@
We present MunTTS, an end-to-end text-to-speech (TTS) system specifically for Mundari, a low-resource Indian language of the Austro-Asiatic family. Our work addresses the gap in linguistic technology for underrepresented languages by collecting and processing data to build a speech synthesis system. We begin our study by gathering a substantial dataset of Mundari text and speech and training end-to-end speech models. We also delve into the methods used for training our models, ensuring they are efficient and effective despite the data constraints. We evaluate our system with native speakers and objective metrics, demonstrating its potential as a tool for preserving and promoting the Mundari language in the digital age.
2024.computel-1.11
gumma-etal-2024-muntts
+
End-to-End Speech Recognition for Endangered Languages of Nepal
diff --git a/data/xml/2024.delite.xml b/data/xml/2024.delite.xml
new file mode 100644
index 0000000000..7cb464c124
--- /dev/null
+++ b/data/xml/2024.delite.xml
@@ -0,0 +1,98 @@
+
+
+
+
+ Proceedings of the First Workshop on Language-driven Deliberation Technology (DELITE) @ LREC-COLING 2024
+ Annette Hautli-Janisz
+ Gabriella Lapesa
+ Lucas Anastasiou
+ Valentin Gold
+ Anna De Liddo
+ Chris Reed
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.delite-1
+ delite
+
+
+ 2024.delite-1.0
+ delite-2024-language
+
+
+ AQuA – Combining Experts’ and Non-Experts’ Views To Assess Deliberation Quality in Online Discussions Using LLMs
+ Maike Behrendt
+ Stefan Sylvius Wagner
+ Marc Ziegele
+ Lena Wilms
+ Anke Stoll
+ Dominique Heinbach
+ Stefan Harmeling
+ 1–12
+ Measuring the quality of contributions in political online discussions is crucial in deliberation research and computer science. Research has identified various indicators to assess online discussion quality, and with deep learning advancements, automating these measures has become feasible. While some studies focus on analyzing specific quality indicators, a comprehensive quality score incorporating various deliberative aspects is often preferred. In this work, we introduce AQuA, an additive score that calculates a unified deliberative quality score from multiple indices for each discussion post. Unlike other singular scores, AQuA preserves information on the deliberative aspects present in comments, enhancing model transparency. We develop adapter models for 20 deliberative indices, and calculate correlation coefficients between experts’ annotations and the perceived deliberativeness by non-experts to weigh the individual indices into a single deliberative score. We demonstrate that the AQuA score can be computed easily from pre-trained adapters and aligns well with annotations on other datasets that have not been seen during training. The analysis of experts’ vs. non-experts’ annotations confirms theoretical findings in the social science literature.
+ 2024.delite-1.1
+ behrendt-etal-2024-aqua
+
+
+ A Unified LLM-KG Framework to Assist Fact-Checking in Public Deliberation
+ Nikolaos Giarelis
+ Charalampos Mastrokostas
+ Nikos Karacapilidis
+ 13–19
+ Fact-checking plays a crucial role in public deliberation by promoting transparency, accuracy, credibility, and accountability. Aiming to augment the efficiency and adoption of current public deliberation platforms, which mostly rely on the abilities of participants to meaningfully process and interpret the associated content, this paper explores the combination of deep learning and symbolic reasoning. Specifically, it proposes a framework that unifies the capabilities of Large Language Models (LLMs) and Knowledge Graphs (KGs), and reports on an experimental evaluation. This evaluation is conducted through a questionnaire asking users to assess a baseline LLM against the proposed framework, using a series of fact-checking metrics, namely readability, coverage, non-redundancy, and quality. The experimentation results are promising and confirm the potential of combining the capabilities of these two technologies in the context of public deliberation and digital democracy.
+ 2024.delite-1.2
+ giarelis-etal-2024-unified
+
+
+ Can Text Simplification Help to Increase the Acceptance of E-participation?
+ Regina Stodden
+ Phillip Nguyen
+ 20–32
+ This study investigated the effect of text simplification (with and without artificial intelligence support) and the role of participants (author or reader) on the acceptance of e-participation processes. To this end, a near-realistic experimental study with 276 participants was conducted, simulating a participatory budgeting process. The results of our study show, on the one hand, that text simplification and the role of participants have no direct influence on the intention to use e-participation. Although a higher level of participation cannot be achieved by text simplification, our results also show that no negative consequences for usage intention can be expected from text simplification. On the other hand, the results show that people with reading and writing difficulties prefer text simplification for proposals in e-participation.
+ 2024.delite-1.3
+ stodden-nguyen-2024-text
+
+
+ Pitfalls of Conversational LLMs on News Debiasing
+ Ipek Baris Schlicht
+ Defne Altiok
+ Maryanne Taouk
+ Lucie Flek
+ 33–38
+ This paper addresses debiasing in news editing and evaluates the effectiveness of conversational Large Language Models in this task. We designed an evaluation checklist tailored to news editors’ perspectives, obtained generated texts from three popular conversational models using a subset of a publicly available dataset in media bias, and evaluated the texts according to the designed checklist. Furthermore, we examined the models as evaluators for checking the quality of debiased model outputs. Our findings indicate that none of the LLMs are perfect in debiasing. Notably, some models, including ChatGPT, introduced unnecessary changes that may impact the author’s style and create misinformation. Lastly, we show that the models do not perform as proficiently as domain experts in evaluating the quality of debiased outputs.
+ 2024.delite-1.4
+ baris-schlicht-etal-2024-pitfalls
+
+
+ Integrating conflict prevention tools into deliberative democracy online platforms
+ Sara Greco
+ Chiara Jermini
+ 39–44
+ This paper presents a set of preliminary guidelines for conflict prevention developed within the EU-funded research project ORBIS (“Augmenting participation, co-creation, trust and transparency in Deliberative Democracy at all scales”), whose goal is developing online platforms that enable citizens to enhance their participation in democratic processes, through open discussions around important political topics. Based on previous research on communication and argumentation in conflict resolution discourse and on the empirical analysis of discussions around deliberative democracy topics, this paper highlights recurrent interpersonal communication problems that might occur in group discussions around complex topics and that, if not handled well, can lead to conflicts; and introduces a first proposal for solutions to help participants in such discussions, both through technology and with the assistance of human moderators, avoid the development and the escalation of conflicts.
+ 2024.delite-1.5
+ greco-jermini-2024-integrating
+
+
+ A Hybrid Human-AI Approach for Argument Map Creation From Transcripts
+ Lucas Anastasiou
+ Anna De Liddo
+ 45–51
+ In order to overcome challenges of traditional deliberation approaches that often silo information exchange between synchronous and asynchronous modes, thereby hindering effective deliberation, we present a hybrid framework combining Large Language Models (LLMs) and human-in-the-loop curation to generate argument maps from deliberation transcripts. This approach aims to enhance the efficiency and quality of the generated argument maps, promote transparency, and connect the asynchronous and synchronous deliberation modes. Finally, we outline a realistic deliberation scenario where this process can be successfully integrated.
+ 2024.delite-1.6
+ anastasiou-de-liddo-2024-hybrid
+
+
+ Leveraging High-Precision Corpus Queries for Text Classification via Large Language Models
+ Nathan Dykes
+ Stephanie Evert
+ Philipp Heinrich
+ Merlin Humml
+ Lutz Schröder
+ 52–57
+ We use query results from manually designed corpus queries for fine-tuning an LLM to identify argumentative fragments as a text mining task. The resulting model outperforms both an LLM fine-tuned on a relatively large manually annotated gold standard of tweets as well as a rule-based approach. This proof-of-concept study demonstrates the usefulness of corpus queries to generate training data for complex text categorisation tasks, especially if the targeted category has low prevalence (so that a manually annotated gold standard contains only a small number of positive examples).
+ 2024.delite-1.7
+ dykes-etal-2024-leveraging
+
+
+
diff --git a/data/xml/2024.determit.xml b/data/xml/2024.determit.xml
new file mode 100644
index 0000000000..deccd9099d
--- /dev/null
+++ b/data/xml/2024.determit.xml
@@ -0,0 +1,204 @@
+
+
+
+
+ Proceedings of the Workshop on DeTermIt! Evaluating Text Difficulty in a Multilingual Context @ LREC-COLING 2024
+ Giorgio Maria Di Nunzio
+ Federica Vezzani
+ Liana Ermakova
+ Hosein Azarbonyad
+ Jaap Kamps
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.determit-1
+ determit
+ ws
+
+
+ 2024.determit-1.0
+ determit-2024-determit
+
+
+ Reproduction of German Text Simplification Systems
+ Regina Stodden
+ 1–15
+ The paper investigates the reproducibility of various approaches to automatically simplify German texts and identifies key challenges in the process. We reproduce eight sentence simplification systems including rule-based models, fine-tuned models, and prompting of autoregressive models. We highlight three main issues of reproducibility: the impossibility of reproduction due to missing details, code, or restricted access to data/models; variations in reproduction, hindering meaningful comparisons; and discrepancies in evaluation scores between reported and reproduced models. To enhance reproducibility and facilitate model comparison, we recommend the publication of model-related details, including checkpoints, code, and training methodologies. Our study also emphasizes the importance of releasing system generations, when possible, for thorough analysis and better understanding of original works. In our effort to compare reproduced models, we also create a German sentence simplification benchmark of the eight models across six test sets. Overall, the study underscores the significance of transparency, documentation, and diverse training data for advancing reproducibility and meaningful model comparison in automated German text simplification.
+ 2024.determit-1.1
+ stodden-2024-reproduction
+
+
+ Complexity-Aware Scientific Literature Search: Searching for Relevant and Accessible Scientific Text
+ Liana Ermakova
+ Jaap Kamps
+ 16–26
+ Abstract: We conduct a series of experiments on ranking scientific abstracts in response to popular science queries issued by non-expert users. We show that standard IR ranking models optimized on topical relevance are indeed ignoring the individual user’s context and background knowledge. We also demonstrate the viability of complexity-aware retrieval models that retrieve more accessible relevant documents or ensure these are ranked prior to more advanced documents on the topic. More generally, our results help remove some of the barriers to consulting scientific literature by non-experts and hold the potential to promote science literacy in the general public. Lay Summary: In a world of misinformation and disinformation, access to objective evidence-based scientific information is crucial. The general public ignores scientific information due to its perceived complexity, resorting to shallow information on the web or in social media. We analyze the complexity of scientific texts retrieved for a lay person’s topic, and find a great variation in text complexity. A proof of concept complexity-aware search engine is able to retrieve both relevant and accessible scientific information for a layperson’s information need.
+ 2024.determit-1.2
+ ermakova-kamps-2024-complexity
+
+
+ Beyond Sentence-level Text Simplification: Reproducibility Study of Context-Aware Document Simplification
+ Jan Bakker
+ Jaap Kamps
+ 27–38
+ Previous research on automatic text simplification has focused almost exclusively on sentence-level inputs. However, the simplification of full documents cannot be tackled by naively simplifying each sentence in isolation, as this approach fails to preserve the discourse structure of the document. Recent Context-Aware Document Simplification approaches explore various models whose input goes beyond the sentence level. These models achieve state-of-the-art performance on the Newsela-auto dataset, which requires a difficult-to-obtain license to use. We replicate these experiments on an open-source dataset, namely Wiki-auto, and share all training details to make future reproductions easy. Our results validate the claim that models guided by a document-level plan outperform their standard counterparts. However, they do not support the claim that simplification models perform better when they have access to a local document context. We also find that planning models do not generalize well to out-of-domain settings. Lay Summary: We have access to unprecedented amounts of information, yet the most authoritative sources may exceed a user’s language proficiency level. Text simplification technology can change the writing style while preserving the main content. Recent paragraph-level and document-level text simplification approaches outcompete traditional sentence-level approaches, and increase the understandability of complex texts.
+ 2024.determit-1.3
+ bakker-kamps-2024-beyond
+
+
+ Towards Automatic Finnish Text Simplification
+ Anna Dmitrieva
+ Jörg Tiedemann
+ 39–50
+ Automatic text simplification (ATS/TS) models typically require substantial parallel training data. This paper describes our work on expanding the Finnish-Easy Finnish parallel corpus and making baseline simplification models. We discuss different approaches to document and sentence alignment. After finding the optimal alignment methodologies, we increase the amount of document-aligned data 6.5 times and add a sentence-aligned version of the dataset consisting of more than twelve thousand sentence pairs. Using sentence-aligned data, we fine-tune two models for text simplification. The first is mBART, a sequence-to-sequence translation architecture proven to show good results for monolingual translation tasks. The second is the Finnish GPT model, for which we utilize instruction fine-tuning. This work is the first attempt to create simplification models for Finnish using monolingual parallel data in this language. The data has been deposited in the Finnish Language Bank (Kielipankki) and is available for non-commercial use, and the models will be made accessible through either Kielipankki or public repositories such as Huggingface or GitHub.
+ 2024.determit-1.4
+ dmitrieva-tiedemann-2024-towards
+
+
+ A Multilingual Survey of Recent Lexical Complexity Prediction Resources through the Recommendations of the Complex 2.0 Framework
+ Matthew Shardlow
+ Kai North
+ Marcos Zampieri
+ 51–59
+ Lexical complexity prediction is the NLP task aimed at using machine learning to predict the difficulty of a target word in context for a given user or user group. Multiple datasets exist for lexical complexity prediction, many of which have been published recently in diverse languages. In this survey, we discuss nine recent datasets (2018-2024) all of which provide lexical complexity prediction annotations. Particularly, we identified eight languages (French, Spanish, Chinese, German, Russian, Japanese, Turkish and Portuguese) with at least one lexical complexity dataset. We do not consider the English datasets, which have already received significant treatment elsewhere in the literature. To survey these datasets, we use the recommendations of the Complex 2.0 Framework (Shardlow et al., 2022), identifying how the datasets differ along the following dimensions: annotation scale, context, multiple token instances, multiple token annotations, diverse annotators. We conclude with future research challenges arising from our survey of existing lexical complexity prediction datasets.
+ 2024.determit-1.5
+ shardlow-etal-2024-multilingual
+
+
+ Plain Language Summarization of Clinical Trials
+ Polydoros Giannouris
+ Theodoros Myridis
+ Tatiana Passali
+ Grigorios Tsoumakas
+ 60–67
+ Plain language summarization, or lay summarization, is an emerging natural language processing task, aiming to make scientific articles accessible to an audience of non-scientific backgrounds. The healthcare domain can greatly benefit from applications of automatic plain language summarization, as results that concern a large portion of the population are reported in large documents with complex terminology. However, existing corpora for this task are limited in scope, usually covering only conference or journal article abstracts. In this paper, we introduce the task of automated generation of plain language summaries for clinical trials, and construct CARES (Clinical Abstractive Result Extraction and Simplification), the first corresponding dataset. CARES consists of publicly available, human-written summaries of clinical trials conducted by Pfizer. Source text is identified from documents released throughout the life-cycle of the trial, and steps are taken to remove noise and select the appropriate sections. Experiments show that state-of-the-art models achieve satisfactory results in most evaluation metrics.
+ 2024.determit-1.6
+ giannouris-etal-2024-plain
+
+
+ Enhancing Lexical Complexity Prediction through Few-shot Learning with Gpt-3
+ Jenny Alexandra Ortiz-Zambrano
+ César Humberto Espín-Riofrío
+ Arturo Montejo-Ráez
+ 68–76
+ This paper describes an experiment to evaluate the ability of the GPT-3 language model to classify terms regarding their lexical complexity. This was achieved through the creation and evaluation of different versions of the model, text-Davinci-002 and text-Davinci-003, and prompts for few-shot learning to determine the complexity of the words. The results obtained on the CompLex dataset achieve a minimum average error of 0.0856. Although this is not better than the state of the art (which is 0.0609), it is a promising approach to lexical complexity prediction without the need for model fine-tuning.
+ 2024.determit-1.7
+ ortiz-zambrano-etal-2024-enhancing
+
+
+ An Approach towards Unsupervised Text Simplification on Paragraph-Level for German Texts
+ Leon Fruth
+ Robin Jegan
+ Andreas Henrich
+ 77–89
+ Text simplification as a research field has received attention in recent years for English and other languages; however, German text simplification techniques are lacking thus far. We present an unsupervised simplification approach for German texts using reinforcement learning (self-critical sequence training). Our main contributions are the adaptation of an existing method for English, the selection and creation of German corpora for this task, and the customization of rewards for particular aspects of the German language. In our paper, we describe our system and an evaluation, including remaining issues and problems due to the complexity of the German language, as well as directions for future research.
+ 2024.determit-1.8
+ fruth-etal-2024-approach
+
+
+ Simplification Strategies in French Spontaneous Speech
+ Lucía Ormaechea
+ Nikos Tsourakis
+ Didier Schwab
+ Pierrette Bouillon
+ Benjamin Lecouteux
+ 90–102
+ Automatic Text Simplification (ATS) aims at rewriting texts into simpler variants while preserving their original meaning, so they can be more easily understood by different audiences. While ATS has been widely used for written texts, its application to spoken language remains unexplored, even if it is not exempt from difficulty. This study aims to characterize the edit operations performed in order to simplify French transcripts for non-native speakers. To do so, we relied on a data sample randomly extracted from the Orféo-CEFC French spontaneous speech dataset. In the absence of guidelines to direct this process, we adopted an intuitive simplification approach, so as to investigate the crafted simplifications based on expert linguists’ criteria, and to compare them with those produced by a generative AI (namely, ChatGPT). The results, analyzed quantitatively and qualitatively, reveal that the most common edits are deletions, and affect oral production aspects, like restarts or hesitations. Consequently, candidate simplifications are typically register-standardized sentences that solely include the propositional content of the input. The study also examines the alignment between human- and machine-based simplifications, revealing a moderate level of agreement, and highlighting the subjective nature of the task. The findings contribute to understanding the intricacies of simplifying spontaneous spoken language. In addition, the provision of a small-scale parallel dataset derived from such expert simplifications, Propicto-Orféo-Simple, can facilitate the evaluation of speech simplification solutions.
+ 2024.determit-1.9
+ ormaechea-etal-2024-simplification
+
+
+ DARES: Dataset for Arabic Readability Estimation of School Materials
+ Mo El-Haj
+ Sultan Almujaiwel
+ Damith Premasiri
+ Tharindu Ranasinghe
+ Ruslan Mitkov
+ 103–113
+ This research introduces DARES, a dataset for assessing the readability of Arabic text in Saudi school materials. DARES comprises 13,335 instances from textbooks used in 2021 and contains two subtasks: (a) coarse-grained readability assessment, where the text is classified into different educational levels such as primary and secondary; (b) fine-grained readability assessment, where the text is classified into individual grades. We fine-tuned five transformer models that support Arabic and found that CAMeLBERTmix performed the best in all input settings. Evaluation results showed high performance for the coarse-grained readability assessment task, achieving a weighted F1 score of 0.91 and a macro F1 score of 0.79. The fine-grained task achieved a weighted F1 score of 0.68 and a macro F1 score of 0.55. These findings demonstrate the potential of our approach for advancing Arabic text readability assessment in education, with implications for future innovations in the field.
+ 2024.determit-1.10
+ el-haj-etal-2024-dares
+
+
+ Legal Text Reader Profiling: Evidences from Eye Tracking and Surprisal Based Analysis
+ Calogero J. Scozzaro
+ Davide Colla
+ Matteo Delsanto
+ Antonio Mastropaolo
+ Enrico Mensa
+ Luisa Revelli
+ Daniele P. Radicioni
+ 114–124
+ Reading movements and times are a precious cue for following a reader’s strategy and for tracking the underlying effort in text processing. To date, many approaches are being devised to simplify texts, in order to overcome difficulties stemming from sentences that are obscure, ambiguous, or in need of clarification. In the legal domain, ensuring the clarity of norms and regulations is of the utmost importance, as the full understanding of such documents lies at the foundation of core social obligations and rights. This task requires determining which utterances and text excerpts are difficult for which (sort of) reader. Such an investigation is the aim of the present work. We propose a preliminary study based on eye-tracking data from 61 readers, with a focus on identifying different reader profiles and on predicting our readers’ reading times.
+ 2024.determit-1.11
+ scozzaro-etal-2024-legal
+
+
+ The Simplification of the Language of Public Administration: The Case of Ombudsman Institutions
+ GabrielGonzalez-Delgado
+ BorjaNavarro-Colorado
+ 125–133
+ Language produced by Public Administrations has crucial implications for citizens’ lives. However, its syntactic complexity and its use of legal jargon, among other factors, make it difficult for laypeople and certain target audiences to understand. The NLP task of Automatic Text Simplification (ATS) can contribute to the necessary simplification of this technical language. For that purpose, specialized parallel datasets of complex-simple pairs need to be developed for the training of these ATS systems. In this position paper, an on-going project is presented, whose main objectives are (a) to extensively analyze the syntactic, lexical, and discursive features of the language of English-speaking ombudsmen, as samples of public administrative language, with special attention to those characteristics that pose a threat to comprehension, and (b) to develop the OmbudsCorpus, a parallel corpus of complex-simple supra-sentential fragments from ombudsmen’s case reports that have been manually simplified by professionals and annotated with standardized simplification operations. This research endeavor aims to provide a deeper understanding of the simplification process and to enhance the training of ATS systems specialized in administrative texts.
+ 2024.determit-1.12
+ gonzalez-delgado-navarro-colorado-2024-simplification
+
+
+ Term Variation in Institutional Languages: Degrees of Specialization in Municipal Waste Management Terminology
+ NicolaCirillo
+ DanielaVellutino
+ 134–140
+ Institutional Italian is a variety of Italian used in the official communications of institutions, especially in public administrations. Besides legal and administrative languages, it comprises the language used in websites, social media and advertising material produced by public administrations. To understand the lexical profile of institutional languages completely, standard measures of lexical complexity, like the type-token ratio and the percentage of basic vocabulary, should be complemented with the examination of the terminological variation. This study compares the terminology of three types of institutional texts: administrative acts, technical-operational texts, and informative texts. In particular, we collected 86 terms with various degrees of specialization and analysed their distribution within the subcorpora of ItaIst-DdAC_GRU, a corpus composed of institutional texts drafted by Italian municipalities about municipal waste management. Results suggest that administrative acts employ high-specialization terms compliant with the law, often in the form of acronyms. Conversely, informative texts contain more low-specialization terms, privileging single-word terms to remain self-contained. Finally, the terminology of technical-operational texts is characterised by standardized and formulaic phrases.
+ 2024.determit-1.13
+ cirillo-vellutino-2024-term
+
+
+ LARGEMED: A Resource for Identifying and Generating Paraphrases for French Medical Terms
+ IoanaBuhnila
+ AmaliaTodirascu
+ 141–151
+ This article presents a method extending an existing French corpus of paraphrases of medical terms ANONYMOUS with new data from Web archives created during the Covid-19 pandemic. Our method semi-automatically detects new terms and paraphrase markers introducing paraphrases from these Web archives, followed by a manual annotation step to identify paraphrases and their lexical and semantic properties. The extended large corpus LARGEMED could be used for automatic medical text simplification for patients and their families. To automatise data collection, we propose two experiments. The first experiment uses the new LARGEMED dataset to train a binary classifier aiming to detect new sentences containing possible paraphrases. The second experiment aims to use correct paraphrases to train a model for paraphrase generation, by adapting T5 Language Model to the paraphrase generation task using an adversarial algorithm.
+ 2024.determit-1.14
+ buhnila-todirascu-2024-largemed
+
+
+ Clearer Governmental Communication: Text Simplification with ChatGPT Evaluated by Quantitative and Qualitative Research
+ NadineBeks van Raaij
+ DaanKolkman
+ KseniaPodoynitsyna
+ 152–178
+ This research investigates the application of ChatGPT for the simplification of Dutch government letters, aiming to enhance their comprehensibility without compromising legal accuracy. We use a three-stage mixed method evaluation procedure to compare the performance of a naive approach, RoBERTA, and ChatGPT. We select the six most complicated letters from a corpus of 200 letters and use the three approaches to simplify them. First, we compare their scores on four evaluation metrics (ROUGE, BLEU, BLEURT, and LiNT), then we assess the simplifications with a legal and linguistic expert. Finally we investigate the performance of ChatGPT in a randomized controlled trial with 72 participants. Our findings reveal that ChatGPT significantly improves the readability of government letters, demonstrating over a 20% increase in comprehensibility scores and a 19% increase in correct question answering among participants. We also demonstrate the importance of a robust evaluation procedure.
+ 2024.determit-1.15
+ beks-van-raaij-etal-2024-clearer
+
+
+ Legal Science and Compute Science: A Preliminary Discussions on How to Represent the “Penumbra” Cone with AI
+ AngelaCondello
+ Giorgio MariaDi Nunzio
+ 179–184
+ Legal science encounters significant challenges with the widespread integration of AI software across various legal operations. The distinction between signs, senses, and references from a linguistic point of view, as drawn by Gottlob Frege, underscores the complexity of legal language, especially in multilingual contexts like the European Union. In this paper, we describe the problems of legal terminology, examining the “penumbra” problem through Herbert Hart’s legal theory of meaning. We also analyze the feasibility of training automatic systems to handle conflicts between different interpretations of legal norms, particularly in multilingual legal systems. By examining the transformative impact of Artificial Intelligence on traditional legal practices, this research contributes to the theoretical discussion about the exploration of innovative methodologies for simplifying complex terminologies without compromising meaning.
+ 2024.determit-1.16
+ condello-di-nunzio-2024-legal
+
+
+ Simpler Becomes Harder: Do LLMs Exhibit a Coherent Behavior on Simplified Corpora?
+ MiriamAnschütz
+ EdoardoMosca
+ GeorgGroh
+ 185–195
+ Text simplification seeks to improve readability while retaining the original content and meaning. Our study investigates whether pre-trained classifiers also maintain such coherence by comparing their predictions on both original and simplified inputs. We conduct experiments using 11 pre-trained models, including BERT and OpenAI’s GPT 3.5, across six datasets spanning three languages. Additionally, we conduct a detailed analysis of the correlation between prediction change rates and simplification types/strengths. Our findings reveal alarming inconsistencies across all languages and models. If not promptly addressed, simplified inputs can be easily exploited to craft zero-iteration model-agnostic adversarial attacks with success rates of up to 50%.
+ 2024.determit-1.17
+ anschutz-etal-2024-simpler
+
+
+ Pre-Gamus: Reducing Complexity of Scientific Literature as a Support against Misinformation
+ NicoColic
+ Jin-DongKim
+ FabioRinaldi
+ 196–201
+ Scientific literature encodes a wealth of knowledge relevant to various users. However, the complexity of scientific jargon makes it inaccessible to all but domain specialists. It would be helpful for different types of people to be able to get at least a gist of a paper. Biomedical practitioners often find it difficult to keep up with the information load; but even lay people would benefit from scientific information, for example to dispel medical misconceptions. Besides, in many countries, familiarity with English is limited, let alone scientific English, even among professionals. All this points to the need for simplified access to the scientific literature. We thus present an application aimed at solving this problem, which is capable of summarising scientific text in a way that is tailored to specific types of users, and in their native language. For this objective, we used an LLM that our system queries using user-selected parameters. We conducted an informal evaluation of this prototype using a questionnaire in 3 different languages.
+ 2024.determit-1.18
+ colic-etal-2024-pre
+
+
+
diff --git a/data/xml/2024.dlnld.xml b/data/xml/2024.dlnld.xml
new file mode 100644
index 0000000000..68a2ac7460
--- /dev/null
+++ b/data/xml/2024.dlnld.xml
@@ -0,0 +1,104 @@
+
+
+
+
+ Proceedings of the Workshop on Deep Learning and Linked Data (DLnLD) @ LREC-COLING 2024
+ GillesSérasset
+ Hugo GonçaloOliveira
+ Giedre ValunaiteOleskeviciene
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.dlnld-1
+ dlnld
+ ws
+
+
+ 2024.dlnld-1.0
+ dlnld-2024-deep
+
+
+ Investigating the Impact of Different Graph Representations for Relation Extraction with Graph Neural Networks
+ MoritzBlum
+ GennaroNolano
+ BasilEll
+ PhilippCimiano
+ 1–13
+ Graph Neural Networks (GNNs) have been applied successfully to various NLP tasks, particularly Relation Extraction (RE). Even though most of these approaches rely on the syntactic dependency tree of a sentence to derive a graph representation, the impact of this choice compared to other possible graph representations has not been evaluated. We examine the effect of representing text through different graph representations for GNNs applied to RE, considering, e.g., a fully connected graph of tokens, a graph of semantic role structures, and combinations thereof. We further examine the impact of injecting background knowledge from Knowledge Graphs (KGs) into the graph representation to achieve enhanced graph representations. Our results show that combining multiple graph representations can improve the model’s predictions. Moreover, the integration of background knowledge positively impacts scores, as enhancing the text graphs with Wikidata features or WordNet features can lead to an improvement of close to 0.1 points in F1.
+ 2024.dlnld-1.1
+ blum-etal-2024-investigating
+
+
+ TaxoCritic: Exploring Credit Assignment in Taxonomy Induction with Multi-Critic Reinforcement Learning
+ InjySarhan
+ BendegúzToth
+ PabloMosteiro
+ ShihanWang
+ 14–30
+ Taxonomies can serve as a vital foundation for several downstream tasks such as information retrieval and question answering, yet manual construction limits coverage and full potential. Automatic taxonomy induction, particularly using deep Reinforcement Learning (RL), is underexplored in Natural Language Processing (NLP). To address this gap, we present TaxoCritic, a novel approach that leverages deep multi-critic RL agents for taxonomy induction while incorporating credit assignment mechanisms. Our system uniquely assesses different sub-actions within the induction process, providing a granular analysis that aids in the precise attribution of credit and blame. We evaluate the effectiveness of multi-critic algorithms in experiments regarding both accuracy and robustness performance in edge identification. By providing a detailed comparison with state-of-the-art models and highlighting the strengths and limitations of our method, we aim to contribute to the ongoing
+ 2024.dlnld-1.2
+ sarhan-etal-2024-taxocritic
+
+
+ Combining Deep Learning Models and Lexical Linked Data: Some Insights from the Development of a Multilingual News Named Entity Recognition and Linking Dataset
+ EmmanuelCartier
+ EmilePeetermans
+ 31–44
+ This paper presents the methodology and outcomes of a multilingual news Named Entity Recognition and Linking benchmark that leverages both deep learning approaches, by using a fine-tuned transformer model to detect mentions of persons, locations and organisations in text, and Linguistic Linked Open Data, through the use of Wikidata to disambiguate mentions and link them to ontology entries. It shows the advantages of combining both approaches, not only for building the benchmark but also for fine-tuning detection models. We also outline several research perspectives for improving the accuracy of a combined system and going further in leveraging these complementary approaches.
+ 2024.dlnld-1.3
+ cartier-peetermans-2024-combining
+
+
+ Deductive Verification of LLM Generated SPARQL Queries
+ AlexandreRademaker
+ GuilhermeLima
+ Sandro RamaFiorini
+ Viviane Torresda Silva
+ 45–52
+ Considering the increasing applications of Large Language Models (LLMs) to many natural language tasks, this paper presents preliminary findings on developing a verification component for detecting hallucinations of an LLM that produces SPARQL queries from natural language questions. We suggest a logic-based deductive verification of the generated SPARQL query by checking if the original NL question’s deep semantic representation entails the SPARQL’s semantic representation.
+ 2024.dlnld-1.4
+ rademaker-etal-2024-deductive
+
+
+ How to Turn Card Catalogs into LLM Fodder
+ Mary AnnTan
+ ShufanJiang
+ HaraldSack
+ 53–65
+ Bibliographical metadata collections describing pre-modern objects suffer from incompleteness and inaccuracies. This hampers the identification of literary works. In addition, titles often contain voluminous descriptive texts that do not adhere to contemporary title conventions. This paper explores several NLP approaches where greater textual length in titles is leveraged to enhance descriptive information.
+ 2024.dlnld-1.5
+ tan-etal-2024-turn
+
+
+ Evaluating Large Language Models for Linguistic Linked Data Generation
+ Maria Piadi Buono
+ BlerinaSpahiu
+ VerginicaBarbu Mititelu
+ 66–75
+ Large language models (LLMs) have revolutionized human-machine interaction with their ability to converse and perform various language tasks. This study investigates the potential of LLMs for knowledge formalization using well-defined vocabularies, specifically focusing on OntoLex-Lemon. As a preliminary exploration, we test four languages (English, Italian, Albanian, Romanian) and analyze the formalization quality of nine words with varying characteristics applying a multidimensional evaluation approach. While manual validation provided initial insights, it highlights the need for developing scalable evaluation methods for future large-scale experiments. This research aims to initiate a discussion on the potential and challenges of utilizing LLMs for knowledge formalization within the Semantic Web framework.
+ 2024.dlnld-1.6
+ di-buono-etal-2024-evaluating
+
+
+ Towards Automated Evaluation of Knowledge Encoded in Large Language Models
+ Bruno Carlos LuísFerreira
+ CatarinaSilva
+ HugoGonçalo Oliveira
+ 76–85
+ Large Language Models (LLMs) have a significant user base and are gaining increasing interest and impact across various domains. Given their expanding influence, it is crucial to implement appropriate guardrails or controls to ensure ethical and responsible use. In this paper, we propose to automate the evaluation of the knowledge stored in LLMs. This is achieved by generating datasets tailored for this specific purpose, in any selected domain. Our approach consists of four major steps: (i) extraction of relevant entities; (ii) gathering of domain properties; (iii) dataset generation; and (iv) model evaluation. In order to materialize this vision, we experimented with tools and resources for entity linking, knowledge acquisition, classification and prompt generation, yielding valuable insights and lessons. The generation of datasets for domain-specific model evaluation has successfully demonstrated that the approach can become a future tool for evaluating LLM “black boxes” and moving them toward human-interpretable knowledge bases.
+ 2024.dlnld-1.7
+ ferreira-etal-2024-towards
+
+
+ Self-Evaluation of Generative AI Prompts for Linguistic Linked Open Data Modelling in Diachronic Analysis
+ FlorentinaArmaselu
+ ChayaLiebeskind
+ GiedreValunaite Oleskeviciene
+ 86–91
+ This article addresses the question of evaluating generative AI prompts designed for specific tasks such as linguistic linked open data modelling and refining of word embedding results. The prompts were created to assist the pre-modelling phase in the construction of LLODIA, a linguistic linked open data model for diachronic analysis. We present a self-evaluation framework based on the method known in the literature as LLM-Eval. The discussion includes prompts related to the RDF-XML conception of the model, as well as neighbour list refinement, dictionary alignment and contextualisation for the term revolution in French, Hebrew and Lithuanian, as a proof of concept.
+ 2024.dlnld-1.8
+ armaselu-etal-2024-self
+
+
+
diff --git a/data/xml/2024.dmr.xml b/data/xml/2024.dmr.xml
new file mode 100644
index 0000000000..ed29b4041a
--- /dev/null
+++ b/data/xml/2024.dmr.xml
@@ -0,0 +1,201 @@
+
+
+
+
+ Proceedings of the Fifth International Workshop on Designing Meaning Representations @ LREC-COLING 2024
+ ClaireBonial
+ JuliaBonn
+ Jena D.Hwang
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.dmr-1
+ dmr
+ ws
+
+
+ 2024.dmr-1.0
+ dmr-2024-international
+
+
+ PropBank-Powered Data Creation: Utilizing Sense-Role Labelling to Generate Disaster Scenario Data
+ Mollie FrancesShichman
+ ClaireBonial
+ Taylor A.Hudson
+ AustinBlodgett
+ FrancisFerraro
+ RachelRudinger
+ 1–10
+ For human-robot dialogue in a search-and-rescue scenario, a strong knowledge of the conditions and objects a robot will face is essential for effective interpretation of natural language instructions. In order to utilize the power of large language models without overwhelming the limited storage capacity of a robot, we propose PropBank-Powered Data Creation. PropBank-Powered Data Creation is an expert-in-the-loop data generation pipeline which creates training data for disaster-specific language models. We leverage semantic role labeling and Rich Event Ontology resources to efficiently develop seed sentences for fine-tuning a smaller, targeted model that could operate onboard a robot for disaster relief. We developed 32 sentence templates, which we used to make 2 seed datasets of 175 instructions for earthquake search and rescue and train derailment response. We further leverage our seed datasets as evaluation data to test our baseline fine-tuned models.
+ 2024.dmr-1.1
+ shichman-etal-2024-propbank
+
+
+ Aspect Variability and the Annotation of Aspect in the IMAGACT Ontology of Action
+ MassimoMoneglia
+ RossellaVarvara
+ 11–19
+ This paper highlights some theoretical and quantitative issues related to the representation and annotation of aspectual meaning in the IMAGACT corpus-based multimodal ontology of action. Given the multimodal nature of this ontology, in which actions are represented through both prototypical visual scenes and linguistic captions, the annotation of aspect in this resource allows us to draw some important considerations about the relation between aspectual meaning and eventualities. The annotation procedure is reported, and quantitative data show that, in both the English and Italian corpora, many verbs present aspectual variation, and many eventualities can be represented by locally equivalent verbs with different aspect. The reason why verb aspectual class may vary is investigated. Our analysis makes it once more evident that verbs may vary their aspectual properties with respect not only to their argument structure but, more precisely, to the inner qualities of the eventualities they express. Crucially, when eventualities are expressed by equivalent verbs with different aspectual properties, the verbs put into focus different parts of the structure of the eventuality.
+ 2024.dmr-1.2
+ moneglia-varvara-2024-aspect
+
+
+ NoVRol: A semantic role lexicon of Norwegian verbs
+ HenrikTorgersen
+ Erlend Ø.Ravnanger
+ LarsHellan
+ DagHaug
+ 20–29
+ In this paper, we describe NoVRol, a semantic role lexicon of Norwegian verbs. We start from the NorVal valency lexicon, which describes the syntactic frames of 7,400 verbs. We then enrich each of these frames by annotating, based on the VerbNet annotation scheme, each argument of the verb with the semantic role that it gets. We also encode the syntactic roles of the arguments based on the UD annotation scheme. Our resource will facilitate future research on Norwegian verbs, and can at a future stage be expanded to a full VerbNet.
+ 2024.dmr-1.3
+ torgersen-etal-2024-novrol
+
+
+ Expanding Russian PropBank: Challenges and Insights for Developing New SRL Resources
+ SkatjeMyers
+ RomanKhamov
+ AdamPollins
+ RebekahTozier
+ OlgaBabko-Malaya
+ MarthaPalmer
+ 30–38
+ Semantic role labeling (SRL) resources, such as Proposition Bank (PropBank), provide useful input to downstream applications. In this paper we present some challenges and insights we learned while expanding the previously developed Russian PropBank. This new effort involved annotation and adjudication of all predicates within a subset of the prior work in order to provide a test corpus for future applications. We discuss a number of new issues that arose while developing our PropBank for Russian as well as our solutions. Framing issues include: distinguishing between morphological processes that warrant new frames, differentiating between modal verbs and predicate verbs, and maintaining accurate representations of a given language’s semantics. Annotation issues include disagreements derived from variability in Universal Dependency parses and semantic ambiguity within the text. Finally, we demonstrate how Russian sentence structures reveal inherent limitations to PropBank’s ability to capture semantic data. These discussions should prove useful to anyone developing a PropBank or similar SRL resources for a new language.
+ 2024.dmr-1.4
+ myers-etal-2024-expanding
+
+
+ Unveiling Semantic Information in Sentence Embeddings
+ LeixinZhang
+ DavidBurian
+ VojtěchJohn
+ OndřejBojar
+ 39–47
+ This study evaluates the extent to which semantic information is preserved within sentence embeddings generated by state-of-the-art sentence embedding models: SBERT and LaBSE. Specifically, we analyzed 13 semantic attributes in sentence embeddings. Our findings indicate that some semantic features (such as tense-related classes) can be decoded from sentence embedding representations. Additionally, we identify a limitation of current sentence embedding models: inferring meaning beyond the lexical level has proven to be difficult.
+ 2024.dmr-1.5
+ zhang-etal-2024-unveiling
+
+
+ A Quantum Theory of Terms and New Challenges to Meaning Representation of Quanterms
+ DiegoBurgos
+ 48–53
+ This article discusses the challenges to meaning representation of terms posed by a quantum theory of terms (QTT) that was recently reported. We first summarize this theory and then highlight the difficulties of representing quanterms, the name we coined for terms as viewed by the QTT, namely as quantum systems, by analogy with quantum objects in quantum mechanics. We briefly summarize the representation practices followed to date to record and represent terminology. We use findings reported in the literature to model both terms and quanterms, and found that current representations of terms in specialized repositories are collapsed quanterms at the expense of other states of the original quanterm. In this work, both quanterms and collapsed quanterms are mathematically modelled following formulations used in quantum mechanics. These formulations suggest that representations of quanterms need to include information about the probabilities of quanterm states and the role they play in the entanglement of terms for phenomena such as specialized collocations.
+ 2024.dmr-1.6
+ burgos-2024-quantum
+
+
+ VOLARE - Visual Ontological LAnguage REpresentation
+ WernerWiniwarter
+ 54–65
+ In this paper, we introduce a novel meaning representation, which is based on AMR but extends it towards a visual ontological representation. We visualize concepts by representative images, and roles by emojis. All concepts are identified either by PropBank rolesets, Wikipedia page titles, WordNet synsets, or Wikidata lexeme senses. We have developed a Web-based annotation environment enabled by augmented browsing and interactive diagramming. As a first application, we have implemented a multilingual annotation solution by using English as the anchor language and comparing it with the French and Japanese language versions. To this end, we have extended our representation with a translation deviation annotation to document the differences between the language versions. The intended user groups are, besides professional translators and interpreters, students of translation, language, and literary studies. We describe a first use case in which we use novels by French authors and compare them with their English and Japanese translations. The main motivation for choosing Japanese is the soaring popularity of Japanese courses at our university and the particular challenges involved in trying to master this language.
+ 2024.dmr-1.7
+ winiwarter-2024-volare
+
+
+ YARN is All You Knit: Encoding Multiple Semantic Phenomena with Layers
+ SiyanaPavlova
+ MaximeAmblard
+ BrunoGuillaume
+ 66–76
+ In this paper, we present the first version of YARN, a new semantic representation formalism. We propose this new formalism to unify the advantages of logic-based formalisms while retaining direct interpretation, making it widely usable. YARN is rooted in the encoding of different semantic phenomena as separate layers. We begin by presenting a formal definition of the mathematical structure that constitutes YARN. We then illustrate with concrete examples how this structure can be used in the context of semantic representation for encoding multiple phenomena (such as modality, negation and quantification) as layers built on top of a central predicate-argument structure. The benefit of YARN is that it allows for the independent annotation and analysis of different phenomena as they are easy to “switch off”. Furthermore, we have explored YARN’s ability to encode simple interactions between phenomena. We wrap up the work presented by a discussion of some of the interesting observations made during the development of YARN so far and outline our extensive future plans for this formalism.
+ 2024.dmr-1.8
+ pavlova-etal-2024-yarn
+
+
+ Argument Sharing in Meaning Representation Parsing
+ MajaBuljan
+ StephanOepen
+ LiljaØvrelid
+ 77–87
+ We present a contrastive study of argument sharing across three graph-based meaning representation frameworks, where semantically shared arguments manifest as reentrant graph nodes. For a state-of-the-art graph parser, we observe how parser performance – in terms of output quality – covaries with overall graph complexity, on the one hand, and with the presence of different types of reentrancies, on the other. We identify common linguistic phenomena that give rise to shared arguments, and therefore node reentrancies, through a small-scale and partially automated annotation study and a parallel error analysis of actual parser outputs. Our results provide new insights into the distribution of different types of reentrancies in meaning representation graphs for three distinct frameworks, as well as into the effects that these structures have on parser performance, thus suggesting both novel cross-framework generalisations and avenues for focussed parser development.
+ 2024.dmr-1.9
+ buljan-etal-2024-argument
+
+
+ Mapping PropBank Argument Labels to Czech Verbal Valency
+ JanHajič
+ EvaFučíková
+ MarketaLopatkova
+ ZdeňkaUrešová
+ 88–100
+ For many years, there have been attempts to compare predicate-argument labeling schemas across formalisms, typically under dependency assumptions (even if annotation by these schemas could have been performed on either constituent-based specifications or dependency ones). Given the growing number of resources that link various lexical resources to one another, as well as parallel corpora (with or without annotation), it is now possible to do more in-depth studies of those correspondences. We present here a high-coverage pilot study mapping the labeling system used in PropBank (for English) to Czech, which has so far mainly used valency lexicons (in several closely related forms) for annotation projects, under a different level of specification and different theoretical assumptions. The purpose of this study is both theoretical (comparing the argument labeling schemes) and practical (to be able to annotate Czech under the standard UMR specifications).
+ 2024.dmr-1.10
+ hajic-etal-2024-mapping
+
+
+ Lexicalized Meaning Representation (LMR)
+ JorgeBaptista
+ SóniaReis
+ JoãoDias
+ PedroSantos
+ 101–111
+ This paper presents an adaptation of the Abstract Meaning Representation (AMR) framework for European Portuguese. This adaptation, referred to as Lexicalized Meaning Representation (LMR), was deemed necessary to address specific challenges posed by the grammar of the language, as well as various linguistic issues raised by the current version of AMR annotation guidelines. Some of these aspects stemmed from the use of a notation similar to AMR to represent real texts from the legal domain, enabling its use in Natural Language Processing (NLP) applications. In this context, several aspects of AMR were significantly simplified (e.g., the representation of multi-word expressions, named entities, and temporal expressions), while others were introduced, with efforts made to maintain the representation scheme as compatible as possible with standard AMR notation.
+ 2024.dmr-1.11
+ baptista-etal-2024-lexicalized
+
+
+ Adjudicating LLMs as PropBank Adjudicators
+ JuliaBonn
+ HarishTayyar Madabushi
+ Jena D.Hwang
+ ClaireBonial
+ 112–123
+ We evaluate the ability of large language models (LLMs) to provide PropBank semantic role label annotations across different realizations of the same verbs in transitive, intransitive, and middle voice constructions. In order to assess the meta-linguistic capabilities of LLMs as well as their ability to glean such capabilities through in-context learning, we evaluate the models in a zero-shot setting, in a setting where it is given three examples of another verb used in transitive, intransitive, and middle voice constructions, and finally in a setting where it is given the examples as well as the correct sense and roleset information. We find that zero-shot knowledge of PropBank annotation is almost nonexistent. The largest model evaluated, GPT-4, achieves the best performance in the setting where it is given both examples and the correct roleset in the prompt, demonstrating that larger models can ascertain some meta-linguistic capabilities through in-context learning. However, even in this setting, which is simpler than the task of a human in PropBank annotation, the model achieves only 48% accuracy in marking numbered arguments correctly. To ensure transparency and reproducibility, we publicly release our dataset and model responses.
+ 2024.dmr-1.12
+ bonn-etal-2024-adjudicating
+
+
+ Extending VerbNet’s Verb-Specific Features to Enhance Selectional Preferences of Semantic Roles
+ Susan WindischBrown
+ 124–130
+ This work proposes expanding the thematic role selectional preferences used in the lexical resource VerbNet as a way to increase the available semantic information in the resource, induce semantically-based subclasses for the more generic VerbNet classes, and create new links across classes. The addition of verb-specific features in the latest version of VerbNet provides a means for adding more specific selectional preferences based on the meaning of a class’s individual member verbs. These features could refine both the instantiated class roles and the new implicit roles introduced in VerbNet version 4. We suggest 49 classes that would benefit from 111 verb-specific selectional preferences and explain how they would enhance VerbNet’s semantic representations.
+ 2024.dmr-1.13
+ brown-2024-extending
+
+
+ Chinese UMR annotation: Can LLMs help?
+ HaiboSun
+ NianwenXue
+ JinZhao
+ LiuluYue
+ YaoSun
+ KeerXu
+ JiaweiWu
+ 131–139
+ We explore using LLMs, GPT-4 specifically, to generate draft sentence-level Chinese Uniform Meaning Representations (UMRs) that human annotators can revise to speed up the UMR annotation process. In this study, we use few-shot learning and Think-Aloud prompting to guide GPT-4 to generate sentence-level graphs of UMR. Our experimental results show that compared with annotating UMRs from scratch, using LLMs as a preprocessing step reduces the annotation time by two thirds on average. This indicates that there is great potential for integrating LLMs into the pipeline for complicated semantic annotation tasks.
+ 2024.dmr-1.14
+ sun-etal-2024-chinese
+
+
+ Accelerating UMR Adoption: Neuro-Symbolic Conversion from AMR-to-UMR with Low Supervision
+ Claire BenetPost
+ Marie C.McGregor
+ Maria LeonorPacheco
+ AlexisPalmer
+ 140–150
+ Despite Uniform Meaning Representation’s (UMR) potential for cross-lingual semantics, limited annotated data has hindered its adoption. There are large datasets of English AMRs (Abstract Meaning Representations), but the process of converting AMR graphs to UMR graphs is non-trivial. In this paper we address a complex piece of that conversion process, namely cases where one AMR role can be mapped to multiple UMR roles through a non-deterministic process. We propose a neuro-symbolic method for role conversion, integrating animacy parsing and logic rules to guide a neural network, and minimizing human intervention. On test data, the model achieves promising accuracy, highlighting its potential to accelerate AMR-to-UMR conversion. Future work includes expanding animacy parsing, incorporating human feedback, and applying the method to broader aspects of conversion. This research demonstrates the benefits of combining symbolic and neural approaches for complex semantic tasks.
+ 2024.dmr-1.15
+ post-etal-2024-accelerating
+
+
+ The Relative Clauses AMR Parsers Hate Most
+ XiulinYang
+ NathanSchneider
+ 151–161
+ This paper evaluates how well English Abstract Meaning Representation parsers process an important and frequent kind of Long-Distance Dependency construction, namely, relative clauses (RCs). On two syntactically parsed datasets, we evaluate five AMR parsers at recovering the semantic reentrancies triggered by different syntactic subtypes of relative clauses. Our findings reveal a general difficulty among parsers at predicting such reentrancies, with recall below 64% on the EWT corpus. The sequence-to-sequence models (regardless of whether structural biases were included in training) outperform the compositional model. An analysis by relative clause subtype shows that passive subject RCs are the easiest, and oblique and reduced RCs the most challenging, for AMR parsers.
+ 2024.dmr-1.16
+ yang-schneider-2024-relative
+
+
+ Gaining More Insight into Neural Semantic Parsing with Challenging Benchmarks
+ XiaoZhang
+ ChunliuWang
+ Rikvan Noord
+ JohanBos
+ 162–175
+ The Parallel Meaning Bank (PMB) serves as a corpus for semantic processing with a focus on semantic parsing and text generation. Currently, we witness an excellent performance of neural parsers and generators on the PMB. This might suggest that such semantic processing tasks have by and large been solved. We argue that this is not the case and that performance scores from the past on the PMB are inflated by non-optimal data splits and test sets that are too easy. In response, we introduce several changes. First, instead of the prior random split, we propose a more systematic splitting approach to improve the reliability of the standard test data. Second, except for the standard test set, we also propose two challenge sets: one with longer texts including discourse structure, and one that addresses compositional generalization. We evaluate five neural models for semantic parsing and meaning-to-text generation. Our results show that model performance declines (in some cases dramatically) on the challenge sets, revealing the limitations of neural models when confronting such challenges.
+ 2024.dmr-1.17
+ zhang-etal-2024-gaining
+
+
+
diff --git a/data/xml/2024.dravidianlangtech.xml b/data/xml/2024.dravidianlangtech.xml
index aedafda436..27d20a311c 100644
--- a/data/xml/2024.dravidianlangtech.xml
+++ b/data/xml/2024.dravidianlangtech.xml
@@ -31,6 +31,7 @@
Accented speech classification plays a vital role in the advancement of high-quality automatic speech recognition (ASR) technology. For certain applications, like multi-accented speech classification, it is not always viable to obtain data with accent variation, especially for resource-poor languages. This is one of the major reasons that contribute to the underperformance of speech classification systems. Therefore, in order to handle speech variability in Indian language speaker accents, we propose a few-shot learning paradigm in this study. It learns generic feature embeddings using an encoder from a pre-trained Whisper model and a classification head for classification. The model is refined using LLM fine-tuning techniques, such as LoRA and QLoRA, for the six Indian English accents in the Indic Accent Dataset. The experimental findings show that the accuracy of the model is greatly increased by the few-shot learning paradigm’s effectiveness combined with LLM fine-tuning techniques. In optimal settings, the model’s accuracy can reach 94% when the trainable parameters are set to 5%.
2024.dravidianlangtech-1.1
r-etal-2024-shot
+
Neural Machine Translation for Malayalam Paraphrase Generation
@@ -41,6 +42,7 @@
This study explores four methods of generating paraphrases in Malayalam, utilizing resources available for English paraphrasing and pre-trained Neural Machine Translation (NMT) models. We evaluate the resulting paraphrases using both automated metrics, such as BLEU, METEOR, and cosine similarity, as well as human annotation. Our findings suggest that automated evaluation measures may not be fully appropriate for Malayalam, as they do not consistently align with human judgment. This discrepancy underscores the need for more nuanced paraphrase evaluation approaches especially for highly agglutinative languages.
2024.dravidianlangtech-1.2
varghese-etal-2024-neural
+
From Dataset to Detection: A Comprehensive Approach to Combating Malayalam Fake News
@@ -54,6 +56,7 @@
Identifying fake news hidden as real news is crucial to fight misinformation and ensure reliable information, especially in resource-scarce languages like Malayalam. To recognize the unique challenges of fake news in languages like Malayalam, we present a dataset curated specifically for classifying fake news in Malayalam. This fake news is categorized based on the degree of misinformation, marking the first of its kind in this language. Further, we propose baseline models employing multilingual BERT and diverse machine learning classifiers. Our findings indicate that logistic regression trained on LaBSE features demonstrates promising initial performance with an F1 score of 0.3393. However, addressing the significant data imbalance remains essential for further improvement in model accuracy.
2024.dravidianlangtech-1.3
k-etal-2024-dataset
+
Social Media Fake News Classification Using Machine Learning Algorithm
@@ -65,6 +68,7 @@
The rise of social media has facilitated easier communication, information sharing, and current affairs updates. However, the prevalence of misleading and deceptive content, commonly referred to as fake news, poses a significant challenge. This paper focuses on the classification of fake news in Malayalam, a Dravidian language, utilizing natural language processing (NLP) techniques. To develop a model, we employed a random forest machine learning method on a dataset provided by a shared task(DravidianLangTech@EACL 2024)1. When evaluated by the separate test dataset, our developed model achieved a 0.71 macro F1 measure.
2024.dravidianlangtech-1.4
bade-etal-2024-social
+
Exploring the impact of noise in low-resource ASR for Tamil
@@ -74,6 +78,7 @@
The use of deep learning algorithms has resulted in significant progress in automatic speech recognition (ASR). Robust high-accuracy ASR models typically require thousands or tens of thousands of hours of speech data, but even the strongest models tend to fail under noisy conditions. Unsurprisingly, the impact of noise on accuracy is more drastic in low-resource settings. In this paper, we investigate the impact of noise on ASR in a low-resource setting. We explore novel methods for developing noise-robust ASR models using a small dataset for Tamil, a widely-spoken but under-resourced Dravidian language. We add various noises to the audio data to determine the impact of different kinds of noise (e.g., punctuated vs. constant, man-made vs. natural). We also explore which data augmentation methods are better suited to handling different types of noise. Our results show that all noises, regardless of the type, had an impact on ASR performance, and that upgrading the architecture alone could not mitigate the impact of noise. SpecAugment, the most common data augmentation method, was not as helpful as raw data augmentation, in which noise is explicitly added to the training data. Raw data augmentation enhances ASR performance on both clean data and noise-mixed data.
2024.dravidianlangtech-1.5
lakshminarayanan-prudhommeaux-2024-exploring
+
SetFit: A Robust Approach for Offensive Content Detection in Tamil-English Code-Mixed Conversations Using Sentence Transfer Fine-tuning
@@ -86,6 +91,7 @@
Code-mixed languages are increasingly prevalent on social media and online platforms, presenting significant challenges in offensive content detection for natural language processing (NLP) systems. Our study explores how effectively the Sentence Transfer Fine-tuning (Set-Fit) method, combined with logistic regression, detects offensive content in a Tamil-English code-mixed dataset. We compare our model’s performance with five other NLP models: Multilingual BERT (mBERT), LSTM, BERT, IndicBERT, and Language-agnostic BERT Sentence Embeddings (LaBSE). Our model, SetFit, outperforms these models in accuracy, achieving an impressive 89.72%, significantly higher than other models. These results suggest the sentence transformer model’s substantial potential for detecting offensive content in codemixed languages. Our study provides valuable insights into the sentence transformer model’s ability to identify various types of offensive material in Tamil-English online conversations, paving the way for more advanced NLP systems tailored to code-mixed languages.
2024.dravidianlangtech-1.6
pannerselvam-etal-2024-setfit
+
Findings of the First Shared Task on Offensive Span Identification from Code-Mixed Kannada-English Comments
@@ -98,6 +104,7 @@
Effectively managing offensive content is crucial on social media platforms to encourage positive online interactions. However, addressing offensive contents in code-mixed Dravidian languages faces challenges, as current moderation methods focus on flagging entire comments rather than pinpointing specific offensive segments. This limitation stems from a lack of annotated data and accessible systems designed to identify offensive language sections. To address this, our shared task presents a dataset comprising Kannada-English code-mixed social comments, encompassing offensive comments. This paper outlines the dataset, the utilized algorithms, and the results obtained by systems participating in this shared task.
2024.dravidianlangtech-1.7
ravikiran-etal-2024-findings
+
Findings of the Shared Task on Hate and Offensive Language Detection in Telugu Codemixed Text (HOLD-Telugu)@DravidianLangTech 2024
@@ -112,6 +119,7 @@
This paper examines the submissions of various participating teams to the task on Hate and Offensive Language Detection in Telugu Codemixed Text (HOLD-Telugu) organized as part of DravidianLangTech 2024. In order to identify the contents containing harmful information in Telugu codemixed social media text, the shared task pushes researchers and academicians to build models. The dataset for the task was created by gathering YouTube comments and annotated manually. A total of 23 teams participated and submitted their results to the shared task. The rank list was created by assessing the submitted results using the macro F1-score.
2024.dravidianlangtech-1.8
b-etal-2024-findings
+
Findings of the Shared Task on Multimodal Social Media Data Analysis in Dravidian Languages (MSMDA-DL)@DravidianLangTech 2024
@@ -131,6 +139,7 @@
This paper presents the findings of the shared task on multimodal sentiment analysis, abusive language detection and hate speech detection in Dravidian languages. Through this shared task, researchers worldwide can submit models for three crucial social media data analysis challenges in Dravidian languages: sentiment analysis, abusive language detection, and hate speech detection. The aim is to build models for deriving fine-grained sentiment analysis from multimodal data in Tamil and Malayalam, identifying abusive and hate content from multimodal data in Tamil. Three modalities make up the multimodal data: text, audio, and video. YouTube videos were gathered to create the datasets for the tasks. Thirty-nine teams took part in the competition; however, only two teams turned in their findings. The macro F1-score was used to assess the submissions.
2024.dravidianlangtech-1.9
b-etal-2024-findings-shared
+
Overview of Second Shared Task on Sentiment Analysis in Code-mixed Tamil and Tulu
@@ -148,6 +157,7 @@
Sentiment Analysis (SA) in Dravidian code-mixed text is a hot research area right now. In this regard, the “Second Shared Task on SA in Code-mixed Tamil and Tulu” at DravidianLangTech (EACL-2024) is organized. Two tasks, namely SA in Tamil-English and Tulu-English code-mixed data, make up this shared task. In total, 64 teams registered for the shared task, out of which 19 and 17 systems were received for Tamil and Tulu, respectively. The performance of the systems submitted by the participants was evaluated based on the macro F1-score. The best method obtained macro F1-scores of 0.260 and 0.584 for code-mixed Tamil and Tulu texts, respectively.
2024.dravidianlangtech-1.10
sambath-kumar-etal-2024-overview
+
Overview of the Second Shared Task on Fake News Detection in Dravidian Languages: DravidianLangTech@EACL 2024
@@ -168,6 +178,7 @@
The rise of online social media has revolutionized communication, offering users a convenient way to share information and stay updated on current events. However, this surge in connectivity has also led to the proliferation of misinformation, commonly known as fake news. This misleading content, often disguised as legitimate news, poses a significant challenge as it can distort public perception and erode trust in reliable sources. This shared task consists of two subtasks such as task 1 and task 2. Task 1 aims to classify a given social media text into original or fake. The goal of the FakeDetect-Malayalam task2 is to encourage participants to develop effective models capable of accurately detecting and classifying fake news articles in the Malayalam language into different categories like False, Half True, Mostly False, Partly False, and Mostly True. For this shared task, 33 participants submitted their results.
2024.dravidianlangtech-1.11
subramanian-etal-2024-overview
+
byteSizedLLM@DravidianLangTech 2024: Fake News Detection in Dravidian Languages - Unleashing the Power of Custom Subword Tokenization with Subword2Vec and BiLSTM
@@ -177,6 +188,7 @@
This paper focuses on detecting fake news in resource-constrained languages, particularly Malayalam. We present a novel framework combining subword tokenization, Sanskrit-transliterated Subword2vec embeddings, and a powerful Bidirectional Long Short-Term Memory (BiLSTM) architecture. Despite using only monolingual Malayalam data, our model excelled in the FakeDetect-Malayalam challenge, ranking 4th. The innovative subword tokenizer achieves a remarkable 200x compression ratio, highlighting its efficiency in minimizing model size without compromising accuracy. Our work facilitates resource-efficient deployment in diverse linguistic landscapes and sparks discussion on the potential of multilingual data augmentation. This research provides a promising avenue for mitigating linguistic challenges in the NLP-driven battle against deceptive content.
2024.dravidianlangtech-1.12
kodali-manukonda-2024-bytesizedllm
+
Fida @DravidianLangTech 2024: A Novel Approach to Hate Speech Detection Using Distilbert-base-multilingual-cased
@@ -190,6 +202,7 @@
In the contemporary digital landscape, social media has emerged as a prominent means of communication and information dissemination, offering a rapid outreach to a broad audience compared to traditional communication methods. Unfortunately, the escalating prevalence of abusive language and hate speech on these platforms has become a pressing issue. Detecting and addressing such content on the Internet has garnered considerable attention due to the significant impact it has on individuals. The advent of deep learning has facilitated the use of pre-trained deep neural network models for text classification tasks. While these models demonstrate high performance, some exhibit a substantial number of parameters. In the DravidianLangTech@EACL 2024 task, we opted for the Distilbert-base-multilingual-cased model, an enhancement of the BERT model that effectively reduces the number of parameters without compromising performance. This model was selected based on its exceptional results in the task. Our system achieved a commendable Macro F1 score of 0.6369%.
2024.dravidianlangtech-1.13
ullah-etal-2024-fida
+
Selam@DravidianLangTech 2024:Identifying Hate Speech and Offensive Language
@@ -211,6 +224,7 @@
This study describes our team’s active participation in the Hate and Offensive Language Detection in Telugu Codemixed Text (HOLD-Telugu) shared task, which is an essential component of the DravidianLangTech@EACL 2024 workshop. The ultimate goal of this collaborative work is to push the bounds of hate speech recognition, especially tackling the issues posed by code-mixed text in Telugu, where English blends smoothly. Our inquiry offers a complete evaluation of the task’s aims, the technique used, and the precise achievements obtained by our team, providing a full insight into our contributions to this crucial linguistic and technical undertaking.
2024.dravidianlangtech-1.15
achamaleh-etal-2024-tewodros
+
Lidoma@DravidianLangTech 2024: Identifying Hate Speech in Telugu Code-Mixed: A BERT Multilingual
@@ -223,6 +237,7 @@
Over the past few years, research on hate speech and offensive content identification on social media has been ongoing. Since most people in the world are not native English speakers, unapproved messages are typically sent in code-mixed language. We accomplished collaborative work to identify the language of code-mixed text on social media in order to address the difficulties associated with it in the Telugu language scenario. Specifically, we participated in the shared task on the dataset provided by the DravidianLangTech organizers for the purpose of identifying hate and non-hate content. The assignment is to classify each sentence in the provided text into two predetermined groups: hate or non-hate. We developed a model in Python and selected a multilingual BERT model to do the given task. Using a train-development data set, we developed a model, which we then tested on test data sets. An average macro F1 score metric was used to measure the model’s performance. For the task, the model reported an average macro F1 of 0.6151.
2024.dravidianlangtech-1.16
zamir-etal-2024-lidoma
+
Zavira@DravidianLangTech 2024:Telugu hate speech detection using LSTM
@@ -234,6 +249,7 @@
Hate speech is communication, often oral or written, that stigmatizes or incites violence or prejudice against individuals or groups based on characteristics such as race, religion, ethnicity, gender, sexual orientation, or other protected characteristics. This usually involves expressions of hostility, contempt, or prejudice and can have harmful social consequences. Within the broader social landscape, an important problem and challenge facing the medical community is related to the impact of people’s verbal expression. These words have a significant and immediate effect on human behavior and psyche. Repeating such phrases can even lead to depression and social isolation. In an attempt to identify and classify these Telugu text samples in the social media domain, our research used an LSTM model; the findings of this experiment are summarized in this paper. Out of 27 participants, we obtained 8th place with an F1 score of 0.68.
2024.dravidianlangtech-1.17
ahani-etal-2024-zavira
+
Tayyab@DravidianLangTech 2024:Detecting Fake News in Malayalam LSTM Approach and Challenges
@@ -246,6 +262,7 @@
Global communication has been made easier by the emergence of online social media, but it has also made it easier for “fake news,” or information that is misleading or false, to spread. Since this phenomenon presents a significant challenge, reliable detection techniques are required to discern between authentic and fraudulent content. The primary goal of this study is to identify fake news on social media platforms and in Malayalam-language articles by using an LSTM (Long Short-Term Memory) model. This research explores this approach in tackling the DravidianLangTech@EACL 2024 tasks. Using LSTM networks to differentiate between real and fake content at the comment or post level, Task 1 focuses on classifying social media text. To precisely classify the authenticity of the content, LSTM models are employed, drawing on a variety of sources such as comments on YouTube. Task 2 is dubbed the FakeDetect-Malayalam challenge, wherein Malayalam-language articles with fake news are identified and categorized using LSTM models. In order to successfully navigate the challenges of identifying false information in regional languages, we use an LSTM model. This algorithm seeks to accurately categorize the multiple classes written in Malayalam. In Task 1, the results are encouraging: LSTM models distinguish between original and fake social media content with an impressive macro F1 score of 0.78 when testing. The LSTM model’s macro F1 score of 0.2393 indicates that Task 2 offers a more complex landscape. This emphasizes the persistent difficulties in LSTM-based fake news detection across various linguistic contexts and the difficulty of correctly classifying fake news within the context of the Malayalam language.
2024.dravidianlangtech-1.18
zamir-etal-2024-tayyab
+
IIITDWD_SVC@DravidianLangTech-2024: Breaking Language Barriers; Hate Speech Detection in Telugu-English Code-Mixed Text
@@ -257,6 +274,7 @@
Social media platforms have become increasingly popular and are utilized for a wide range of purposes, including product promotion, news sharing, accomplishment sharing, and much more. However, it is also employed for defamatory speech, intimidation, and the propagation of untruths about particular groups of people. Further, hateful and offensive posts spread quickly and often have a negative impact on people; it is important to identify and remove them from social media platforms as soon as possible. Over the past few years, research on hate speech detection and offensive content has grown in popularity. One of the many difficulties in identifying hate speech on social media platforms is the use of code-mixed language. The majority of people who use social media typically share their messages in languages with mixed codes, like Telugu–English. To encourage research in this direction, the organizers of DravidianLangTech@EACL-2024 conducted a shared task to identify hateful content in Telugu-English code-mixed text. Our team participated in this shared task, employing three different models: Xlm-Roberta, BERT, and Hate-BERT. In particular, our BERT-based model secured the 14th rank in the competition with a macro F1 score of 0.65.
2024.dravidianlangtech-1.19
sai-etal-2024-iiitdwd
+
Beyond Tech@DravidianLangTech2024 : Fake News Detection in Dravidian Languages Using Machine Learning
@@ -269,6 +287,7 @@
In the digital age, identifying fake news is essential when fake information travels quickly via social media platforms. This project employs machine learning techniques, including Random Forest, Logistic Regression, and Decision Tree, to distinguish between real and fake news. With the rise of news consumption on social media, it becomes essential to authenticate information shared on platforms like YouTube comments. The research emphasizes the need to stop spreading harmful rumors and focuses on authenticating news articles. The proposed model utilizes machine learning and natural language processing, specifically Support Vector Machines, to aggregate and determine the authenticity of news. To address the challenges of detecting fake news, in this paper we describe the Machine Learning (ML) models submitted to the “Fake News Detection in Dravidian Languages” shared task at DravidianLangTech@EACL 2024. We used four different models, namely Naive Bayes, Support Vector Machine (SVM), Random Forest, and Decision Tree.
2024.dravidianlangtech-1.20
shanmugavadivel-etal-2024-beyond
+
Code_Makers@DravidianLangTech-EACL 2024 : Sentiment Analysis in Code-Mixed Tamil using Machine Learning Techniques
@@ -280,6 +299,7 @@
The rising importance of sentiment analysis in online community research is addressed in our project, which focuses on the surge of code-mixed writing in multilingual social media. Targeting sentiments in texts combining Tamil and English, our supervised learning approach, particularly the Decision Tree algorithm, proves essential for effective sentiment classification. Notably, Decision Tree (accuracy: 0.99, average F1 score: 0.39) and Random Forest (accuracy: 0.99, macro average F1 score: 0.35) exhibit high accuracy, while SVM (accuracy: 0.78, macro average F1 score: 0.68), Logistic Regression (accuracy: 0.75, macro average F1 score: 0.62), and KNN (accuracy: 0.73, macro average F1 score: 0.26) also demonstrate commendable results. These findings showcase the project’s efficacy, offering promise for linguistic research and technological advancements. Securing the 8th rank emphasizes its recognition in the field.
2024.dravidianlangtech-1.21
shanmugavadivel-etal-2024-code
+
IIITDWD-zk@DravidianLangTech-2024: Leveraging the Power of Language Models for Hate Speech Detection in Telugu-English Code-Mixed Text
@@ -291,6 +311,7 @@
Hateful online content is a growing concern, especially for young people. While social media platforms aim to connect us, they can also become breeding grounds for negativity and harmful language. This study tackles this issue by proposing a novel framework called HOLD-Z, specifically designed to detect hate and offensive comments in Telugu-English code-mixed social media content. HOLD-Z leverages a combination of approaches, including three powerful models: LSTM architecture, Zypher, and openchat_3.5. The study highlights the effectiveness of prompt engineering and Quantized Low-Rank Adaptation (QLoRA) in boosting performance. Notably, HOLD-Z secured the 9th place in the prestigious HOLD-Telugu DravidianLangTech@EACL-2024 shared task, showcasing its potential for tackling the complexities of hate and offensive comment classification.
2024.dravidianlangtech-1.22
shaik-etal-2024-iiitdwd
+
DLRG-DravidianLangTech@EACL2024 : Combating Hate Speech in Telugu Code-mixed Text on Social Media
@@ -324,6 +345,7 @@
In recent years, there has been a persistent focus on developing systems that can automatically identify the hate speech content circulating on diverse social media platforms. This paper describes the team Transformers’ submission to the Caste/Immigration Hate Speech Detection in Tamil shared task by LT-EDI 2024 workshop at EACL 2024. We used an ensemble approach in the shared task, combining various transformer-based pre-trained models using majority voting. The best macro average F1-score achieved was 0.82. We secured the 1st rank in the Caste/Immigration Hate Speech in Tamil shared task.
2024.dravidianlangtech-1.25
singhal-bedi-2024-transformers-dravidianlangtech
+
Habesha@DravidianLangTech 2024: Detecting Fake News Detection in Dravidian Languages using Deep Learning
@@ -335,6 +357,7 @@
This research tackles the issue of fake news by utilizing the RNN-LSTM deep learning method with optimized hyperparameters identified through grid search. The model’s performance in multi-label classification is hindered by unbalanced data, despite its success in binary classification. We achieved a score of 0.82 in the binary classification task, whereas in the multi-class task, the score was 0.32. We suggest incorporating data balancing techniques for researchers who aim to further this task, aiming to improve results in managing a variety of information.
2024.dravidianlangtech-1.26
yigezu-etal-2024-habesha
+
WordWizards@DravidianLangTech 2024:Fake News Detection in Dravidian Languages using Cross-lingual Sentence Embeddings
@@ -347,6 +370,7 @@
The proliferation of fake news in digital media has become a significant societal concern, impacting public opinion, trust, and decision-making. This project focuses on the development of machine learning models for the detection of fake news. Leveraging a dataset containing both genuine and deceptive news articles, the proposed models employ natural language processing techniques, feature extraction and classification algorithms. This paper provides a solution to Fake News Detection in Dravidian Languages - DravidianLangTech 2024. There are two subtasks: the goal of Task 1 is to classify a given social media text as original or fake. We propose an approach for this with the help of a supervised machine learning model – SVM (Support Vector Machine). The SVM classifier achieved a macro F1 score of 0.78 on the test data, ranking 11th. Task 2 is classifying fake news articles in the Malayalam language into different categories, namely False, Half True, Mostly False, Partly False and Mostly True. We used Naive Bayes, which achieved a macro F1-score of 0.3517 on the test data, ranking 6th.
2024.dravidianlangtech-1.27
anbalagan-etal-2024-wordwizards
+
Sandalphon@DravidianLangTech-EACL2024: Hate and Offensive Language Detection in Telugu Code-mixed Text using Transliteration-Augmentation
@@ -359,6 +383,7 @@
Hate and offensive language in online platforms pose significant challenges, necessitating automatic detection methods. Particularly in the case of codemixed text, which is very common in social media, the complexity of this problem increases due to the cultural nuances of different languages. DravidianLangTech-EACL2024 organized a shared task on detecting hate and offensive language for Telugu. To complete this task, this study investigates the effectiveness of transliteration-augmented datasets for Telugu code-mixed text. In this work, we compare the performance of various machine learning (ML), deep learning (DL), and transformer-based models on both original and augmented datasets. Experimental findings demonstrate the superiority of transformer models, particularly Telugu-BERT, achieving the highest f_1-score of 0.77 on the augmented dataset, ranking the 1^{st} position in the leaderboard. The study highlights the potential of transliteration-augmented datasets in improving model performance and suggests further exploration of diverse transliteration options to address real-world scenarios.
2024.dravidianlangtech-1.28
tabassum-etal-2024-sandalphon
+
CUET_Binary_Hackers@DravidianLangTech EACL2024: Fake News Detection in Malayalam Language Leveraging Fine-tuned MuRIL BERT
@@ -374,6 +399,7 @@
Due to technological advancements, various methods have emerged for disseminating news to the masses. The pervasive reach of news, however, has given rise to a significant concern: the proliferation of fake news. In response to this challenge, a shared task in DravidianLangTech EACL2024 was initiated to detect fake news and classify its types in the Malayalam language. The shared task consisted of two sub-tasks. Task 1 focused on a binary classification problem, determining whether a piece of news is fake or not, whereas Task 2 delved into a multi-class classification problem, categorizing news into five distinct levels. Our approach involved the exploration of various machine learning (RF, SVM, XGBoost, Ensemble), deep learning (BiLSTM, CNN), and transformer-based models (MuRIL, Indic-SBERT, m-BERT, XLM-R, Distil-BERT), emphasizing parameter tuning to enhance overall model performance. As a result, we introduce a fine-tuned MuRIL model that leverages parameter tuning, achieving notable success with an F1-score of 0.86 in task 1 and 0.5191 in task 2. This successful implementation led to our system securing the 3rd position in task 1 and the 1st position in task 2. The source code can be found in the GitHub repository at this link: https://github.com/Salman1804102/DravidianLangTech-EACL-2024-FakeNews.
2024.dravidianlangtech-1.29
farsi-etal-2024-cuet-binary
+
Punny_Punctuators@DravidianLangTech-EACL2024: Transformer-based Approach for Detection and Classification of Fake News in Malayalam Social Media Text
@@ -387,6 +413,7 @@
The alarming rise of fake news on social media poses a significant threat to public discourse and decision-making. While automatic detection of fake news offers a promising solution, research in low-resource languages like Malayalam often falls behind due to limited data and tools. This paper presents the participation of team Punny_Punctuators in the Fake News Detection in Dravidian Languages shared task at DravidianLangTech@EACL 2024, addressing this gap. The shared task focuses on two sub-tasks: 1. classifying social media texts as original or fake, and 2. categorizing fake news into 5 categories. We experimented with various machine learning (ML), deep learning (DL) and transformer-based models as well as processing techniques such as transliteration. Malayalam-BERT achieved the best performance on both sub-tasks, earning us 2^{nd} place with a macro f_1-score of 0.87 for subtask-1 and 11^{th} place with a macro f_1-score of 0.17 for subtask-2. Our results highlight the potential of transformer models for fake news detection in low-resource languages and pave the way for further research in this crucial area.
2024.dravidianlangtech-1.30
tabassum-etal-2024-punny
+
CUET_NLP_GoodFellows@DravidianLangTech EACL2024: A Transformer-Based Approach for Detecting Fake News in Dravidian Languages
@@ -400,6 +427,7 @@
In this modern era, many people have been using Facebook and Twitter, leading to increased information sharing and communication. However, a considerable amount of information on these platforms is misleading or intentionally crafted to deceive users, which is often termed fake news. A shared task on fake news detection in Malayalam organized by DravidianLangTech@EACL 2024 allowed us to address the challenge of distinguishing between original and fake news content in the Malayalam language. Our approach involves creating an intelligent framework to categorize text as either fake or original. We experimented with various machine learning models, including Logistic Regression, Decision Tree, Random Forest, Multinomial Naive Bayes, SVM, and SGD, and various deep learning models, including CNN, BiLSTM, and BiLSTM + Attention. We also explored Indic-BERT, MuRIL, XLM-R, and m-BERT for transformer-based approaches. Notably, our most successful model, m-BERT, achieved a macro F1 score of 0.85 and ranked 4th in the shared task. This research contributes to combating misinformation on social media news, offering an effective solution to classify content accurately.
2024.dravidianlangtech-1.31
osama-etal-2024-cuet
+
CUET_Binary_Hackers@DravidianLangTech EACL2024: Hate and Offensive Language Detection in Telugu Code-Mixed Text Using Sentence Similarity BERT
@@ -413,6 +441,7 @@
With the continuous evolution of technology and widespread internet access, various social media platforms have gained immense popularity, attracting a vast number of active users globally. However, this surge in online activity has also led to a concerning trend, with many individuals resorting to posting hateful and offensive comments or posts that publicly target groups or individuals. In response to these challenges, we participated in this shared task. Our approach involved proposing a fine-tuning-based pre-trained transformer model to effectively discern whether a given text contains offensive content that propagates hatred. We conducted comprehensive experiments, exploring various machine learning (LR, SVM, and Ensemble), deep learning (CNN, BiLSTM, CNN+BiLSTM), and transformer-based models (Indic-SBERT, m-BERT, MuRIL, Distil-BERT, XLM-R), adhering to a meticulous fine-tuning methodology. Among the models evaluated, our fine-tuned L3Cube-Indic-Sentence-Similarity-BERT (Indic-SBERT) model demonstrated superior performance, achieving a macro-average F1-score of 0.7013. This notable result positioned us at 6th place in the task. The implementation details of the task can be found in the GitHub repository.
2024.dravidianlangtech-1.32
farsi-etal-2024-cuet-binary-hackers
+
TechWhiz@DravidianLangTech 2024: Fake News Detection Using Deep Learning Models
@@ -424,6 +453,7 @@
The ever-evolving landscape of online social media has initiated a transformative phase in communication, presenting unprecedented opportunities alongside inherent challenges. The pervasive issue of false information, commonly termed fake news, has emerged as a significant concern within these dynamic platforms. This study delves into the domain of Fake News Detection, with a specific focus on Malayalam. Utilizing advanced transformer models like mBERT, ALBERT, and XLM-RoBERTa, our research proficiently classifies social media text into original or fake categories. Notably, our proposed model achieved commendable results, securing a rank of 3 in Task 1 with macro F1 scores of 0.84 using mBERT, 0.56 using ALBERT, and 0.84 using XLM-RoBERTa. In Task 2, the XLM-RoBERTa model achieved a rank of 12, attaining a macro F1 score of 0.21, while mBERT and BERT achieved scores of 0.16 and 0.11, respectively. This research aims to develop robust systems capable of discerning authentic from deceptive content, a crucial endeavor in maintaining information reliability on social media platforms amid the rampant spread of misinformation.
2024.dravidianlangtech-1.33
m-etal-2024-techwhiz
+
CUET_Binary_Hackers@DravidianLangTech-EACL 2024: Sentiment Analysis using Transformer-Based Models in Code-Mixed and Transliterated Tamil and Tulu
@@ -437,6 +467,7 @@
Textual Sentiment Analysis (TSA) delves into people’s opinions, intuitions, and emotions regarding any entity. Natural Language Processing (NLP) serves as a technique to extract subjective knowledge, determining whether an idea or comment leans positive, negative, neutral, or a mix thereof toward an entity. In recent years, it has garnered substantial attention from NLP researchers due to the vast availability of online comments and opinions. Despite extensive studies in this domain, sentiment analysis in low-resourced languages such as Tamil and Tulu still struggles to handle code-mixed and transliterated content. To address these challenges, this work focuses on sentiment analysis of code-mixed and transliterated Tamil and Tulu social media comments. It explored four machine learning (ML) approaches (LR, SVM, XGBoost, Ensemble), four deep learning (DL) methods (BiLSTM and CNN with FastText and Word2Vec), and four transformer-based models (m-BERT, MuRIL, L3Cube-IndicSBERT, and Distilm-BERT) for both languages. For Tamil, L3Cube-IndicSBERT and ensemble approaches outperformed others, while m-BERT demonstrated superior performance among the models for Tulu. The presented models achieved the 3^{rd} and 1^{st} ranks by attaining macro F1-scores of 0.227 and 0.584 in Tamil and Tulu, respectively.
2024.dravidianlangtech-1.34
eusha-etal-2024-cuet
+
Binary_Beasts@DravidianLangTech-EACL 2024: Multimodal Abusive Language Detection in Tamil based on Integrated Approach of Machine Learning and Deep Learning Techniques
@@ -451,6 +482,7 @@
Detecting abusive language on social media is a challenging task that needs to be solved effectively. This research addresses the formidable challenge of detecting abusive language in Tamil through a comprehensive multimodal approach, incorporating textual, acoustic, and visual inputs. This study utilized ConvLSTM, 3D-CNN, and a hybrid 3D-CNN with BiLSTM to extract video features. Several models, such as BiLSTM, LR, and CNN, are explored for processing audio data, whereas for textual content, MNB, LR, and LSTM methods are explored. To further enhance overall performance, this work introduced a weighted late fusion model amalgamating predictions from all modalities. The fusion model was then applied to make predictions on the test dataset. The ConvLSTM+BiLSTM+MNB model yielded the highest macro F1 score of 71.43%. Our methodology allowed us to achieve 1st rank for multimodal abusive language detection in the shared task.
2024.dravidianlangtech-1.35
rahman-etal-2024-binary
+
WordWizards@DravidianLangTech 2024: Sentiment Analysis in Tamil and Tulu using Sentence Embedding
@@ -463,6 +495,7 @@
Sentiment Analysis of Dravidian Languages has begun to garner attention recently, as there is a growing need to analyze the emotional responses and subjective opinions present in social media text. As this data is code-mixed and few existing solutions handle code-mixed text, we present our solution to the DravidianLangTech 2024: Sentiment Analysis in Tamil and Tulu task. To understand the sentiment of social media text, we used pre-trained transformer models and feature extraction vectorizers to classify the data, with results that placed us 11th in the rankings for the Tamil task and 8th for the Tulu task, with F1 scores of 0.12 and 0.30, respectively, which demonstrates the efficiency of our approach.
2024.dravidianlangtech-1.36
balaji-etal-2024-wordwizards
+
CUET_DUO@DravidianLangTech EACL2024: Fake News Classification Using Malayalam-BERT
@@ -477,6 +510,7 @@
Distinguishing between fake and original news on social media demands vigilant procedures. This paper introduces the significant shared task on ‘Fake News Detection in Dravidian Languages - DravidianLangTech@EACL 2024’. With a focus on the Malayalam language, this task is crucial in identifying social media posts as either fake or original news. The participating teams contribute immensely to this task through their varied strategies, employing methods ranging from conventional machine-learning techniques to advanced transformer-based models. Notably, the findings of this work highlight the effectiveness of the Malayalam-BERT model, demonstrating an impressive macro F1 score of 0.88 in distinguishing between fake and original news in Malayalam social media content, achieving a commendable rank of 1st among the participants.
2024.dravidianlangtech-1.37
rahman-etal-2024-cuet
+
Wit Hub@DravidianLangTech-2024: Multimodal Social Media Data Analysis in Dravidian Languages using Machine Learning Models
@@ -489,6 +523,7 @@
The main objective of the task is divided into three subtasks. Subtask-1: build models to determine the sentiment expressed in multimodal posts (or videos) in the Tamil and Malayalam languages, leveraging textual, audio, and visual components. The videos are labelled into five categories: highly positive, positive, neutral, negative, and highly negative. Subtask-2: design machine learning models that effectively identify and classify abusive language within the multimodal context of social media posts in Tamil. The data are categorized into abusive and non-abusive classes. Subtask-3: develop advanced models that accurately detect and categorize hate speech and offensive language in multimodal social media posts in Dravidian languages. The data points are categorized into Caste, Offensive, Racist, and Sexist classes. In this work, the focus is primarily on Tamil language text data analysis. Various combinations of machine learning models were used to perform each task, and oversampling techniques were applied to train the models on the imbalanced dataset.
2024.dravidianlangtech-1.38
s-etal-2024-wit
+
CUETSentimentSillies@DravidianLangTech-EACL2024: Transformer-based Approach for Sentiment Analysis in Tamil and Tulu Code-Mixed Texts
@@ -503,6 +538,7 @@
Sentiment analysis (SA) on social media reviews has become a challenging research agenda in recent years due to the exponential growth of textual content. Although several effective solutions are available for SA in high-resourced languages, it is considered a critical problem for low-resourced languages. This work introduces an automatic system for analyzing sentiment in Tamil and Tulu code-mixed languages. Several ML (DT, RF, MNB), DL (CNN, BiLSTM, CNN+BiLSTM), and transformer-based models (Indic-BERT, XLM-RoBERTa, m-BERT) are investigated for SA tasks using Tamil and Tulu code-mixed textual data. Experimental outcomes reveal that the transformer-based models XLM-R and m-BERT surpassed others in performance for Tamil and Tulu, respectively. The proposed XLM-R and m-BERT models attained macro F1-scores of 0.258 (Tamil) and 0.468 (Tulu) on test datasets, securing the 2^{nd} and 5^{th} positions, respectively, in the shared task.
2024.dravidianlangtech-1.39
tripty-etal-2024-cuetsentimentsillies
+
Social Media Hate and Offensive Speech Detection Using Machine Learning method
@@ -514,6 +550,7 @@
Even though the improper use of social media is increasing nowadays, technology also offers solutions. Here, improper use means posting hate and offensive speech that might harm an individual or group. Hate speech refers to an insult toward an individual or group based on their identity. Spreading it on social media platforms is a serious problem for society. The solution, on the other hand, is the availability of natural language processing (NLP) technology that is capable of detecting and handling such problems. This paper presents the detection of hate and offensive speech on social media in the code-mixed Telugu language. For this, the task and gold-standard dataset were provided by the shared task organizer (DravidianLangTech@EACL 2024). To this end, we employed the TF-IDF technique for numeric feature extraction and used a random forest algorithm to model hate speech detection. Finally, the developed model was evaluated on the test dataset and achieved a macro F1 of 0.492.
2024.dravidianlangtech-1.40
bade-etal-2024-social-media
+
CUETSentimentSillies@DravidianLangTech EACL2024: Transformer-based Approach for Detecting and Categorizing Fake News in Malayalam Language
@@ -527,6 +564,7 @@
Fake news misleads people and may lead to real-world miscommunication and injury. Removing misinformation encourages critical thinking, democracy, and the prevention of hatred, fear, and misunderstanding. Identifying and removing fake news and developing a detection system is essential for reliable, accurate, and clear information. Therefore, a shared task was organized to detect fake news in Malayalam. This paper presents a system developed for the shared task of detecting and classifying fake news in Malayalam. The approach involves a combination of machine learning models (LR, DT, RF, MNB), deep learning models (CNN, BiLSTM, CNN+BiLSTM), and transformer-based models (Indic-BERT, XLMR, Malayalam-BERT, m-BERT) for both subtasks. The experimental results demonstrate that transformer-based models, specifically m-BERT and Malayalam-BERT, outperformed others. The m-BERT model achieved superior performance in subtask 1 with a macro F1-score of 0.84, and Malayalam-BERT outperformed the other models in subtask 2 with a macro F1-score of 0.496, securing us the 5th and 2nd positions in subtask 1 and subtask 2, respectively.
2024.dravidianlangtech-1.41
tripty-etal-2024-cuetsentimentsillies-dravidianlangtech
+
MUCS@DravidianLangTech-2024: Role of Learning Approaches in Strengthening Hate-Alert Systems for code-mixed text
@@ -553,6 +591,7 @@
Sentiment Analysis (SA) is a field of computational study that analyzes and understands people’s opinions, attitudes, and emotions toward any entity. A review of an entity can be written about an individual, an event, a topic, a product, etc., and such reviews are abundant on social media platforms. The increasing number of social media users and the growing amount of user-generated code-mixed content such as reviews, comments, and posts on social media have resulted in a rising demand for efficient tools capable of effectively analyzing such content to detect the sentiments. In spite of this, SA of social media text is challenging because the code-mixed text is complex. To address SA in code-mixed Tamil and Tulu text, this paper describes the Machine Learning (ML) models submitted by our team - MUCS to “Sentiment Analysis in Tamil and Tulu - DravidianLangTech” - a shared task organized at the European Chapter of the Association for Computational Linguistics (EACL) 2024. A Linear Support Vector Classifier (LinearSVC) and an ensemble of 5 ML classifiers (k-Nearest Neighbour (kNN), Stochastic Gradient Descent (SGD), Logistic Regression (LR), LinearSVC, and Random Forest Classifier (RFC)) with hard voting were trained using concatenated features obtained from word and character n-grams vectorized with a Term Frequency-Inverse Document Frequency (TF-IDF) vectorizer and a CountVectorizer. Further, a grid search algorithm was employed to obtain optimal hyperparameter values. The proposed ensemble model obtained macro F1 scores of 0.260 and 0.550 for the Tamil and Tulu languages, respectively.
2024.dravidianlangtech-1.43
b-etal-2024-mucs
+
InnovationEngineers@DravidianLangTech-EACL 2024: Sentimental Analysis of YouTube Comments in Tamil by using Machine Learning
@@ -564,6 +603,7 @@
There is opportunity for machine learning and natural language processing research because of the growing volume of textual data. Although there has been little research on trend extraction from YouTube comments, sentiment analysis is an intriguing problem because of the poor consistency and quality of the material found there. The purpose of this work is to use machine learning techniques and algorithms to perform sentiment analysis on YouTube comments pertaining to popular themes. The findings demonstrate that sentiment analysis is capable of giving a clear picture of how actual events affect public opinion. This study aims to make it easier for academics to find high-quality sentiment analysis research publications. Data normalisation methods are used to clean an annotated corpus of 1500 citation sentences for the study. For classification, a system utilising machine learning algorithms, namely K-Nearest Neighbour (KNN), Naïve Bayes, SVC (Support Vector Classifier), and Random Forest, is built. Metrics like the F1-score and accuracy are used to assess the correctness of the system.
2024.dravidianlangtech-1.44
shanmugavadivel-etal-2024-innovationengineers
+
KEC_HAWKS@DravidianLangTech 2024 : Detecting Malayalam Fake News using Machine Learning Models
@@ -576,6 +616,7 @@
The proliferation of fake news in the Malayalam language across digital platforms has emerged as a pressing issue. By employing Recurrent Neural Networks (RNNs), a type of machine learning model, we aim to distinguish between original and fake news in Malayalam, and we achieved 9th rank in Task 1. RNNs are chosen for their ability to understand the sequence of words in a sentence, which is important in languages like Malayalam. Our main goal is to develop better models that can spot fake news effectively. We analyze various features to understand what contributes most to this accuracy. By doing so, we hope to provide a reliable method for identifying and combating fake news in the Malayalam language.
2024.dravidianlangtech-1.45
subramanian-etal-2024-kec
+
diff --git a/data/xml/2024.eacl.xml b/data/xml/2024.eacl.xml
index 4d4545149a..88ce0cf81f 100644
--- a/data/xml/2024.eacl.xml
+++ b/data/xml/2024.eacl.xml
@@ -26,6 +26,7 @@
An increasing amount of research in Natural Language Inference (NLI) focuses on the application and evaluation of Large Language Models (LLMs) and their reasoning capabilities. Despite their success, however, LLMs are still prone to factual errors and inconsistencies in their explanations, offering limited control and interpretability for inference in complex domains. In this paper, we focus on ethical NLI, investigating how hybrid neuro-symbolic techniques can enhance the logical validity and alignment of ethical explanations produced by LLMs. Specifically, we present an abductive-deductive framework named Logic-Explainer, which integrates LLMs with an external backward-chaining solver to refine step-wise natural language explanations and jointly verify their correctness, reduce incompleteness and minimise redundancy. An extensive empirical analysis demonstrates that Logic-Explainer can improve explanations generated via in-context learning methods and Chain-of-Thought (CoT) on challenging ethical NLI tasks, while, at the same time, producing formal proofs describing and supporting models’ reasoning. As ethical NLI requires commonsense reasoning to identify underlying moral violations, our results suggest the effectiveness of neuro-symbolic methods for multi-step NLI more broadly, opening new opportunities to enhance the logical consistency, reliability, and alignment of LLMs.
2024.eacl-long.1
quan-etal-2024-enhancing
+
Multi-Relational Hyperbolic Word Embeddings from Natural Language Definitions
@@ -37,6 +38,7 @@
2024.eacl-long.2
2024.eacl-long.2.software.zip
valentino-etal-2024-multi
+
Anisotropy Is Inherent to Self-Attention in Transformers
@@ -47,6 +49,7 @@
The representation degeneration problem is a phenomenon that is widely observed among self-supervised learning methods based on Transformers. In NLP, it takes the form of anisotropy, a singular property of hidden representations which makes them unexpectedly close to each other in terms of angular distance (cosine-similarity). Some recent works tend to show that anisotropy is a consequence of optimizing the cross-entropy loss on long-tailed distributions of tokens. We show in this paper that anisotropy can also be observed empirically in language models with specific objectives that should not suffer directly from the same consequences. We also show that the anisotropy problem extends to Transformers trained on other modalities. Our observations tend to demonstrate that anisotropy might actually be inherent to Transformers-based models.
2024.eacl-long.3
godey-etal-2024-anisotropy
+
Generating Benchmarks for Factuality Evaluation of Language Models
@@ -66,6 +69,7 @@
2024.eacl-long.4.software.zip
2024.eacl-long.4.note.zip
muhlgay-etal-2024-generating
+
Leak, Cheat, Repeat: Data Contamination and Evaluation Malpractices in Closed-Source LLMs
@@ -78,6 +82,7 @@
2024.eacl-long.5
balloccu-etal-2024-leak
Best Non-publicized Paper Award
+
Archer: A Human-Labeled Text-to-SQL Dataset with Arithmetic, Commonsense and Hypothetical Reasoning
@@ -88,6 +93,7 @@
We present Archer, a challenging bilingual text-to-SQL dataset specific to complex reasoning, including arithmetic, commonsense and hypothetical reasoning. It contains 1,042 English questions and 1,042 Chinese questions, along with 521 unique SQL queries, covering 20 English databases across 20 domains. Notably, this dataset demonstrates a significantly higher level of complexity compared to existing publicly available datasets. Our evaluation shows that Archer challenges the capabilities of current state-of-the-art models, with a high-ranked model on the Spider leaderboard achieving only 6.73% execution accuracy on the Archer test set. Thus, Archer presents a significant challenge for future research in this field.
2024.eacl-long.6
zheng-etal-2024-archer
+
GEAR: Augmenting Language Models with Generalizable and Efficient Tool Resolution
@@ -98,6 +104,7 @@
Augmenting large language models (LLM) to use external tools enhances their performance across a variety of tasks. However, prior works over-rely on task-specific demonstrations of tool use, which limits their generalizability and incurs high computational cost due to making many calls to large-scale LLMs. We introduce GEAR, a computationally efficient query-tool grounding algorithm that is generalizable to various tasks that require tool use while not relying on task-specific demonstrations. GEAR achieves better efficiency by delegating tool grounding and execution to small language models (SLM) and LLM, respectively; while leveraging semantic and pattern-based evaluation at both question and answer levels for generalizable tool grounding. We evaluate GEAR on 14 datasets across 6 downstream tasks, demonstrating its strong generalizability to novel tasks, tools and different SLMs. Despite offering more efficiency, GEAR achieves higher precision in tool grounding compared to prior strategies using LLM prompting, thus improving downstream accuracy at a reduced computational cost. For example, we demonstrate that GEAR-augmented GPT-J and GPT-3 outperform counterpart tool-augmented baselines because of better tool use.
2024.eacl-long.7
lu-etal-2024-gear
+
LLM Comparative Assessment: Zero-shot NLG Evaluation through Pairwise Comparisons using Large Language Models
@@ -109,6 +116,7 @@
2024.eacl-long.8
2024.eacl-long.8.software.zip
liusie-etal-2024-llm
+
Parameter-Efficient Conversational Recommender System as a Language Processing Task
@@ -121,6 +129,7 @@
Conversational recommender systems (CRS) aim to recommend relevant items to users by eliciting user preference through natural language conversation. Prior work often utilizes external knowledge graphs for items’ semantic information, a language model for dialogue generation, and a recommendation module for ranking relevant items. This combination of multiple components suffers from a cumbersome training process, and leads to semantic misalignment issues between dialogue generation and item recommendation. In this paper, we represent items in natural language and formulate CRS as a natural language processing task. Accordingly, we leverage the power of pre-trained language models to encode items, understand user intent via conversation, perform item recommendation through semantic matching, and generate dialogues. As a unified model, our PECRS (Parameter-Efficient CRS), can be optimized in a single stage, without relying on non-textual metadata such as a knowledge graph. Experiments on two benchmark CRS datasets, ReDial and INSPIRED, demonstrate the effectiveness of PECRS on recommendation and conversation. Our code is available at: https://github.com/Ravoxsg/efficient_unified_crs.
2024.eacl-long.9
ravaut-etal-2024-parameter
+
OpenPI2.0: An Improved Dataset for Entity Tracking in Texts
@@ -135,6 +144,7 @@
2024.eacl-long.10.software.zip
2024.eacl-long.10.note.zip
zhang-etal-2024-openpi2
+
A Comparative Multidimensional Analysis of Empathetic Systems
@@ -146,6 +156,7 @@
Recently, empathetic dialogue systems have received significant attention. While some researchers have noted limitations, e.g., that these systems tend to generate generic utterances, no study has systematically verified these issues. We survey 21 systems, asking what progress has been made on the task. We observe multiple limitations of current evaluation procedures. Most critically, studies tend to rely on a single non-reproducible empathy score, which inadequately reflects the multidimensional nature of empathy. To better understand the differences between systems, we comprehensively analyze each system with automated methods that are grounded in a variety of aspects of empathy. We find that recent systems lack three important aspects of empathy: specificity, reflection levels, and diversity. Based on our results, we discuss problematic behaviors that may have gone undetected in prior evaluations, and offer guidance for developing future systems.
2024.eacl-long.11
lee-etal-2024-comparative
+
Few-Shot Data Synthesis for Open Domain Multi-Hop Question Answering
@@ -173,6 +184,7 @@
2024.eacl-long.13.software.zip
2024.eacl-long.13.note.zip
yang-etal-2024-language
+
SIB-200: A Simple, Inclusive, and Big Evaluation Dataset for Topic Classification in 200+ Languages and Dialects
@@ -190,6 +202,7 @@
2024.eacl-long.14.software.zip
2024.eacl-long.14.note.zip
adelani-etal-2024-sib
+
FinBPM: A Framework for Portfolio Management-based Financial Investor Behavior Perception Model
@@ -203,6 +216,7 @@
The goal of portfolio management is to simultaneously maximize the accumulated return and also to control risk. In consecutive trading periods, the portfolio manager needs to continuously adjust the portfolio weights based on the factors that can cause price fluctuations in the market. In the stock market, the factors affecting the stock price can be divided into two categories. The first is price fluctuations caused by irrational investment of the speculators. The second is endogenous value changes caused by operations of the company. In recent years, with the advancement of artificial intelligence technology, reinforcement learning (RL) algorithms have been increasingly employed by scholars to address financial problems, particularly in the area of portfolio management. However, the deep RL models proposed by these scholars in the past have focused more on analyzing the price changes caused by the investment behavior of speculators in response to technical indicators of actual stock prices. In this research, we introduce an RL-based framework called FinBPM, which takes both the factor pertaining to the impact on the operations of the company and the factor of the irrational investment of the speculators into consideration. For our experimentation, we randomly selected twelve stocks from the Dow Jones Industrial Index to construct our portfolio. The experimental results reveal that, in comparison to conventional reinforcement learning methods, our approach achieves an improvement of at least 13.26% over the compared methods. Additionally, it achieved the best Sharpe ratio of 2.77, effectively maximizing the return per unit of risk.
2024.eacl-long.15
zhang-etal-2024-finbpm
+
Asking the Right Question at the Right Time: Human and Model Uncertainty Guidance to Ask Clarification Questions
@@ -212,6 +226,7 @@
Clarification questions are an essential dialogue tool to signal misunderstanding, ambiguities, and under-specification in language use. While humans are able to resolve uncertainty by asking questions since childhood, modern dialogue systems struggle to generate effective questions. To make progress in this direction, in this work we take a collaborative dialogue task as a testbed and study how model uncertainty relates to human uncertainty—an as yet under-explored problem. We show that model uncertainty does not mirror human clarification-seeking behavior, which suggests that using human clarification questions as supervision for deciding when to ask may not be the most effective way to resolve model uncertainty. To address this issue, we propose an approach to generating clarification questions based on model uncertainty estimation, compare it to several alternatives, and show that it leads to significant improvements in terms of task success. Our findings highlight the importance of equipping dialogue systems with the ability to assess their own uncertainty and exploit it in interaction.
2024.eacl-long.16
testoni-fernandez-2024-asking
+
Like a Good Nearest Neighbor: Practical Content Moderation and Text Classification
@@ -223,6 +238,7 @@
2024.eacl-long.17.software.zip
2024.eacl-long.17.note.zip
bates-gurevych-2024-like
+
Zero-shot Sentiment Analysis in Low-Resource Languages Using a Multilingual Sentiment Lexicon
@@ -235,6 +251,7 @@
Improving the capabilities of multilingual language models in low-resource languages is generally difficult due to the scarcity of large-scale data in those languages. In this paper, we relax the reliance on texts in low-resource languages by using multilingual lexicons in pretraining to enhance multilingual capabilities. Specifically, we focus on zero-shot sentiment analysis tasks across 34 languages, including 6 high/medium-resource languages, 25 low-resource languages, and 3 code-switching datasets. We demonstrate that pretraining using multilingual lexicons, without using any sentence-level sentiment data, achieves superior zero-shot performance compared to models fine-tuned on English sentiment datasets, and large language models like GPT-3.5, BLOOMZ, and XGLM. These findings hold in settings ranging from unseen low-resource languages to code-mixed scenarios involving high-resource languages.
2024.eacl-long.18
koto-etal-2024-zero
+
CEAN: Contrastive Event Aggregation Network with LLM-based Augmentation for Event Extraction
@@ -247,6 +264,7 @@
Event Extraction is a crucial yet arduous task in natural language processing (NLP), as its performance is significantly hindered by laborious data annotation. Given this challenge, recent research has predominantly focused on two approaches: pretraining task-oriented models for event extraction and employing data augmentation techniques. These methods involve integrating external knowledge, semantic structures, or artificially generated samples using large language models (LLMs). However, their performance can be compromised due to two fundamental issues. Firstly, the alignment between the introduced knowledge and event extraction knowledge is crucial. Secondly, the introduction of data noise during the augmentation is unavoidable and can mislead the model’s convergence. To address these issues, we propose a Contrastive Event Aggregation Network with LLM-based Augmentation to promote low-resource learning and reduce data noise for event extraction. Different from the existing methods introducing linguistic knowledge into data augmentation, an event aggregation network is established to introduce event knowledge into supervised learning by constructing adaptively-updated semantic representations for triggers and arguments. For LLM-based augmentation, we design a new scheme including a multi-pattern rephrasing paradigm and a data-free composing paradigm. Instead of directly using augmentation samples in the supervised task, we introduce span-level contrastive learning to reduce data noise. Experiments on the ACE2005 and ERE-EN datasets demonstrate that our proposed approach achieves new state-of-the-art results on both datasets.
2024.eacl-long.19
meng-etal-2024-cean
+
How Transferable are Attribute Controllers on Pretrained Multilingual Translation Models?
@@ -257,6 +275,7 @@
2024.eacl-long.20
2024.eacl-long.20.software.zip
liu-niehues-2024-transferable
+
MultiMUC: Multilingual Template Filling on MUC-4
@@ -271,6 +290,7 @@
We introduce MultiMUC, the first multilingual parallel corpus for template filling, comprising translations of the classic MUC-4 template filling benchmark into five languages: Arabic, Chinese, Farsi, Korean, and Russian. We obtain automatic translations from a strong multilingual machine translation system and manually project the original English annotations into each target language. For all languages, we also provide human translations for key portions of the dev and test splits. Finally, we present baselines on MultiMUC both with state-of-the-art template filling models for MUC-4 and with ChatGPT. We release MUC-4 and the supervised baselines to facilitate further work on document-level information extraction in multilingual settings.
2024.eacl-long.21
gantt-etal-2024-multimuc
+
Align and Augment: Generative Data Augmentation for Compositional Generalization
@@ -281,6 +301,7 @@
Recent work on semantic parsing has shown that seq2seq models find compositional generalization challenging. Several strategies have been proposed to mitigate this challenge. One such strategy is to improve compositional generalization via data augmentation techniques. In this paper we follow this line of work and propose Archer, a data-augmentation strategy that exploits alignment annotations between sentences and their corresponding meaning representations. More precisely, we use alignments to train a two step generative model that combines monotonic lexical generation with reordering. Our experiments show that Archer leads to significant improvements in compositional generalization performance.
2024.eacl-long.22
cazzaro-etal-2024-align
+
UNSEE: Unsupervised Non-contrastive Sentence Embeddings
@@ -289,6 +310,7 @@
In this paper, we introduce UNSEE, which stands for Unsupervised Non-Contrastive Sentence Embeddings. UNSEE demonstrates better performance compared to SimCSE on the Massive Text Embedding Benchmark (MTEB). We begin by highlighting the issue of representation collapse that occurs with the replacement of contrastive objectives with non-contrastive objectives in SimCSE. Subsequently, we introduce a straightforward solution called the target network to mitigate this problem. This approach enables us to harness non-contrastive objectives while ensuring training stability and achieving performance improvements similar to those seen with contrastive objectives. We have reached peak performance in non-contrastive sentence embeddings through extensive fine-tuning and optimization. These efforts have resulted in superior sentence representation models, emphasizing the importance of careful tuning and optimization for non-contrastive objectives.
2024.eacl-long.23
cagatan-2024-unsee
+
EXPLORER: Exploration-guided Reasoning for Textual Reinforcement Learning
@@ -302,6 +324,7 @@
Text-based games (TBGs) have emerged as an important collection of NLP tasks, requiring reinforcement learning (RL) agents to combine natural language understanding with reasoning. A key challenge for agents attempting to solve such tasks is to generalize across multiple games and demonstrate good performance on both seen and unseen objects. Purely deep-RL-based approaches may perform well on seen objects; however, they fail to showcase the same performance on unseen objects. Commonsense-infused deep-RL agents may work better on unseen data; unfortunately, their policies are often not interpretable or easily transferable. To tackle these issues, in this paper, we present EXPLORER, an exploration-guided reasoning agent for textual reinforcement learning. EXPLORER is neuro-symbolic in nature, as it relies on a neural module for exploration and a symbolic module for exploitation. It can also learn generalized symbolic policies and perform well over unseen data. Our experiments show that EXPLORER outperforms the baseline agents on Text-World cooking (TW-Cooking) and Text-World Commonsense (TWC) games.
2024.eacl-long.24
basu-etal-2024-explorer
+
From Text Segmentation to Smart Chaptering: A Novel Benchmark for Structuring Video Transcriptions
@@ -311,6 +334,7 @@
Text segmentation is a fundamental task in natural language processing, where documents are split into contiguous sections. However, prior research in this area has been constrained by limited datasets, which are either small in scale, synthesized, or only contain well-structured documents. In this paper, we address these limitations by introducing YTSeg, a novel benchmark focusing on spoken content that is inherently more unstructured and both topically and structurally diverse. As part of this work, we introduce MiniSeg, an efficient hierarchical segmentation model that outperforms state-of-the-art baselines. Lastly, we expand the notion of text segmentation to a more practical “smart chaptering” task that involves the segmentation of unstructured content, the generation of meaningful segment titles, and a potential real-time application of the models.
2024.eacl-long.25
retkowski-waibel-2024-text
+
Fréchet Distance for Offline Evaluation of Information Retrieval Systems with Sparse Labels
@@ -320,6 +344,7 @@
The rapid advancement of natural language processing, information retrieval (IR), computer vision, and other technologies has presented significant challenges in evaluating the performance of these systems. One of the main challenges is the scarcity of human-labeled data, which hinders the fair and accurate assessment of these systems. In this work, we specifically focus on evaluating IR systems with sparse labels, borrowing from recent research on evaluating computer vision tasks and taking inspiration from the success of using Fréchet Inception Distance (FID) in assessing text-to-image generation systems. We propose leveraging the Fréchet Distance to measure the distance between the distributions of relevant judged items and retrieved results. Our experimental results on the MS MARCO V1 dataset and TREC Deep Learning Tracks query sets demonstrate the effectiveness of the Fréchet Distance as a metric for evaluating IR systems, particularly in settings where few labels are available. This approach contributes to the advancement of evaluation methodologies in real-world scenarios such as the assessment of generative IR systems.
2024.eacl-long.26
arabzadeh-clarke-2024-frechet
+
Semantic Sensitivities and Inconsistent Predictions: Measuring the Fragility of NLI Models
@@ -331,6 +356,7 @@
2024.eacl-long.27
arakelyan-etal-2024-semantic
Outstanding Paper Award
+
Exploring the Robustness of Task-oriented Dialogue Systems for Colloquial German Varieties
@@ -342,6 +368,7 @@
2024.eacl-long.28
2024.eacl-long.28.software.zip
artemova-etal-2024-exploring
+
PEARL: Prompting Large Language Models to Plan and Execute Actions Over Long Documents
@@ -355,6 +382,7 @@
Strategies such as chain-of-thought prompting improve the performance of large language models (LLMs) on complex reasoning tasks by decomposing input examples into intermediate steps. However, it remains unclear how to apply such methods to reason over long input documents, in which both the decomposition and the output of each intermediate step are non-trivial to obtain. In this work, we propose PEARL, a prompting framework to improve reasoning over long documents, which consists of three stages: action mining, plan formulation, and plan execution. More specifically, given a question about a long document, PEARL decomposes the question into a sequence of actions (e.g., SUMMARIZE, FIND_EVENT, FIND_RELATION) and then executes them over the document to obtain the answer. Each stage of PEARL is implemented via zero-shot or few-shot prompting of LLMs (in our work, GPT-4) with minimal human input. We evaluate PEARL on a challenging subset of the QuALITY dataset, which contains questions that require complex reasoning over long narrative texts. PEARL outperforms zero-shot and chain-of-thought prompting on this dataset, and ablation experiments show that each stage of PEARL is critical to its performance. Overall, PEARL is a first step towards leveraging LLMs to reason over long documents.
2024.eacl-long.29
sun-etal-2024-pearl
+
LAraBench: Benchmarking Arabic AI with Large Language Models
@@ -374,11 +402,14 @@
AhmedAliQatar Computing Research Institute
NadirDurraniQatar Computing Research Institute
NatasaMilic-FraylingQatar Computing Research Institute
- FirojAlamQatar Computing Research Institute
+ MajdHawasly
+ NadirDurrani
+ FirojAlam
487-520
Recent advancements in Large Language Models (LLMs) have significantly influenced the landscape of language and speech research. Despite this progress, these models lack specific benchmarking against state-of-the-art (SOTA) models tailored to particular languages and tasks. LAraBench addresses this gap for Arabic Natural Language Processing (NLP) and Speech Processing tasks, including sequence tagging and content classification across different domains. We utilized models such as GPT-3.5-turbo, GPT-4, BLOOMZ, Jais-13b-chat, Whisper, and USM, employing zero and few-shot learning techniques to tackle 33 distinct tasks across 61 publicly available datasets. This involved 98 experimental setups, encompassing ~296K data points, ~46 hours of speech, and 30 sentences for Text-to-Speech (TTS). This effort resulted in 330+ sets of experiments. Our analysis focused on measuring the performance gap between SOTA models and LLMs. The overarching trend observed was that SOTA models generally outperformed LLMs in zero-shot learning, with a few exceptions. Notably, larger computational models with few-shot learning techniques managed to reduce these performance gaps. Our findings provide valuable insights into the applicability of LLMs for Arabic NLP and speech processing tasks.
2024.eacl-long.30
abdelali-etal-2024-larabench
+
SentenceLDA: Discriminative and Robust Document Representation with Sentence Level Topic Model
@@ -389,6 +420,7 @@
2024.eacl-long.31
2024.eacl-long.31.software.zip
cha-lee-2024-sentencelda
+
Towards Hierarchical Spoken Language Disfluency Modeling
@@ -398,6 +430,7 @@
Speech dysfluency modeling is the bottleneck for both speech therapy and language learning. However, there is no AI solution to systematically tackle this problem. We first propose to define the concept of dysfluent speech and dysfluent speech modeling. We then present the Hierarchical Unconstrained Dysfluency Modeling (H-UDM) approach, which addresses both dysfluency transcription and detection to eliminate the need for extensive manual annotation. Furthermore, we introduce a simulated dysfluent dataset called VCTK++ to enhance the capabilities of H-UDM in phonetic transcription. Our experimental results demonstrate the effectiveness and robustness of our proposed methods in both transcription and detection tasks.
2024.eacl-long.32
lian-anumanchipalli-2024-towards
+
Finding a Needle in the Adversarial Haystack: A Targeted Paraphrasing Approach For Uncovering Edge Cases with Minimal Distribution Distortion
@@ -407,6 +440,7 @@
Adversarial attacks against Language models (LMs) are a significant concern. In particular, adversarial samples exploit the model’s sensitivity to small input changes. While these changes appear insignificant on the semantics of the input sample, they result in significant decay in model performance. In this paper, we propose Targeted Paraphrasing via RL (TPRL), an approach to automatically learn a policy to generate challenging samples that improve the model’s performance. TPRL leverages FLAN-T5, a language model, as a generator and employs a self-learned policy using a proximal policy optimization to generate the adversarial examples automatically. TPRL’s reward is based on the confusion induced in the classifier, preserving the original text meaning through a Mutual Implication score. We demonstrate & evaluate TPRL’s effectiveness in discovering natural adversarial attacks and improving model performance through extensive experiments on four diverse NLP classification tasks via Automatic & Human evaluation. TPRL outperforms strong baselines, exhibits generalizability across classifiers and datasets, and combines the strengths of language modeling and reinforcement learning to generate diverse and influential adversarial examples.
2024.eacl-long.33
kassem-saad-2024-finding
+
FAIR: Filtering of Automatically Induced Rules
@@ -415,9 +449,10 @@
ManjeshHanawalIndian Institute of Technology Bombay
GaneshRamakrishnanIndian Institute of Technology Bombay, Indian Institute of Technology Bombay
573-588
- Availability of large annotated data can be a critical bottleneck in training machine learning algorithms successfully, especially when applied to diverse domains. Weak supervision offers a promising alternative by accelerating the creation of labeled training data using domain-specific rules. However, it requires users to write a diverse set of high-quality rules to assign labels to the unlabeled data (eg., Snorkel (CITATION)). Automatic Rule Induction (ARI) approaches such as Snuba (CITATION) circumvent this problem by automatically creating rules from features on a small labeled set and filtering a final set of rules from them. In the ARI approach, the crucial step is to filter out a set of a high-quality useful subset of rules from the large set of automatically created rules. In this paper, we propose an algorithm FAIR (Filtering of Automatically Induced Rules) to filter rules from a large number of automatically induced rules using submodular objective functions that account for the collective precision, coverage, and conflicts of the rule set. We experiment with three ARI approaches and five text classification datasets to validate the superior performance of our algorithm with respect to several semi-supervised label aggregation approaches. We show that our approach achieves statistically significant results in comparison to existing rule-filtering approaches. The anonymized source code is available at https://anonymous.4open.science/r/FAIR-LF-Induction-9B60.
+ The availability of large annotated data can be a critical bottleneck in training machine learning algorithms successfully, especially when applied to diverse domains. Weak supervision offers a promising alternative by accelerating the creation of labeled training data using domain-specific rules. However, it requires users to write a diverse set of high-quality rules to assign labels to the unlabeled data. Automatic Rule Induction (ARI) approaches circumvent this problem by automatically creating rules from features on a small labeled set and filtering a final set of rules from them. In the ARI approach, the crucial step is to filter out a high-quality, useful subset of rules from the large set of automatically created rules. In this paper, we propose an algorithm FAIR (Filtering of Automatically Induced Rules) to filter rules from a large number of automatically induced rules using submodular objective functions that account for the collective precision, coverage, and conflicts of the rule set. We experiment with three ARI approaches and five text classification datasets to validate the superior performance of our algorithm with respect to several semi-supervised label aggregation approaches. Further, we show that FAIR achieves statistically significant results in comparison to existing rule-filtering approaches. The source code is available at https://github.com/ayushbits/FAIR-LF-Induction.
2024.eacl-long.34
bajpai-etal-2024-fair
+
NNOSE: Nearest Neighbor Occupational Skill Extraction
@@ -429,6 +464,7 @@
The labor market is changing rapidly, prompting increased interest in the automatic extraction of occupational skills from text. With the advent of English benchmark job description datasets, there is a need for systems that handle their diversity well. We tackle the complexity of occupational skill datasets: combining and leveraging multiple datasets for skill extraction, identifying rarely observed skills within a dataset, and overcoming the scarcity of skills across datasets. In particular, we investigate the retrieval-augmentation of language models, employing an external datastore for retrieving similar skills in a dataset-unifying manner. Our proposed method, Nearest Neighbor Occupational Skill Extraction (NNOSE), effectively leverages multiple datasets by retrieving neighboring skills from other datasets in the datastore. This improves skill extraction without additional fine-tuning. Crucially, we observe a performance gain in predicting infrequent patterns, with substantial gains of up to 30% span-F1 in cross-dataset settings.
2024.eacl-long.35
zhang-etal-2024-nnose
+
GAINER: Graph Machine Learning with Node-specific Radius for Classification of Short Texts and Documents
@@ -437,6 +473,7 @@
Graphs provide a natural, intuitive, and holistic means to capture relationships between different text elements in Natural Language Processing (NLP) such as words, sentences, and documents. Recent advancements in the field of Graph Machine Learning (GML) have led to the development of numerous models to process text for various natural language applications, including but not limited to short-text classification, document classification, and others. At the heart of GML models, specifically those based on Graph Neural Networks (GNNs), lies the message passing operation which has shown to be an essential component for strong empirical performance in NLP. However, the number of message passing steps (often known as the radius) is \textit{fixed for all the nodes} in existing GML models for NLP. Fixing the radius poses a fundamental restriction as nodes exhibit diverse properties and varying amounts of informative local structures in the input graph. This paper presents GAINER, a novel framework called Graph mAchine learnIng with Node-spEcific Radius, aimed at graph-based NLP. We propose non-neural and novel neural approaches built on the core ideas of GAINER. Through rigorous experimentation, we demonstrate the efficacy of GAINER in various popular NLP tasks.
2024.eacl-long.36
yadati-2024-gainer
+
MAFIA: Multi-Adapter Fused Inclusive Language Models
@@ -449,6 +486,7 @@
Pretrained Language Models (PLMs) are widely used in NLP for various tasks. Recent studies have identified various biases that such models exhibit and have proposed methods to correct these biases. However, most of the works address a limited set of bias dimensions independently such as gender, race, or religion. Moreover, the methods typically involve finetuning the full model in order to maintain the performance on the downstream task. In this work, we aim to modularly debias a pre-trained language model across multiple dimensions. Previous works extensively explored debiasing PLMs by using limited US-centric counterfactual data augmentation (CDA). We use structured knowledge and a large generative model to build a diverse CDA across multiple bias dimensions in a semi-automated way. We highlight how existing debiasing methods do not consider interactions between multiple societal biases and propose a debiasing model that exploits the synergy amongst various societal biases and enables multi-bias debiasing simultaneously. An extensive evaluation on multiple tasks and languages demonstrates the efficacy of the approach.
2024.eacl-long.37
jain-etal-2024-mafia
+
Code-Switched Language Identification is Harder Than You Think
@@ -460,6 +498,7 @@
Code switching (CS) is a very common phenomenon in written and spoken communication, but is handled poorly by many NLP applications. Looking to the application of building CS corpora, we explore CS language identification for corpus building. We make the task more realistic by scaling it to more languages and considering models with simpler architectures for faster inference. We also reformulate the task as a sentence-level multi-label tagging problem to make it more tractable. Having defined the task, we investigate three reasonable architectures for this task and define metrics which better reflect desired performance. We present empirical evidence that no current approach is adequate, and finally provide recommendations for future work in this area.
2024.eacl-long.38
burchell-etal-2024-code
+
Generation-driven Contrastive Self-training for Zero-shot Text Classification with Instruction-following LLM
@@ -471,6 +510,7 @@
2024.eacl-long.39
2024.eacl-long.39.software.zip
zhang-etal-2024-generation
+
Quantifying the Hyperparameter Sensitivity of Neural Networks for Character-level Sequence-to-Sequence Tasks
@@ -481,6 +521,7 @@
Hyperparameter tuning, the process of searching for suitable hyperparameters, becomes more difficult as the computing resources required to train neural networks continue to grow. This topic continues to receive little attention and discussion—much of it hearsay—despite its obvious importance. We attempt to formalize hyperparameter sensitivity using two metrics: similarity-based sensitivity and performance-based sensitivity. We then use these metrics to quantify two such claims: (1) transformers are more sensitive to hyperparameter choices than LSTMs and (2) transformers are particularly sensitive to batch size. We conduct experiments on two different character-level sequence-to-sequence tasks and find that, indeed, the transformer is slightly more sensitive to hyperparameters according to both of our metrics. However, we do not find that it is more sensitive to batch size in particular.
2024.eacl-long.40
wiemerslage-etal-2024-quantifying
+
Examining Gender and Racial Bias in Large Vision–Language Models Using a Novel Dataset of Parallel Images
@@ -491,6 +532,7 @@
2024.eacl-long.41
2024.eacl-long.41.note.zip
fraser-kiritchenko-2024-examining
+
ConstraintChecker: A Plugin for Large Language Models to Reason on Commonsense Knowledge Bases
@@ -504,6 +546,7 @@
2024.eacl-long.42
2024.eacl-long.42.software.zip
do-etal-2024-constraintchecker
+
A* shortest string decoding for non-idempotent semirings
@@ -514,6 +557,7 @@
2024.eacl-long.43
2024.eacl-long.43.software.tgz
gorman-allauzen-2024-shortest
+
Importance-Aware Data Augmentation for Document-Level Neural Machine Translation
@@ -526,6 +570,7 @@
Document-level neural machine translation (DocNMT) aims to generate translations that are both coherent and cohesive, in contrast to its sentence-level counterpart. However, due to its longer input length and limited availability of training data, DocNMT often faces the challenge of data sparsity. To overcome this issue, we propose a novel Importance-Aware Data Augmentation (IADA) algorithm for DocNMT that augments the training data based on token importance information estimated by the norm of hidden states and training gradients. We conduct comprehensive experiments on three widely-used DocNMT benchmarks. Our empirical results show that our proposed IADA outperforms strong DocNMT baselines as well as several data augmentation approaches, with statistical significance on both sentence-level and document-level BLEU.
2024.eacl-long.44
wu-etal-2024-importance
+
Lost in Translationese? Reducing Translation Effect Using Abstract Meaning Representation
@@ -535,6 +580,7 @@
Translated texts bear several hallmarks distinct from texts originating in the language (“translationese”). Though individual translated texts are often fluent and preserve meaning, at a large scale, translated texts have statistical tendencies which distinguish them from text originally written in the language and can affect model performance. We frame the novel task of translationese reduction and hypothesize that Abstract Meaning Representation (AMR), a graph-based semantic representation which abstracts away from the surface form, can be used as an interlingua to reduce the amount of translationese in translated texts. By parsing English translations into an AMR and then generating text from that AMR, the result more closely resembles originally English text across three quantitative macro-level measures, without severely compromising fluency or adequacy. We compare our AMR-based approach against three other techniques based on machine translation or paraphrase generation. This work represents the first approach to reducing translationese in text and highlights the promise of AMR, given that our AMR-based approach outperforms more computationally intensive methods.
2024.eacl-long.45
wein-schneider-2024-lost
+
Comparing Template-based and Template-free Language Model Probing
@@ -567,6 +613,7 @@
2024.eacl-long.48
2024.eacl-long.48.software.zip
hawasly-etal-2024-scaling
+
AnthroScore: A Computational Linguistic Measure of Anthropomorphism
@@ -580,6 +627,7 @@
2024.eacl-long.49.software.zip
2024.eacl-long.49.note.zip
cheng-etal-2024-anthroscore
+
Centering the Speech Community
@@ -590,6 +638,7 @@
2024.eacl-long.50
bird-yibarbuk-2024-centering
Outstanding Paper Award
+
Improving the TENOR of Labeling: Re-evaluating Topic Models for Content Analysis
@@ -605,6 +654,7 @@
Topic models are a popular tool for understanding text collections, but their evaluation has been a point of contention. Automated evaluation metrics such as coherence are often used, however, their validity has been questioned for neural topic models (NTMs) and can overlook a model’s benefits in real-world applications. To this end, we conduct the first evaluation of neural, supervised and classical topic models in an interactive task-based setting. We combine topic models with a classifier and test their ability to help humans conduct content analysis and document annotation. From simulated, real user and expert pilot studies, the Contextual Neural Topic Model does the best on cluster evaluation metrics and human evaluations; however, LDA is competitive with two other NTMs under our simulated experiment and user study results, contrary to what coherence scores suggest. We show that current automated metrics do not provide a complete picture of topic modeling capabilities, but the right choice of NTMs can be better than classical models on practical tasks.
2024.eacl-long.51
li-etal-2024-improving
+
Quality Does Matter: A Detailed Look at the Quality and Utility of Web-Mined Parallel Corpora
@@ -618,6 +668,7 @@
2024.eacl-long.52
ranathunga-etal-2024-quality
Low-Resource Paper Award
+
VOLTAGE: A Versatile Contrastive Learning based OCR Methodology for ultra low-resource scripts through Auto Glyph Feature Extraction
@@ -645,6 +696,7 @@
2024.eacl-long.54.software.zip
2024.eacl-long.54.note.zip
stoehr-etal-2024-unsupervised
+
Entity-level Factual Adaptiveness of Fine-tuning based Abstractive Summarization Models
@@ -659,6 +711,7 @@
Abstractive summarization models often generate factually inconsistent content particularly when the parametric knowledge of the model conflicts with the knowledge in the input document. In this paper, we analyze the robustness of fine-tuning based summarization models to the knowledge conflict, which we call factual adaptiveness. We utilize pre-trained language models to construct evaluation sets and find that factual adaptiveness is not strongly correlated with factual consistency on original datasets. Furthermore, we introduce a controllable counterfactual data augmentation method where the degree of knowledge conflict within the augmented data can be adjustable. Our experimental results on two pre-trained language models (PEGASUS and BART) and two fine-tuning datasets (XSum and CNN/DailyMail) demonstrate that our method enhances factual adaptiveness while achieving factual consistency on original datasets on par with the contrastive learning baseline.
2024.eacl-long.55
song-etal-2024-entity
+
Meme-ingful Analysis: Enhanced Understanding of Cyberbullying in Memes Through Multimodal Explanations
@@ -672,6 +725,7 @@
Internet memes have gained significant influence in communicating political, psychological, and sociocultural ideas. While memes are often humorous, there has been a rise in the use of memes for trolling and cyberbullying. Although a wide variety of effective deep learning-based models have been developed for detecting offensive multimodal memes, only a few works have addressed the explainability aspect. Recent laws like the “right to explanations” of the General Data Protection Regulation have spurred research in developing interpretable models rather than only focusing on performance. Motivated by this, we introduce MultiBully-Ex, the first benchmark dataset for multimodal explanation from code-mixed cyberbullying memes. Here, both visual and textual modalities are highlighted to explain why a given meme is cyberbullying. A Contrastive Language-Image Pretraining (CLIP) projection based multimodal shared-private multitask approach has been proposed for visual and textual explanation of a meme. Experimental results demonstrate that training with multimodal explanations improves performance in generating textual justifications and more accurately identifying the visual evidence supporting a decision, with reliable performance improvements.
2024.eacl-long.56
jha-etal-2024-meme
+
LaMini-LM: A Diverse Herd of Distilled Models from Large-Scale Instructions
@@ -679,11 +733,12 @@
AbdulWaheedMohamed bin Zayed University of Artificial Intelligence
ChiyuZhangUniversity of British Columbia
MuhammadAbdul-MageedUniversity of British Columbia
- AlhamAjiMohamed bin Zayed University of Artificial Intelligence and Amazon
+ Alham FikriAjiMohamed bin Zayed University of Artificial Intelligence and Amazon
944-964
Large language models (LLMs) with instruction fine-tuning demonstrate superior generative capabilities. However, these models are resource-intensive. To alleviate this issue, we explore distilling knowledge from instruction-tuned LLMs into much smaller ones. While similar works exist, they are often conducted on a limited set of (usually still large) models and are not accompanied by proper evaluations. To this end, we carefully develop a large set of 2.58M instructions based on both existing and newly-generated instructions. In addition to being sizable, we design our instructions to cover a broad set of topics to ensure diversity. Extensive analysis of our instruction dataset confirms its diversity, and we generate responses for these instructions using gpt-3.5-turbo. Leveraging these instructions, we fine-tune a diverse herd of models, collectively referred to as LaMini-LM, which includes models from both the encoder-decoder and decoder-only families, with varying sizes. We evaluate the performance of our models using automatic metrics on 15 different natural language processing (NLP) benchmarks, as well as through human assessment. We also assess the model for hallucination and toxicity, and for the former, we introduce a new benchmark dataset for hallucination-inducing QA. The results demonstrate that our proposed LaMini-LM models are comparable to strong baselines while being much smaller in size.
2024.eacl-long.57
wu-etal-2024-lamini
+
Automated Cognate Detection as a Supervised Link Prediction Task with Cognate Transformer
@@ -694,6 +749,7 @@
2024.eacl-long.58
2024.eacl-long.58.software.zip
akavarapu-bhattacharya-2024-automated
+
Leveraging Multi-lingual Positive Instances in Contrastive Learning to Improve Sentence Embedding
@@ -705,6 +761,7 @@
Learning multilingual sentence embeddings is a fundamental task in natural language processing. Recent trends in learning both monolingual and multilingual sentence embeddings are mainly based on contrastive learning (CL) among an anchor, one positive, and multiple negative instances. In this work, we argue that leveraging multiple positives should be considered for multilingual sentence embeddings because (1) positives in a diverse set of languages can benefit cross-lingual learning, and (2) transitive similarity across multiple positives can provide reliable structural information for learning. In order to investigate the impact of multiple positives in CL, we propose a novel approach, named MPCL, to effectively utilize multiple positive instances to improve the learning of multilingual sentence embeddings. Experimental results on various backbone models and downstream tasks demonstrate that MPCL leads to better retrieval, semantic similarity, and classification performance compared to conventional CL. We also observe that in unseen languages, sentence embedding models trained on multiple positives show better cross-lingual transfer performance than models trained on a single positive instance.
2024.eacl-long.59
zhao-etal-2024-leveraging
+
Moderation in the Wild: Investigating User-Driven Moderation in Online Discussions
@@ -729,6 +786,7 @@
2024.eacl-long.61
2024.eacl-long.61.note.zip
micallef-etal-2024-cross
+
Where Do We Go From Here? Multi-scale Allocentric Relational Inference from Natural Spatial Descriptions
@@ -742,6 +800,7 @@
2024.eacl-long.62
2024.eacl-long.62.note.zip
paz-argaman-etal-2024-go
+
Bias in Opinion Summarisation from Pre-training to Adaptation: A Case Study in Political Bias
@@ -752,6 +811,7 @@
Opinion summarisation aims to summarise the salient information and opinions presented in documents such as product reviews, discussion forums, and social media texts into short summaries that enable users to effectively understand the opinions therein. Generating biased summaries has the risk of potentially swaying public opinion. Previous studies have examined bias in opinion summarisation using extractive models, but limited research has paid attention to abstractive summarisation models. In this study, using political bias as a case study, we first establish a methodology to quantify bias in abstractive models, then trace it from the pre-trained models to the task of summarising social media opinions using different models and adaptation methods. We find that most models exhibit intrinsic bias. Using a social media text summarisation dataset and contrasting various adaptation methods, we find that tuning a smaller number of parameters is less biased compared to standard fine-tuning; however, the diversity of topics in training data used for fine-tuning is critical.
2024.eacl-long.63
huang-etal-2024-bias
+
Document Structure in Long Document Transformers
@@ -766,6 +826,7 @@
2024.eacl-long.64.software.zip
2024.eacl-long.64.note.zip
buchmann-etal-2024-document
+
The Role of Data Curation in Image Captioning
@@ -778,6 +839,7 @@
2024.eacl-long.65
2024.eacl-long.65.software.zip
li-etal-2024-role
+
Large-Scale Bitext Corpora Provide New Evidence for Cognitive Representations of Spatial Terms
@@ -789,6 +851,7 @@
Recent evidence from cognitive science suggests that there exist two classes of cognitive representations within the spatial terms of a language, one represented geometrically (e.g., above, below) and the other functionally (e.g., on, in). It has been hypothesized that geometric terms are more constrained and are mastered relatively early in language learning, whereas functional terms are less constrained and are mastered over longer time periods (Landau, 2016). One consequence of this hypothesis is that these two classes should exhibit different cross-linguistic variability, which is supported by human elicitation studies. In this work we present to our knowledge the first corpus-based empirical test of this hypothesis. We develop a pipeline for extracting, isolating, and aligning spatial terms in basic locative constructions from parallel text. Using Shannon entropy to measure the variability of spatial term use across eight languages, we find supporting evidence that variability in functional terms differs significantly from that of geometric terms. We also perform latent variable modeling and find support for the division of spatial terms into geometric and functional classes.
2024.eacl-long.66
viechnicki-etal-2024-large
+
REFINER: Reasoning Feedback on Intermediate Representations
@@ -803,6 +866,7 @@
Language models (LMs) have recently shown remarkable performance on reasoning tasks by explicitly generating intermediate inferences, e.g., chain-of-thought prompting. However, these intermediate inference steps may be inappropriate deductions from the initial context and lead to incorrect final predictions. Here we introduce REFINER, a framework for finetuning LMs to explicitly generate intermediate reasoning steps while interacting with a critic model that provides automated feedback on the reasoning. Specifically, the critic provides structured feedback that the reasoning LM uses to iteratively improve its intermediate arguments. Empirical evaluations of REFINER on three diverse reasoning tasks show significant improvements over baseline LMs of comparable scale. Furthermore, when using GPT-3.5 or ChatGPT as the reasoner, the trained critic significantly improves reasoning without finetuning the reasoner. Finally, our critic model is trained without expensive human-in-the-loop data but can be substituted with humans at inference time.
2024.eacl-long.67
paul-etal-2024-refiner
+
HumBEL: A Human-in-the-Loop Approach for Evaluating Demographic Factors of Language Models in Human-Machine Conversations
@@ -813,6 +877,7 @@
While demographic factors like age and gender change the way people talk, and in particular, the way people talk to machines, there is little investigation into how large pre-trained language models (LMs) can adapt to these changes. To remedy this gap, we consider how demographic factors in LM language skills can be measured to determine compatibility with a target demographic. We suggest clinical techniques from Speech Language Pathology, which has norms for acquisition of language skills in humans. We conduct evaluation with a domain expert (i.e., a clinically licensed speech language pathologist), and also propose automated techniques to complement clinical evaluation at scale. Empirically, we focus on age, finding LM capability varies widely depending on task: GPT-3.5 mimics the ability of humans ranging from age 6-15 at tasks requiring inference, and simultaneously, outperforms a typical 21-year-old at memorization. GPT-3.5 also has trouble with social language use, exhibiting less than 50% of the tested pragmatic skills. Findings affirm the importance of considering demographic alignment and conversational goals when using LMs as public-facing tools. Code, data, and a package will be available.
2024.eacl-long.68
sicilia-etal-2024-humbel
+
LOCOST: State-Space Models for Long Document Abstractive Summarization
@@ -832,6 +897,7 @@
Add an extra acknowledgement.
Best Paper Award
+
A Classification-Guided Approach for Adversarial Attacks against Neural Machine Translation
@@ -842,6 +908,7 @@
Neural Machine Translation (NMT) models have been shown to be vulnerable to adversarial attacks, wherein carefully crafted perturbations of the input can mislead the target model. In this paper, we introduce ACT, a novel adversarial attack framework against NMT systems guided by a classifier. In our attack, the adversary aims to craft meaning-preserving adversarial examples whose translations in the target language by the NMT model belong to a different class than the original translations. Unlike previous attacks, our new approach has a more substantial effect on the translation by altering the overall meaning, which then leads to a different class determined by an oracle classifier. To evaluate the robustness of NMT models to our attack, we propose enhancements to existing black-box word-replacement-based attacks by incorporating output translations of the target NMT model and the output logits of a classifier within the attack process. Extensive experiments, including a comparison with existing untargeted attacks, show that our attack is considerably more successful in altering the class of the output translation and has more effect on the translation. This new paradigm can reveal the vulnerabilities of NMT systems by focusing on the class of translation rather than the mere translation quality as studied traditionally.
2024.eacl-long.70
sadrizadeh-etal-2024-classification
+
Improving Generalization in Semantic Parsing by Increasing Natural Language Variation
@@ -851,6 +918,7 @@
Text-to-SQL semantic parsing has made significant progress in recent years, with various models demonstrating impressive performance on the challenging Spider benchmark. However, it has also been shown that these models often struggle to generalize even when faced with small perturbations of previously (accurately) parsed expressions. This is mainly due to the linguistic form of questions in Spider which are overly specific, unnatural, and display limited variation. In this work, we use data augmentation to enhance the robustness of text-to-SQL parsers against natural language variations. Existing approaches generate question reformulations either via models trained on Spider or only introduce local changes. In contrast, we leverage the capabilities of large language models to generate more realistic and diverse questions. Using only a few prompts, we achieve a two-fold increase in the number of questions in Spider. Training on this augmented dataset yields substantial improvements on a range of evaluation sets, including robustness benchmarks and out-of-domain data.
2024.eacl-long.71
saparina-lapata-2024-improving
+
Text-to-Code Generation with Modality-relative Pre-training
@@ -861,6 +929,7 @@
Large pre-trained language models have recently been expanded and applied to programming language tasks with great success, often through further pre-training of a strictly-natural language model, where training sequences typically contain both natural and (linearised) programming language. Such approaches effectively map both modalities of the sequence into the same embedding space. However, programming language keywords (e.g. “while”) often have very strictly defined semantics. As such, transfer learning from their natural language usage may not necessarily be beneficial to their code application and vice versa. Assuming an already pre-trained language model, in this work we investigate how sequence tokens can be adapted and represented differently, depending on which modality they belong to, and to the ultimate benefit of the downstream task. We experiment with separating embedding spaces between modalities during further model pre-training with modality-relative training objectives. We focus on text-to-code generation and observe consistent improvements across two backbone models and two test sets, measuring pass@k and a novel incremental variation.
2024.eacl-long.72
christopoulou-etal-2024-text
+
No Error Left Behind: Multilingual Grammatical Error Correction with Pre-trained Translation Models
@@ -871,6 +940,7 @@
Grammatical Error Correction (GEC) enhances language proficiency and promotes effective communication, but research has primarily centered around English. We propose a simple approach to multilingual and low-resource GEC by exploring the potential of multilingual machine translation (MT) models for error correction. We show that MT models are not only capable of error correction out-of-the-box, but that they can also be fine-tuned to even better correction quality. Results show the effectiveness of this approach, with our multilingual model outperforming similar-sized mT5-based models and even competing favourably with larger models.
2024.eacl-long.73
luhtaru-etal-2024-error
+
Quantifying Stereotypes in Language
@@ -880,6 +950,7 @@
2024.eacl-long.74
2024.eacl-long.74.software.zip
liu-2024-quantifying
+
Generation, Distillation and Evaluation of Motivational Interviewing-Style Reflections with a Foundational Language Model
@@ -893,6 +964,7 @@
Large Foundational Language Models are capable of performing many tasks at a high level but are difficult to deploy in many applications because of their size and proprietary ownership. Many will be motivated to distill specific capabilities of foundational models into smaller models that can be owned and controlled. In the development of a therapeutic chatbot, we wish to distill a capability known as reflective listening, in which a therapist produces reflections of client speech. These reflections either restate what a client has said, or connect what was said to a relevant observation, idea or guess that encourages and guides the client to continue contemplation. In this paper, we present a method for distilling the generation of reflections from a Foundational Language Model (GPT-4) into smaller models. We first show that GPT-4, using zero-shot prompting, can generate reflections at near 100% success rate, superior to all previous methods. Using reflections generated by GPT-4, we fine-tune different sizes of the GPT-2 family. The GPT-2-small model achieves 83% success on a hold-out test set and the GPT-2 XL achieves 90% success. We also show that GPT-4 can help in the labor-intensive task of evaluating the quality of the distilled models, using it as a zero-shot classifier. Using triple-human review as a guide, the classifier achieves a Cohen-Kappa of 0.66, a substantial inter-rater reliability figure.
2024.eacl-long.75
brown-etal-2024-generation
+
Multi-Reference Benchmarks for Russian Grammatical Error Correction
@@ -902,6 +974,7 @@
This paper presents multi-reference benchmarks for the Grammatical Error Correction (GEC) of Russian, based on two existing single-reference datasets, for a total of 7,444 learner sentences from a variety of first language backgrounds. Each sentence is corrected independently by two new raters, and their corrections are reviewed by a senior annotator, resulting in a total of three references per sentence. Analysis of the annotations reveals that the new raters tend to make more changes, compared to the original raters, especially at the lexical level. We conduct experiments with two popular GEC approaches and show competitive performance on the original datasets and the new benchmarks. We also compare system scores as evaluated against individual annotators and discuss the effect of using multiple references overall and on specific error types. We find that using the union of the references increases system scores by more than 10 points and decreases the gap between system and human performance, thereby providing a more realistic evaluation of GEC system performance, although the effect is not the same across the error types. The annotations are available for research.
2024.eacl-long.76
palma-gomez-rozovskaya-2024-multi
+
Plan-Grounded Large Language Models for Dual Goal Conversational Settings
@@ -914,6 +987,7 @@
Training Large Language Models (LLMs) to follow user instructions has been shown to supply the LLM with ample capacity to converse fluently while being aligned with humans. Yet, it is not completely clear how an LLM can lead a plan-grounded conversation in mixed-initiative settings where instructions flow in both directions of the conversation, i.e., both the LLM and the user provide instructions to one another. In this paper, we tackle a dual goal mixed-initiative conversational setting where the LLM not only grounds the conversation on an arbitrary plan but also seeks to satisfy both a procedural plan and user instructions. The LLM is then responsible for guiding the user through the plan and, at the same time, adapting to new circumstances, answering questions, and activating safety guardrails when needed. We propose a novel LLM that grounds the dialogue on a procedural plan, can take the dialogue initiative, and enforces guardrails on the system’s behavior, while also improving the LLM’s responses to unexpected user behavior. Experiments in controlled settings and with real users show that the best-performing model, which we call PlanLLM, achieves a 2.1x improvement over a strong baseline. Moreover, experiments also show good generalization to unseen domains.
2024.eacl-long.77
gloria-silva-etal-2024-plan
+
“Define Your Terms”: Enhancing Efficient Offensive Speech Classification with Definition
@@ -924,6 +998,7 @@
The propagation of offensive content through social media channels has garnered the attention of the research community. Multiple works have proposed various semantically related yet subtly distinct categories of offensive speech. In this work, we explore meta-learning approaches to leverage the diversity of offensive speech corpora to enhance their reliable and efficient detection. We propose a joint embedding architecture that incorporates the input’s label and definition for classification via a Prototypical Network. Our model achieves at least 75% of the maximal F1-score while using less than 10% of the available training data across 4 datasets. Our experimental findings also provide a case study of training strategies valuable to combat resource scarcity.
2024.eacl-long.78
nghiem-etal-2024-define
+
VlogQA: Task, Dataset, and Baseline Models for Vietnamese Spoken-Based Machine Reading Comprehension
@@ -940,6 +1015,7 @@
ngo-etal-2024-vlogqa
Minor updates.
+
CEV-LM: Controlled Edit Vector Language Model for Shaping Natural Language Generations
@@ -951,6 +1027,7 @@
2024.eacl-long.80
2024.eacl-long.80.software.zip
moorjani-etal-2024-cev
+
It’s All Relative: Learning Interpretable Models for Scoring Subjective Bias in Documents from Pairwise Comparisons
@@ -961,6 +1038,7 @@
We propose an interpretable model to score the subjective bias present in documents, based only on their textual content. Our model is trained on pairs of revisions of the same Wikipedia article, where one version is more biased than the other. Although prior approaches based on bias classification have struggled to obtain a high accuracy for the task, we are able to develop a useful model for scoring bias by learning to accurately perform pairwise comparisons. We show that we can interpret the parameters of the trained model to discover the words most indicative of bias. We also apply our model in three different settings by studying the temporal evolution of bias in Wikipedia articles, comparing news sources based on bias, and scoring bias in law amendments. In each case, we demonstrate that the outputs of the model can be explained and validated, even for the two domains that are outside the training-data domain. We also use the model to compare the general level of bias between domains, where we see that legal texts are the least biased and news media are the most biased, with Wikipedia articles in between.
2024.eacl-long.81
suresh-etal-2024-relative
+
HiGen: Hierarchy-Aware Sequence Generation for Hierarchical Text Classification
@@ -992,7 +1070,7 @@
TarekMahmoud
ToruSasakiTechnische Universität Darmstadt
ThomasArnoldTechnische Universität Darmstadt
- AlhamAjiMohamed bin Zayed University of Artificial Intelligence and Amazon
+ Alham FikriAjiMohamed bin Zayed University of Artificial Intelligence and Amazon
NizarHabashNew York University Abu Dhabi
IrynaGurevychMohamed bin Zayed University of Artificial Intelligence and Technical University of Darmstadt
PreslavNakov
@@ -1001,6 +1079,7 @@
2024.eacl-long.83
wang-etal-2024-m4
Resource Paper Award
+
A Truly Joint Neural Architecture for Segmentation and Parsing
@@ -1010,6 +1089,7 @@
Contemporary multilingual dependency parsers can parse a diverse set of languages, but for Morphologically Rich Languages (MRLs), performance is attested to be lower than for other languages. The key challenge is that, due to high morphological complexity and ambiguity of the space-delimited input tokens, the linguistic units that act as nodes in the tree are not known in advance. Pre-neural dependency parsers for MRLs subscribed to the joint morpho-syntactic hypothesis, stating that morphological segmentation and syntactic parsing should be solved jointly, rather than as a pipeline where segmentation precedes parsing. However, neural state-of-the-art parsers to date use a strict pipeline. In this paper we introduce a joint neural architecture where a lattice-based representation preserving all morphological ambiguity of the input is provided to an arc-factored model, which then solves the morphological segmentation and syntactic parsing tasks at once. Our experiments on Hebrew, a rich and highly ambiguous MRL, demonstrate state-of-the-art performance on parsing, tagging and segmentation of the Hebrew section of UD, using a single model. This proposed architecture is LLM-based and language agnostic, providing a solid foundation for MRLs to obtain further performance improvements and bridge the gap with other languages.
2024.eacl-long.84
yshaayahu-levi-tsarfaty-2024-truly
+
ViLexNorm: A Lexical Normalization Corpus for Vietnamese Social Media Text
@@ -1021,6 +1101,7 @@
2024.eacl-long.85
2024.eacl-long.85.note.zip
nguyen-etal-2024-vilexnorm
+
Diffusion-NAT: Self-Prompting Discrete Diffusion for Non-Autoregressive Text Generation
@@ -1044,6 +1125,7 @@
2024.eacl-long.87
2024.eacl-long.87.software.zip
chernyavskiy-etal-2024-unleashing
+
Predicting Client Emotions and Therapist Interventions in Psychotherapy Dialogues
@@ -1056,6 +1138,7 @@
Natural Language Processing (NLP) can advance psychotherapy research by scaling up therapy dialogue analysis as well as by allowing researchers to examine client-therapist interactions in detail. Previous studies have mainly either explored the clients’ behavior or the therapists’ intervention in dialogues. Yet, modelling conversations from both dialogue participants is crucial to understanding the therapeutic interaction. This study explores speaker contribution-based dialogue acts at the utterance level; i.e., the therapist - Intervention Prediction (IP) and the client - Emotion Recognition (ER) in psychotherapy using a pan-theoretical schema. We perform experiments with fine-tuned language models and light-weight adapter solutions on a Hebrew dataset. We deploy the results from our ER model predictions in investigating the coherence between client self-reports on emotion and the utterance-level emotions. Our best adapters achieved on-par performance with fully fine-tuned models, at 0.64 and 0.66 micro F1 for IP and ER, respectively. In addition, our analysis identifies ambiguities within categorical clinical coding, which can be used to fine-tune the coding schema. Finally, our results indicate a positive correlation between client self-reports and utterance-level emotions.
2024.eacl-long.88
mayer-etal-2024-predicting
+
Who Needs Decoders? Efficient Estimation of Sequence-Level Attributes with Proxies
@@ -1067,6 +1150,7 @@
Sequence-to-sequence models often require an expensive autoregressive decoding process. However, for some downstream tasks such as out-of-distribution (OOD) detection and resource allocation, the actual decoding output is not needed, just a scalar attribute of this sequence. In such scenarios, where knowing the quality of a system’s output to predict poor performance prevails over knowing the output itself, is it possible to bypass the autoregressive decoding? We propose Non-Autoregressive Proxy (NAP) models that can efficiently predict scalar-valued sequence-level attributes. Importantly, NAPs predict these metrics directly from the encodings, avoiding the expensive decoding stage. We consider two sequence tasks: Machine Translation (MT) and Automatic Speech Recognition (ASR). In OOD for MT, NAPs outperform ensembles while being significantly faster. NAPs are also proven capable of predicting metrics such as BERTScore (MT) or word error rate (ASR). For downstream tasks, such as data filtering and resource optimization, NAPs generate performance predictions that outperform predictive uncertainty while being highly inference efficient.
2024.eacl-long.89
fathullah-etal-2024-needs
+
3D Rotation and Translation for Hyperbolic Knowledge Graph Embedding
@@ -1077,6 +1161,7 @@
2024.eacl-long.90
2024.eacl-long.90.software.zip
zhu-shimodaira-2024-3d
+
Geo-Encoder: A Chunk-Argument Bi-Encoder Framework for Chinese Geographic Re-Ranking
@@ -1092,6 +1177,7 @@
Chinese geographic re-ranking task aims to find the most relevant addresses among retrieved candidates, which is crucial for location-related services such as navigation maps. Unlike general sentences, Chinese geographic contexts are closely intertwined with geographical concepts, from general spans (e.g., province) to specific spans (e.g., road). Given this feature, we propose an innovative framework, namely Geo-Encoder, to more effectively integrate Chinese geographical semantics into re-ranking pipelines. Our methodology begins by employing off-the-shelf tools to associate text with geographical spans, treating them as chunking units. Then, we present a multi-task learning module to simultaneously acquire an effective attention matrix that determines chunk contributions to geographic representations. Furthermore, we put forth an asynchronous update mechanism for the proposed task, aiming to guide the model to focus on specific chunks. Experiments on two Chinese benchmark datasets show that the Geo-Encoder achieves significant improvements when compared to state-of-the-art baselines. Notably, it leads to a substantial improvement in the Hit@1 score of MGEO-BERT, increasing it by 6.22% from 62.76 to 68.98 on the GeoTES dataset.
2024.eacl-long.91
cao-etal-2024-geo
+
Style-News: Incorporating Stylized News Generation and Adversarial Verification for Neural Fake News Detection
@@ -1113,6 +1199,7 @@
Despite the predominance of contextualized embeddings in NLP, approaches to detect semantic change relying on these embeddings and clustering methods underperform simpler counterparts based on static word embeddings. This stems from the poor ability of these clustering methods to produce sense clusters that capture word senses, especially those with low frequency. This issue hinders the next step in examining how changes in word senses in one language influence another. To address this issue, we propose a graph-based clustering approach to capture nuanced changes in both high- and low-frequency word senses across time and languages, including the acquisition and loss of these senses over time. Our experimental results show that our approach substantially surpasses previous approaches in the SemEval2020 binary classification task across four languages. Moreover, we showcase the ability of our approach as a versatile visualization tool to detect semantic changes in both intra-language and inter-language setups. We make our code and data publicly available.
2024.eacl-long.93
ma-etal-2024-graph
+
Translate to Disambiguate: Zero-shot Multilingual Word Sense Disambiguation with Pretrained Language Models
@@ -1123,6 +1210,7 @@
Pretrained Language Models (PLMs) learn rich cross-lingual knowledge and perform well on diverse tasks such as translation and multilingual word sense disambiguation (WSD) when finetuned. However, they often struggle at disambiguating word sense in a zero-shot setting. To better understand this contrast, we present a new study investigating how well PLMs capture cross-lingual word sense with Contextual Word-Level Translation (C-WLT), an extension of word-level translation that prompts the model to translate a given word in context. We find that as the model size increases, PLMs encode more cross-lingual word sense knowledge and better use context to improve WLT performance. Building on C-WLT, we introduce a zero-shot prompting approach for WSD, tested on 18 languages from the XL-WSD dataset. Our method outperforms fully supervised baselines on recall for many evaluation languages without additional training or finetuning. This study presents a first step towards understanding how to best leverage the cross-lingual knowledge inside PLMs for robust zero-shot reasoning in any language.
2024.eacl-long.94
kang-etal-2024-translate
+
Anchor Points: Benchmarking Models with Much Fewer Examples
@@ -1136,6 +1224,7 @@
2024.eacl-long.95.software.zip
2024.eacl-long.95.note.zip
vivek-etal-2024-anchor
+
SCO-VIST: Social Interaction Commonsense Knowledge-based Visual Storytelling
@@ -1146,6 +1235,7 @@
Visual storytelling aims to automatically generate a coherent story based on a given image sequence. Unlike tasks like image captioning, visual stories should contain factual descriptions, worldviews, and human social commonsense to put disjointed elements together to form a coherent and engaging human-writeable story. However, most models mainly focus on applying factual information and using taxonomic/lexical external knowledge when attempting to create stories. This paper introduces SCO-VIST, a framework representing the image sequence as a graph with objects and relations that includes human action motivation and its social interaction commonsense knowledge. SCO-VIST then takes this graph representing plot points and creates bridges between plot points with semantic and occurrence-based edge weights. This weighted story graph produces the storyline in a sequence of events using the Floyd-Warshall algorithm. Our proposed framework produces stories superior across multiple metrics in terms of visual grounding, coherence, diversity, and humanness, per both automatic and human evaluations.
2024.eacl-long.96
wang-etal-2024-sco
+
Discovering and Articulating Frames of Communication from Social Media Using Chain-of-Thought Reasoning
@@ -1155,6 +1245,7 @@
Frames of Communication (FoCs) are ubiquitous in social media discourse. They define what counts as a problem, diagnose what is causing the problem, elicit moral judgments and imply remedies for resolving the problem. Most research on automatic frame detection involved the recognition of the problems addressed by frames, but did not consider the articulation of frames. Articulating an FoC involves reasoning with salient problems, their cause and eventual solution. In this paper we present a method for Discovering and Articulating FoCs (DA-FoC) that relies on a combination of Chain-of-Thought prompting of large language models (LLMs) with In-Context Active Curriculum Learning. Very promising evaluation results indicate that 86.72% of the FoCs encoded by communication experts on the same reference dataset were also uncovered by DA-FoC. Moreover, DA-FoC uncovered many new FoCs, which escaped the experts. Interestingly, 55.1% of the known FoCs were judged as being better articulated than the human-written ones, while 93.8% of the new FoCs were judged as having sound rationale and being clearly articulated.
2024.eacl-long.97
weinzierl-harabagiu-2024-discovering
+
VEIL: Vetting Extracted Image Labels from In-the-Wild Captions for Weakly-Supervised Object Detection
@@ -1166,6 +1257,7 @@
2024.eacl-long.98.software.zip
2024.eacl-long.98.note.zip
rai-kovashka-2024-veil
+
WSC+: Enhancing The Winograd Schema Challenge Using Tree-of-Experts
@@ -1175,6 +1267,7 @@
The Winograd Schema Challenge (WSC) serves as a prominent benchmark for evaluating machine understanding. While Large Language Models (LLMs) excel at answering WSC questions, their ability to generate such questions remains less explored. In this work, we propose Tree-of-Experts (ToE), a novel prompting method which enhances the generation of WSC instances (50% valid cases vs. 10% in recent methods). Using this approach, we introduce WSC+, a novel dataset comprising 3,026 LLM-generated sentences. Notably, we extend the WSC framework by incorporating new ‘ambiguous’ and ‘offensive’ categories, providing a deeper insight into model overconfidence and bias. Our analysis reveals nuances in generation-evaluation consistency, suggesting that LLMs may not always outperform in evaluating their own generated questions when compared to those crafted by other models. On WSC+, GPT-4, the top-performing LLM, achieves an accuracy of 68.7%, significantly below the human benchmark of 95.1%.
2024.eacl-long.99
zahraei-emami-2024-wsc
+
Kardeş-NLU: Transfer to Low-Resource Languages with the Help of a High-Resource Cousin – A Benchmark and Evaluation for Turkic Languages
@@ -1191,6 +1284,7 @@
The revision changes the title due to ethical considerations.
Outstanding Paper Award
+
Chaining Event Spans for Temporal Relation Grounding
@@ -1202,6 +1296,7 @@
Accurately understanding temporal relations between events is a critical building block of diverse tasks, such as temporal reading comprehension (TRC) and relation extraction (TRE). For example, in TRC we need to understand the temporal semantic differences between the following two questions that are lexically near-identical: “What finished right before the decision?” or “What finished right after the decision?”. To discern the two questions, existing solutions have relied on answer overlaps as a proxy label to contrast similar and dissimilar questions. However, we claim that answer overlap can lead to unreliable results, due to spurious overlaps of two dissimilar questions with coincidentally identical answers. To address the issue, we propose a novel approach that elicits proper reasoning behaviors through a module for predicting time spans of events. We introduce the Timeline Reasoning Network (TRN), which operates in a two-step inductive reasoning process: in the first step, the model answers each question using semantic and syntactic information; the next step chains multiple questions on the same event to predict a timeline, which is then used to ground the answers. Results on TORQUE and TB-Dense (TRC and TRE tasks, respectively) demonstrate that TRN outperforms previous methods by effectively resolving the spurious overlaps using the predicted timeline.
2024.eacl-long.101
kim-etal-2024-chaining
+
Fine-Grained Natural Language Inference Based Faithfulness Evaluation for Diverse Summarisation Tasks
@@ -1212,6 +1307,7 @@
We study existing approaches to leverage off-the-shelf Natural Language Inference (NLI) models for the evaluation of summary faithfulness and argue that these are sub-optimal due to the granularity level considered for premises and hypotheses. That is, the smallest content unit considered as a hypothesis is a sentence, and premises are made up of a fixed number of document sentences. We propose a novel approach, namely INFUSE, that uses a variable premise size and simplifies summary sentences into shorter hypotheses. Departing from previous studies which focus on single short document summarisation, we analyse NLI based faithfulness evaluation for diverse summarisation tasks. We introduce DiverSumm, a new benchmark comprising long form summarisation (long documents and summaries) and diverse summarisation tasks (e.g., meeting and multi-document summarisation). In experiments, INFUSE obtains superior performance across the different summarisation tasks.
2024.eacl-long.102
zhang-etal-2024-fine
+
AnaDE1.0: A Novel Data Set for Benchmarking Analogy Detection and Extraction
@@ -1223,6 +1319,7 @@
Textual analogies that make comparisons between two concepts are often used for explaining complex ideas, creative writing, and scientific discovery. In this paper, we propose and study a new task, called Analogy Detection and Extraction (AnaDE), which includes three synergistic sub-tasks: 1) detecting documents containing analogies, 2) extracting text segments that make up the analogy, and 3) identifying the (source and target) concepts being compared. To facilitate the study of this new task, we create a benchmark dataset by scraping Metamia.com and investigate the performance of state-of-the-art models on all sub-tasks to establish the first-generation benchmark results for this new task. We find that the Longformer model achieves the best performance on all three sub-tasks, demonstrating its effectiveness for handling long texts. Moreover, smaller models fine-tuned on our dataset perform better than non-finetuned ChatGPT, suggesting high task difficulty. Overall, the models achieve high performance on document detection, suggesting that they could be used to develop applications like analogy search engines. Further, there is large room for improvement on the segment and concept extraction tasks.
2024.eacl-long.103
bhavya-etal-2024-anade1
+
A Comprehensive Survey of Sentence Representations: From the BERT Epoch to the CHATGPT Era and Beyond
@@ -1236,6 +1333,7 @@
Sentence representations are a critical component in NLP applications such as retrieval, question answering, and text classification. They capture the meaning of a sentence, enabling machines to understand and reason over human language. In recent years, significant progress has been made in developing methods for learning sentence representations, including unsupervised, supervised, and transfer learning approaches. However, there has been no literature review on sentence representations to date. In this paper, we provide an overview of the different methods for sentence representation learning, focusing mostly on deep learning models. We provide a systematic organization of the literature, highlighting the key contributions and challenges in this area. Overall, our review highlights the importance of this area in natural language processing, the progress made in sentence representation learning, and the challenges that remain. We conclude with directions for future research, suggesting potential avenues for improving the quality and efficiency of sentence representations.
2024.eacl-long.104
ramesh-kashyap-etal-2024-comprehensive
+
Learning to Retrieve In-Context Examples for Large Language Models
@@ -1248,6 +1346,7 @@
2024.eacl-long.105.software.zip
2024.eacl-long.105.note.zip
wang-etal-2024-learning
+
EnCore: Fine-Grained Entity Typing by Pre-Training Entity Encoders on Coreference Chains
@@ -1257,6 +1356,7 @@
Entity typing is the task of assigning semantic types to the entities that are mentioned in a text. In the case of fine-grained entity typing (FET), a large set of candidate type labels is considered. Since obtaining sufficient amounts of manual annotations is then prohibitively expensive, FET models are typically trained using distant supervision. In this paper, we propose to improve on this process by pre-training an entity encoder such that embeddings of coreferring entities are more similar to each other than to the embeddings of other entities. The main problem with this strategy, which helps to explain why it has not previously been considered, is that predicted coreference links are often too noisy. We show that this problem can be addressed by using a simple trick: we only consider coreference links that are predicted by two different off-the-shelf systems. With this prudent use of coreference links, our pre-training strategy allows us to improve the state-of-the-art in benchmarks on fine-grained entity typing, as well as traditional entity extraction.
2024.eacl-long.106
mtumbuka-schockaert-2024-encore
+
Unsupervised stance detection for social media discussions: A generic baseline
@@ -1268,6 +1368,7 @@
With the ever-growing use of social media to express opinions on the national and international stage, unsupervised methods of stance detection are increasingly important to handle the task without costly annotation of data. The current unsupervised state-of-the-art models are designed for specific network types, either homophilic or heterophilic, and they fail to generalize to both. In this paper, we first analyze the generalization ability of recent baselines to these two very different network types. Then, we conduct extensive experiments with a baseline model based on text embeddings propagated with a graph neural network that generalizes well to heterophilic and homophilic networks. We show that it outperforms, on average, other state-of-the-art methods across the two network types. Additionally, we show that combining textual and network information outperforms using text only, and that the language model size has only a limited impact on the model performance.
2024.eacl-long.107
sutter-etal-2024-unsupervised
+
Putting Context in Context: the Impact of Discussion Structure on Text Classification
@@ -1282,6 +1383,7 @@
2024.eacl-long.108.software.zip
2024.eacl-long.108.note.zip
penzo-etal-2024-putting
+
Aligning Large and Small Language Models via Chain-of-Thought Reasoning
@@ -1301,6 +1403,7 @@
Multilingual Machine Translation (MMT) benefits from knowledge transfer across different language pairs. However, improvements in one-to-many translation compared to many-to-one translation are only marginal and sometimes even negligible. This performance discrepancy raises the question of to what extent positive transfer plays a role on the target-side for one-to-many MT. In this paper, we conduct a large-scale study that varies the auxiliary target-side languages along two dimensions, i.e., linguistic similarity and corpus size, to show the dynamic impact of knowledge transfer on the main language pairs. We show that linguistically similar auxiliary target languages exhibit strong ability to transfer positive knowledge. With an increasing size of similar target languages, the positive transfer is further enhanced to benefit the main language pairs. Meanwhile, we find distant auxiliary target languages can also unexpectedly benefit main language pairs, even with minimal positive transfer ability. Apart from transfer, we show distant auxiliary target languages can act as a regularizer to benefit translation performance by enhancing the generalization and model inference calibration.
2024.eacl-long.110
meng-monz-2024-disentangling
+
Uncovering Stereotypes in Large Language Models: A Task Complexity-based Approach
@@ -1314,6 +1417,7 @@
2024.eacl-long.111.software.zip
2024.eacl-long.111.note.zip
shrawgi-etal-2024-uncovering
+
Rainbow - A Benchmark for Systematic Testing of How Sensitive Visio-Linguistic Models are to Color Naming
@@ -1324,6 +1428,7 @@
With the recent emergence of powerful visio-linguistic models comes the question of how fine-grained their multi-modal understanding is. This has led to the release of several probing datasets. Results point towards models having trouble with prepositions and verbs, but being relatively robust when it comes to color. To gauge how deep this understanding goes, we compile a comprehensive probing dataset to systematically test multi-modal alignment around color. We demonstrate how human perception influences descriptions of color and pay special attention to the extent to which this is reflected within the predictions of a visio-linguistic model. Probing a set of models with diverse properties with our benchmark confirms the superiority of models that do not rely on pre-extracted image features, and demonstrates that augmentation with too much noisy pre-training data can produce an inferior model. While the benchmark remains challenging for all models we test, the overall result pattern suggests well-founded alignment of color terms with hues. Analyses do however reveal uncertainty regarding the boundaries between neighboring color terms.
2024.eacl-long.112
bexte-etal-2024-rainbow
+
CATfOOD: Counterfactual Augmented Training for Improving Out-of-Domain Performance and Calibration
@@ -1335,6 +1440,7 @@
2024.eacl-long.113
2024.eacl-long.113.software.zip
sachdeva-etal-2024-catfood
+
UP5: Unbiased Foundation Model for Fairness-aware Recommendation
@@ -1358,6 +1464,7 @@
Past work in NLP has proposed the task of classifying English verb phrases into situation aspect categories, assuming that these categories play an important role in tasks requiring temporal reasoning. We investigate this assumption by gathering crowd-sourced judgements about aspectual entailments from non-expert, native English participants. The results suggest that aspectual class alone is not sufficient to explain the response patterns of the participants. We propose that looking at scenarios which can feasibly accompany an action description contributes towards a better explanation of the participants’ answers. A further experiment using GPT-3.5 shows that its outputs follow different patterns than human answers, suggesting that such conceivable scenarios cannot be fully accounted for in the language alone. We release our dataset to support further research.
2024.eacl-long.115
prus-etal-2024-human
+
It is not True that Transformers are Inductive Learners: Probing NLI Models with External Negation
@@ -1366,6 +1473,7 @@
NLI tasks necessitate a substantial degree of logical reasoning; as such, the remarkable performance of SoTA transformers on these tasks may lead us to believe that those models have learned to reason logically. The results presented in this paper demonstrate that (i) models fine-tuned on NLI datasets learn to treat external negation as a distractor, effectively ignoring its presence in hypothesis sentences; (ii) several near-SoTA encoder and encoder-decoder transformer models fail to inductively learn the law of the excluded middle for a single external negation prefix with respect to NLI tasks, despite extensive fine-tuning; (iii) those models which are able to learn the law of the excluded middle for a single prefix are unable to generalize this pattern to similar prefixes. Given the critical role of negation in logical reasoning, we may conclude from these findings that transformers do not learn to reason logically when fine-tuned for NLI tasks. Furthermore, these results suggest that transformers may not be able to inductively learn the role of negation with respect to NLI tasks, calling into question their capacity to fully acquire logical reasoning abilities.
2024.eacl-long.116
sullivan-2024-true
+
Polarized Opinion Detection Improves the Detection of Toxic Language
@@ -1376,6 +1484,7 @@
2024.eacl-long.117
2024.eacl-long.117.software.zip
pavlopoulos-likas-2024-polarized
+
Improving Acoustic Word Embeddings through Correspondence Training of Self-supervised Speech Representations
@@ -1386,6 +1495,7 @@
2024.eacl-long.118
2024.eacl-long.118.note.zip
meghanani-hain-2024-improving
+
Investigating Agency of LLMs in Human-AI Collaboration Tasks
@@ -1400,6 +1510,7 @@
2024.eacl-long.119
2024.eacl-long.119.note.zip
sharma-etal-2024-investigating
+
SynthDST: Synthetic Data is All You Need for Few-Shot Dialog State Tracking
@@ -1425,6 +1536,7 @@
Argument Mining (AM) aims to uncover the argumentative structures within a text. Previous methods require several subtasks, such as span identification, component classification, and relation classification. Consequently, these methods need rule-based postprocessing to derive argumentative structures from the output of each subtask. This approach adds to the complexity of the model and expands the search space of the hyperparameters. To address this difficulty, we propose a simple yet strong method based on a text-to-text generation approach using a pretrained encoder-decoder language model. Our method simultaneously generates argumentatively annotated text for spans, components, and relations, eliminating the need for task-specific postprocessing and hyperparameter tuning. Furthermore, because it is a straightforward text-to-text generation method, we can easily adapt our approach to various types of argumentative structures. Experimental results demonstrate the effectiveness of our method, as it achieves state-of-the-art performance on three different types of benchmark datasets: the Argument-annotated Essays Corpus (AAEC), AbstRCT, and the Cornell eRulemaking Corpus (CDCP).
2024.eacl-long.121
kawarada-etal-2024-argument
+
Answering legal questions from laymen in German civil law system
@@ -1436,6 +1548,7 @@
2024.eacl-long.122.software.zip
2024.eacl-long.122.note.zip
buttner-habernal-2024-answering
+
An Empirical Analysis of Diversity in Argument Summarization
@@ -1448,6 +1561,7 @@
2024.eacl-long.123
2024.eacl-long.123.software.zip
van-der-meer-etal-2024-empirical
+
What Makes Medical Claims (Un)Verifiable? Analyzing Entity and Relation Properties for Fact Verification
@@ -1460,6 +1574,7 @@
2024.eacl-long.124
2024.eacl-long.124.note.zip
wuehrl-etal-2024-makes
+
Approximate Attributions for Off-the-Shelf Siamese Transformers
@@ -1471,6 +1586,7 @@
2024.eacl-long.125
2024.eacl-long.125.software.zip
moeller-etal-2024-approximate
+
Describing Images Fast and Slow: Quantifying and Predicting the Variation in Human Signals during Visuo-Linguistic Processes
@@ -1481,6 +1597,7 @@
There is an intricate relation between the properties of an image and how humans behave while describing the image. This behavior shows ample variation, as manifested in human signals such as eye movements and when humans start to describe the image. Despite the value of such signals of visuo-linguistic variation, they are virtually disregarded in the training of current pretrained models, which motivates further investigation. Using a corpus of Dutch image descriptions with concurrently collected eye-tracking data, we explore the nature of the variation in visuo-linguistic signals, and find that they correlate with each other. Given this result, we hypothesize that variation stems partly from the properties of the images, and explore whether image representations encoded by pretrained vision encoders can capture such variation. Our results indicate that pretrained models do so to a weak-to-moderate degree, suggesting that the models lack biases about what makes a stimulus complex for humans and what leads to variations in human outputs.
2024.eacl-long.126
takmaz-etal-2024-describing
+
Tracing the Roots of Facts in Multilingual Language Models: Independent, Shared, and Transferred Knowledge
@@ -1491,6 +1608,7 @@
Acquiring factual knowledge for language models (LMs) in low-resource languages poses a serious challenge, motivating cross-lingual transfer in multilingual LMs (ML-LMs). In this study, we ask how ML-LMs acquire and represent factual knowledge. Using the multilingual factual knowledge probing dataset, mLAMA, we first conducted a neuron investigation of ML-LMs (specifically, multilingual BERT). We then traced the roots of facts back to the knowledge source (Wikipedia) to identify the ways in which ML-LMs acquire specific facts. We finally identified three patterns of acquiring and representing facts in ML-LMs: language-independent, cross-lingual shared, and transferred, and devised methods for differentiating them. Our findings highlight the challenge of maintaining consistent factual knowledge across languages, underscoring the need for better fact representation learning in ML-LMs.
2024.eacl-long.127
zhao-etal-2024-tracing
+
Comparing Knowledge Sources for Open-Domain Scientific Claim Verification
@@ -1510,6 +1628,7 @@
Uncertainty estimation is an important diagnostic tool for statistical models, and is often used to assess the confidence of model predictions. Previous work shows that neural machine translation (NMT) is an intrinsically uncertain task where there are often multiple correct and semantically equivalent translations, and that well-trained NMT models produce good translations despite spreading probability mass among many semantically similar translations. These findings suggest that popular measures of uncertainty based on token- and sequence-level entropies which measure surface form diversity may not be good proxies of the more useful quantity of interest, semantic diversity. We propose to adapt similarity-sensitive Shannon entropy (S3E), a concept borrowed from theoretical ecology, for NMT. By demonstrating significantly improved correlation between S3E and task performance on quality estimation and named entity recall, we show that S3E is a useful framework for measuring uncertainty in NMT.
2024.eacl-long.129
cheng-vlachos-2024-measuring
+
LegalLens: Leveraging LLMs for Legal Violation Identification in Unstructured Text
@@ -1525,6 +1644,7 @@
In this study, we focus on two main tasks, the first for detecting legal violations within unstructured textual data, and the second for associating these violations with potentially affected individuals. We constructed two datasets using Large Language Models (LLMs) which were subsequently validated by domain expert annotators. Both tasks were designed specifically for the context of class-action cases. The experimental design incorporated fine-tuning models from the BERT family and open-source LLMs, and conducting few-shot experiments using closed-source LLMs. Our results, with an F1-score of 62.69% (violation identification) and 81.02% (associating victims), show that our datasets and setups can be used for both tasks. Finally, we publicly release the datasets and the code used for the experiments in order to advance further research in the area of legal natural language processing (NLP).
2024.eacl-long.130
bernsohn-etal-2024-legallens
+
μPLAN: Summarizing using a Content Plan as Cross-Lingual Bridge
@@ -1540,6 +1660,7 @@
Cross-lingual summarization aims to generate a summary in one language given input in a different language, allowing for the dissemination of relevant content among different language speaking populations. The task is challenging mainly due to the paucity of cross-lingual datasets and the compounded difficulty of summarizing and translating. This work presents μPLAN, an approach to cross-lingual summarization that uses an intermediate planning step as a cross-lingual bridge. We formulate the plan as a sequence of entities capturing the summary’s content and the order in which it should be communicated. Importantly, our plans abstract from surface form: using a multilingual knowledge base, we align entities to their canonical designation across languages and generate the summary conditioned on this cross-lingual bridge and the input. Automatic and human evaluation on the XWikis dataset (across four language pairs) demonstrates that our planning objective achieves state-of-the-art performance in terms of informativeness and faithfulness. Moreover, μPLAN models improve the zero-shot transfer to new cross-lingual language pairs compared to baselines without a planning component.
2024.eacl-long.131
huot-etal-2024-mplan
+
Exploring Data Augmentation in Neural DRS-to-Text Generation
@@ -1550,6 +1671,7 @@
Neural networks are notoriously data-hungry. This represents an issue in cases where data are scarce such as in low-resource languages. Data augmentation is a technique commonly used in computer vision to provide neural networks with more data and increase their generalization power. When dealing with data augmentation for natural language, however, simple data augmentation techniques similar to the ones used in computer vision such as rotation and cropping cannot be employed because they would generate ungrammatical texts. Thus, data augmentation needs a specific design in the case of neural logic-to-text systems, especially for a structurally rich input format such as the ones used for meaning representation. This is the case of the neural natural language generation for Discourse Representation Structures (DRS-to-Text), where the logical nature of DRS needs a specific design of data augmentation. In this paper, we adopt a novel approach in DRS-to-Text to selectively augment a training set with new data by adding and varying two specific lexical categories, i.e. proper and common nouns. In particular, we propose using WordNet supersenses to produce new training sentences using both in-and-out-of-context nouns. We present a number of experiments for evaluating the role played by augmented lexical information. The experimental results prove the effectiveness of our approach for data augmentation in DRS-to-Text generation.
2024.eacl-long.132
amin-etal-2024-exploring
+
Think Twice: Measuring the Efficiency of Eliminating Prediction Shortcuts of Question Answering Models
@@ -1562,6 +1684,7 @@
2024.eacl-long.133
2024.eacl-long.133.software.zip
mikula-etal-2024-think
+
Improving Contrastive Learning in Emotion Recognition in Conversation via Data Augmentation and Decoupled Neutral Emotion
@@ -1571,6 +1694,7 @@
Emotion recognition in conversation (ERC) has attracted much attention due to its wide applications. While consistent improvement is being made in this area, an inevitable challenge comes from the dataset. ERC datasets exhibit significantly imbalanced emotion distributions. While utterances with neutral emotion predominate in the data, this emotion label is always treated the same as other emotion labels in current approaches. To address this problem, we propose a supervised contrastive learning approach specifically oriented to the ERC task. We employ a novel data augmentation method emulating the emotion dynamics in a conversation and formulate a supervised contrastive learning method tailored for ERC that addresses the predominance and ambiguity of neutral emotion. Experimental results on four benchmark datasets demonstrate the effectiveness of our approach.
2024.eacl-long.134
kang-cho-2024-improving
+
CroCoAlign: A Cross-Lingual, Context-Aware and Fully-Neural Sentence Alignment System for Long Texts
@@ -1585,6 +1709,7 @@
molfese-etal-2024-neuralign
Update CroCoAlign.
+
Explaining Speech Classification Models via Word-Level Audio Segments and Paralinguistic Features
@@ -1597,6 +1722,7 @@
Predictive models make mistakes and have biases. To combat both, we need to understand their predictions. Explainable AI (XAI) provides insights into models for vision, language, and tabular data. However, only a few approaches exist for speech classification models. Previous works focus on a selection of spoken language understanding (SLU) tasks, and most users find their explanations challenging to interpret. We propose a novel approach to explain speech classification models. It provides two types of insights. (i) Word-level: we measure the impact of each audio segment aligned with a word on the outcome. (ii) Paralinguistic: we evaluate how non-linguistic features (e.g., prosody and background noise) affect the outcome if perturbed. We validate our approach by explaining two state-of-the-art SLU models on two tasks in English and Italian. We test their plausibility with human subject ratings. Our results show that the explanations correctly represent the model’s inner workings and are plausible to humans.
2024.eacl-long.136
pastor-etal-2024-explaining
+
Zero-Shot End-to-End Spoken Language Understanding via Cross-Modal Selective Self-Training
@@ -1611,6 +1737,7 @@
2024.eacl-long.137.software.zip
2024.eacl-long.137.note.zip
he-etal-2024-zero
+
Clever Hans or Neural Theory of Mind? Stress Testing Social Reasoning in Large Language Models
@@ -1626,6 +1753,7 @@
The escalating debate on AI’s capabilities warrants developing reliable metrics to assess machine “intelligence.” Recently, many anecdotal examples were used to suggest that newer Large Language Models (LLMs) like ChatGPT and GPT-4 exhibit Neural Theory-of-Mind (N-ToM); however, prior work reached conflicting conclusions regarding those abilities. We investigate the extent of LLMs’ N-ToM through an extensive evaluation of 6 tasks and find that while LLMs exhibit certain N-ToM abilities, this behavior is far from being robust. We further examine the factors impacting performance on N-ToM tasks and discover that LLMs struggle with adversarial examples, indicating reliance on shallow heuristics rather than robust ToM abilities. We caution against drawing conclusions from anecdotal examples, limited benchmark testing, and using human-designed psychological tests to evaluate models.
2024.eacl-long.138
shapira-etal-2024-clever
+
NevIR: Negation in Neural Information Retrieval
@@ -1637,6 +1765,7 @@
2024.eacl-long.139
2024.eacl-long.139.note.zip
weller-etal-2024-nevir
+
“According to . . . ”: Prompting Language Models Improves Quoting from Pre-Training Data
@@ -1650,6 +1779,7 @@
Large Language Models (LLMs) may hallucinate and generate fake information, despite pre-training on factual data. Inspired by the journalistic device of “according to sources”, we propose according-to prompting: directing LLMs to ground responses against previously observed text. To quantify this grounding, we propose a novel evaluation metric (QUIP-Score) that measures the extent to which model-produced answers are directly found in underlying text corpora. We illustrate with experiments on three corpora (Wikipedia, PubMed, and the U.S. legal tax code) that these prompts improve grounding under our metrics, with the additional benefit of often improving end-task performance. Furthermore, prompts that ask the model to decrease grounding (or to ground to other corpora) indeed decrease QUIP-Score, indicating the ability of LLMs to increase or decrease grounded generations on request.
2024.eacl-long.140
weller-etal-2024-according
+
Accurate and Well-Calibrated ICD Code Assignment Through Attention Over Diverse Label Embeddings
@@ -1660,6 +1790,7 @@
Although the International Classification of Diseases (ICD) has been adopted worldwide, manually assigning ICD codes to clinical text is time-consuming, error-prone, and expensive, motivating the development of automated approaches. This paper describes a novel approach for automated ICD coding, combining several ideas from previous related work. We specifically employ a strong Transformer-based model as a text encoder and, to handle lengthy clinical narratives, we explored either (a) adapting the base encoder model into a Longformer, or (b) dividing the text into chunks and processing each chunk independently. The representations produced by the encoder are combined with a label embedding mechanism that explores diverse ICD code synonyms. Experiments with different splits of the MIMIC-III dataset show that the proposed approach outperforms the current state-of-the-art models in ICD coding, with the label embeddings significantly contributing to the good performance. Our approach also leads to properly calibrated classification results, which can effectively inform downstream tasks such as quantification.
2024.eacl-long.141
gomes-etal-2024-accurate
+
Investigating Content Planning for Navigating Trade-offs in Knowledge-Grounded Dialogue
@@ -1671,6 +1802,7 @@
Knowledge-grounded dialogue generation is a challenging task because it requires satisfying two fundamental, yet often competing constraints: being responsive in a manner that is specific to what the conversation partner has said while also being attributable to an underlying source document. In this work, we bring this trade-off between these two objectives (specificity and attribution) to light, and ask the question: Can explicit content planning before the response generation help the model to address this challenge? To answer this question, we design a framework called PLEDGE, which allows us to experiment with various plan variables explored in prior work supporting both metric-agnostic and metric-aware approaches. While content planning shows promise, our results on whether it can actually help to navigate this trade-off are mixed – planning mechanisms that are metric-aware (use automatic metrics during training) are better at automatic evaluations but underperform in human judgment compared to metric-agnostic mechanisms. We discuss how this may be caused by over-fitting to automatic metrics, and the need for future work to better calibrate these metrics towards human judgment. We hope the observations from our analysis will inform future work that aims to apply content planning in this context.
2024.eacl-long.142
chawla-etal-2024-investigating
+
SPUQ: Perturbation-Based Uncertainty Quantification for Large Language Models
@@ -1696,6 +1828,7 @@
Diffusion models have emerged as a powerful paradigm for generation, obtaining strong performance in various continuous domains. However, applying continuous diffusion models to natural language remains challenging due to its discrete nature and the need for a large number of diffusion steps to generate text, making diffusion-based generation expensive. In this work, we propose Text-to-text Self-conditioned Simplex Diffusion (TESS), a text diffusion model that is fully non-autoregressive, employs a new form of self-conditioning, and applies the diffusion process on the logit simplex space rather than the learned embedding space. Through extensive experiments on natural language understanding and generation tasks including summarization, text simplification, paraphrase generation, and question generation, we demonstrate that TESS outperforms state-of-the-art non-autoregressive models, requires fewer diffusion steps with minimal drop in performance, and is competitive with pretrained autoregressive sequence-to-sequence models.
2024.eacl-long.144
karimi-mahabadi-etal-2024-tess
+
Advancing Precise Outline-Conditioned Text Generation with Task Duality and Explicit Outline Control
@@ -1723,6 +1856,7 @@
2024.eacl-long.146
2024.eacl-long.146.software.zip
li-etal-2024-localization
+
Creating Suspenseful Stories: Iterative Planning with Large Language Models
@@ -1732,6 +1866,7 @@
Automated story generation has been one of the long-standing challenges in NLP. Among all dimensions of stories, *suspense* is very common in human-written stories but relatively under-explored in AI-generated stories. While recent advances in large language models (LLMs) have greatly promoted language generation in general, state-of-the-art LLMs are still unreliable when it comes to suspenseful story generation. We propose a novel iterative-prompting-based planning method that is grounded in two theoretical foundations of story suspense from cognitive psychology and narratology. This theory-grounded method works in a fully zero-shot manner and does not rely on any supervised story corpora. To the best of our knowledge, this paper is the first attempt at suspenseful story generation with LLMs. Extensive human evaluations of the generated suspenseful stories demonstrate the effectiveness of our method.
2024.eacl-long.147
xie-riedl-2024-creating
+
Few-Shot Dialogue Summarization via Skeleton-Assisted Prompt Transfer in Prompt Tuning
@@ -1748,6 +1883,7 @@
In real-world scenarios, labeled samples for dialogue summarization are usually limited (i.e., few-shot) due to high annotation costs for high-quality dialogue summaries. To efficiently learn from few-shot samples, previous works have utilized massive annotated data from other downstream tasks and then performed prompt transfer in prompt tuning so as to enable cross-task knowledge transfer. However, existing general-purpose prompt transfer techniques lack consideration for dialogue-specific information. In this paper, we focus on improving the prompt transfer from dialogue state tracking to dialogue summarization and propose Skeleton-Assisted Prompt Transfer (SAPT), which leverages skeleton generation as extra supervision that functions as a medium connecting the distinct source and target task and resulting in the model’s better consumption of dialogue state information. To automatically extract dialogue skeletons as supervised training data for skeleton generation, we design a novel approach with perturbation-based probes requiring neither annotation effort nor domain knowledge. Training the model on such skeletons can also help preserve model capability during prompt transfer. Our method significantly outperforms existing baselines. In-depth analyses demonstrate the effectiveness of our method in facilitating cross-task knowledge transfer in few-shot dialogue summarization.
2024.eacl-long.148
xie-etal-2024-shot
+
Ask, Assess, and Refine: Rectifying Factual Consistency and Hallucination in LLMs with Metric-Guided Feedback Learning
@@ -1761,6 +1897,7 @@
2024.eacl-long.149.software.zip
2024.eacl-long.149.note.zip
lee-etal-2024-ask
+
Effective Controllable Bias Mitigation for Classification and Retrieval using Gate Adapters
@@ -1772,6 +1909,7 @@
Bias mitigation of Language Models has been the topic of many studies with a recent focus on learning separate modules like adapters for on-demand debiasing. Besides optimizing for a modularized debiased model, it is often critical in practice to control the degree of bias reduction at inference time, e.g., in order to tune for a desired performance-fairness trade-off in search results or to control the strength of debiasing in classification tasks. In this paper, we introduce Controllable Gate Adapter (ConGater), a novel modular gating mechanism with adjustable sensitivity parameters, which allows for a gradual transition from the biased state of the model to the fully debiased version at inference time. We demonstrate ConGater performance by (1) conducting adversarial debiasing experiments with three different models on three classification tasks with four protected attributes, and (2) reducing the bias of search results through fairness list-wise regularization to enable adjusting a trade-off between performance and fairness metrics. Our experiments on the classification tasks show that compared to baselines of the same caliber, ConGater can maintain higher task performance while containing less information regarding the attributes. Our results on the retrieval task show that the fully debiased ConGater can achieve the same fairness performance while maintaining more than twice the task performance of recent strong baselines. Overall, besides strong performance, ConGater enables the continuous transitioning between biased and debiased states of models, enhancing personalization of use and interpretability through controllability.
2024.eacl-long.150
masoudian-etal-2024-effective
+
STable: Table Generation Framework for Encoder-Decoder Models
@@ -1788,6 +1926,7 @@
2024.eacl-long.151
2024.eacl-long.151.software.zip
pietruszka-etal-2024-stable
+
A RelEntLess Benchmark for Modelling Graded Relations between Named Entities
@@ -1799,6 +1938,7 @@
2024.eacl-long.152
2024.eacl-long.152.note.zip
ushio-etal-2024-relentless
+
A Multimodal Framework to Detect Target Aware Aggression in Memes
@@ -1812,6 +1952,7 @@
Internet memes have gained immense traction as a medium for individuals to convey emotions, thoughts, and perspectives on social media. While memes often serve as sources of humor and entertainment, they can also propagate offensive, incendiary, or harmful content, deliberately targeting specific individuals or communities. Identifying such memes is challenging because of their satirical and cryptic characteristics. Most contemporary research on memes’ detrimental facets is skewed towards high-resource languages, often sidelining the unique challenges tied to low-resource languages, such as Bengali. To facilitate this research in low-resource languages, this paper presents a novel dataset MIMOSA (MultIMOdal aggreSsion dAtaset) in Bengali. MIMOSA encompasses 4,848 annotated memes across five aggression target categories: Political, Gender, Religious, Others, and non-aggressive. We also propose MAF (Multimodal Attentive Fusion), a simple yet effective approach that uses multimodal context to detect the aggression targets. MAF captures the selective modality-specific features of the input meme and jointly evaluates them with individual modality features. Experiments on MIMOSA exhibit that the proposed method outperforms several state-of-the-art rivaling approaches. Our code and data are available at https://github.com/shawlyahsan/Bengali-Aggression-Memes.
2024.eacl-long.153
ahsan-etal-2024-multimodal
+
Graph Guided Question Answer Generation for Procedural Question-Answering
@@ -1829,6 +1970,7 @@
2024.eacl-long.154
2024.eacl-long.154.note.zip
pham-etal-2024-graph
+
Contrastive Decoding Reduces Hallucinations in Large Multilingual Machine Translation Models
@@ -1850,6 +1992,7 @@
Misinformation and disinformation phenomena existed long before the advent of digital technologies. The exponential use of social media platforms, whose information feeds have created the conditions for many-to-many communication and instant amplification of the news, has accelerated the diffusion of inaccurate and misleading information. As a result, the identification of claims has emerged as a pivotal technology for combating the influence of misinformation and disinformation within news media. Most existing work has concentrated on claim analysis at the sentence level, neglecting the crucial exploration of supplementary attributes such as the claimer and the claim object, or confining the analysis to a predefined list of topics. Furthermore, previous research has been mostly centered around political debates, Wikipedia articles, and COVID-19 related content. By leveraging the advanced capabilities of Large Language Models (LLMs) in Natural Language Understanding (NLU) and text generation, we propose a novel architecture utilizing LLMs finetuned with LoRA to transform the claim, claimer and claim object detection task into a Question Answering (QA) setting. We evaluate our approach on a dataset of 867 scientific news articles from 3 domains (Health, Climate Change, Nutrition) (HCN), which are human annotated with the major claim, the claimer and the object of the major claim. We also evaluate our proposed model on the NEWSCLAIMS benchmark dataset. Experimental and qualitative results showcase the effectiveness of the proposed approach. We make our dataset publicly available to encourage further research.
2024.eacl-long.156
kotitsas-etal-2024-leveraging
+
Should I try multiple optimizers when fine-tuning a pre-trained Transformer for NLP tasks? Should I tune their hyperparameters?
@@ -1861,6 +2004,7 @@
NLP research has explored different neural model architectures and sizes, datasets, training objectives, and transfer learning techniques. However, the choice of optimizer during training has not been explored as extensively. Typically, some variant of Stochastic Gradient Descent (SGD) is employed, selected among numerous variants, using unclear criteria, often with minimal or no tuning of the optimizer’s hyperparameters. Experimenting with five GLUE datasets, two models (DistilBERT and DistilRoBERTa), and seven popular optimizers (SGD, SGD with Momentum, Adam, AdaMax, Nadam, AdamW, and AdaBound), we find that when the hyperparameters of the optimizers are tuned, there is no substantial difference in test performance across the five more elaborate (adaptive) optimizers, despite differences in training loss. Furthermore, tuning just the learning rate is in most cases as good as tuning all the hyperparameters. Hence, we recommend picking any of the best-behaved adaptive optimizers (e.g., Adam) and tuning only its learning rate. When no hyperparameter can be tuned, SGD with Momentum is the best choice.
2024.eacl-long.157
gkouti-etal-2024-try
+
GUMsley: Evaluating Entity Salience in Summarization for 12 English Genres
@@ -1870,6 +2014,7 @@
As NLP models become increasingly capable of understanding documents in terms of coherent entities rather than strings, obtaining the most salient entities for each document is not only an important end task in itself but also vital for Information Retrieval (IR) and other downstream applications such as controllable summarization. In this paper, we present and evaluate GUMsley, the first entity salience dataset covering all named and non-named salient entities for 12 genres of English text, aligned with entity types, Wikification links and full coreference resolution annotations. We promote a strict definition of salience using human summaries and demonstrate high inter-annotator agreement for salience based on whether a source entity is mentioned in the summary. Our evaluation shows poor performance by pre-trained SOTA summarization models and zero-shot LLM prompting in capturing salient entities in generated summaries. We also show that predicting or providing salient entities to several model architectures enhances performance and helps derive higher-quality summaries by alleviating the entity hallucination problem in existing abstractive summarization.
2024.eacl-long.158
lin-zeldes-2024-gumsley
+
Sensitivity, Performance, Robustness: Deconstructing the Effect of Sociodemographic Prompting
@@ -1882,6 +2027,7 @@
2024.eacl-long.159
beck-etal-2024-sensitivity
Social Impact Award
+
Threat Behavior Textual Search by Attention Graph Isomorphism
@@ -1893,6 +2039,7 @@
Cyber attacks cause over $1 trillion loss every year. An important task for cyber security analysts is attack forensics. It entails understanding malware behaviors and attack origins. However, existing automated or manual malware analysis can only disclose a subset of behaviors due to inherent difficulties (e.g., malware cloaking and obfuscation). As such, analysts often resort to text search techniques to identify existing malware reports based on the symptoms they observe, exploiting the fact that malware samples share a lot of similarity, especially those from the same origin. In this paper, we propose a novel malware behavior search technique that is based on graph isomorphism at the attention layers of Transformer models. We also compose a large dataset collected from various agencies to facilitate such research. Our technique outperforms state-of-the-art methods, such as those based on sentence embeddings and keywords, by 6-14%. In a case study of 10 real-world malware samples, our technique can correctly attribute 8 of them to their ground truth origins, while using Google only works for 3 cases.
2024.eacl-long.160
bae-etal-2024-threat
+
Identifying Narrative Content in Podcast Transcripts
@@ -1903,6 +2050,7 @@
As one of the oldest forms of human communication, narratives appear across a variety of genres and media. Computational methods have been applied to study narrativity in novels, social media, and patient records, leading to new approaches and insights. However, other types of media are growing in popularity, like podcasts. Podcasts contain a multitude of spoken narratives that can provide a meaningful glimpse into how people share stories with one another. In this paper, we outline and apply methods to process English-language podcast transcripts and extract narrative content from conversations within each episode. We provide an initial analysis of the types of narrative content that exists within a wide range of podcasts, and compare our results to other established narrative analysis tools. Our annotations for narrativity and pretrained models can help to enable future research into narrativity within a large corpus of approximately 100,000 podcast episodes.
2024.eacl-long.161
abdessamed-etal-2024-identifying
+
Frequency Explains the Inverse Correlation of Large Language Models’ Size, Training Data Amount, and Surprisal’s Fit to Reading Times
@@ -1913,6 +2061,7 @@
Recent studies have shown that as Transformer-based language models become larger and are trained on very large amounts of data, the fit of their surprisal estimates to naturalistic human reading times degrades. The current work presents a series of analyses showing that word frequency is a key explanatory factor underlying these two trends. First, residual errors from four language model families on four corpora show that the inverse correlation between model size and fit to reading times is the strongest on the subset of least frequent words, which is driven by excessively accurate predictions of larger model variants. Additionally, training dynamics reveal that during later training steps, all model variants learn to predict rare words and that larger model variants do so more accurately, which explains the detrimental effect of both training data amount and model size on fit to reading times. Finally, a feature attribution analysis demonstrates that larger model variants are able to accurately predict rare words based on both an effectively longer context window size as well as stronger local associations compared to smaller model variants. Taken together, these results indicate that Transformer-based language models’ surprisal estimates diverge from human-like expectations due to the superhumanly complex associations they learn for predicting rare words.
2024.eacl-long.162
oh-etal-2024-frequency
+
Presentations by the Humans and For the Humans: Harnessing LLMs for Generating Persona-Aware Slides from Documents
@@ -1926,6 +2075,7 @@
Scientific papers and slides are two different representations of the same underlying information, but both require substantial work to prepare. While there have been prior efforts on automating document-to-slides generation, there is still a pressing need for customizing the presentation of content to align with the persona of the target audience or the duration of the presentation. This paper first introduces the concept of end-user specification-aware document-to-slides conversion that incorporates end-user specifications into the conversion process. For this, we introduce a new dataset that reuses the existing SciDuet dataset, consisting of pairs of papers and corresponding slide decks from recent years' *ACL conferences, to create four persona-aware configurations. Secondly, we present Persona-Aware-D2S, a novel approach that finetunes LLMs using target audience feedback to create persona-aware slides from scientific documents. Our evaluation on both automated metrics and qualitative human evaluation suggests that by incorporating end-user specifications into the conversion process, our model can create presentations that are not only informative but also tailored to the expectations and cognitive abilities of the target audience.
2024.eacl-long.163
mondal-etal-2024-presentations
+
ToPro: Token-Level Prompt Decomposition for Cross-Lingual Sequence Labeling Tasks
@@ -1942,6 +2092,7 @@
ma-etal-2024-topro
Minor updates.
+
Small Language Models Improve Giants by Rewriting Their Outputs
@@ -1955,6 +2106,7 @@
Despite the impressive performance of large language models (LLMs), they often lag behind specialized models in various tasks. LLMs only use a fraction of the existing training data for in-context learning, while task-specific models harness the full dataset for fine-tuning. In this work, we tackle the problem of leveraging training data to improve the performance of LLMs without fine-tuning. Our approach directly targets LLM predictions without requiring access to their weights. We create a pool of candidates from the LLM through few-shot prompting and we employ a compact model, the LM-corrector (LMCor), specifically trained to merge these candidates to produce an enhanced output. Our experiments on four natural language generation tasks demonstrate that even a small LMCor model (250M) substantially improves the few-shot performance of LLMs (62B), matching and even outperforming standard fine-tuning. Furthermore, we illustrate the robustness of LMCor against different prompts, thereby minimizing the need for extensive prompt engineering. Finally, we show that LMCor can be seamlessly integrated with different LLMs at inference, serving as a plug-and-play module to improve their performance.
2024.eacl-long.165
vernikos-etal-2024-small
+
Unintended Bias Detection and Mitigation in Misogynous Memes
@@ -1980,6 +2132,7 @@
We explore how weak supervision on abundant unlabeled data can be leveraged to improve few-shot performance in aspect-based sentiment analysis (ABSA) tasks. We propose a pipeline approach to construct a noisy ABSA dataset, and we use it to adapt a pre-trained sequence-to-sequence model to the ABSA tasks. We test the resulting model on three widely used ABSA datasets, before and after fine-tuning. Our proposed method preserves the full fine-tuning performance while showing significant improvements (15.84 absolute F1) in the few-shot learning scenario for the harder tasks. In zero-shot (i.e., without fine-tuning), our method outperforms the previous state of the art on the aspect extraction and sentiment classification (AESC) task and is, additionally, capable of performing the harder aspect sentiment triplet extraction (ASTE) task.
2024.eacl-long.167
vacareanu-etal-2024-weak
+
Counterfactual Reasoning with Knowledge Graph Embeddings
@@ -1992,6 +2145,7 @@
2024.eacl-long.168.software.zip
2024.eacl-long.168.note.zip
zellinger-etal-2024-counterfactual
+
System-Level Natural Language Feedback
@@ -2002,6 +2156,7 @@
Natural language (NL) feedback offers rich insights into user experience. While existing studies focus on an instance-level approach, where feedback is used to refine specific examples, we introduce a framework for system-level use of NL feedback. We show how to use feedback to formalize system-level design decisions in a human-in-the-loop process in order to produce better models. In particular, this is done through: (i) metric design for tasks; and (ii) language model prompt design for refining model responses. We conduct two case studies of this approach for improving search query and dialog response generation, demonstrating the effectiveness of system-level feedback. We show the combination of system-level and instance-level feedback brings further gains, and that human written instance-level feedback results in more grounded refinements than GPT-3.5 written ones, underscoring the importance of human feedback for building systems.
2024.eacl-long.169
yuan-etal-2024-system
+
Syntactic Preposing and Discourse Relations
@@ -2012,6 +2167,7 @@
Over 15 years ago, Ward & Birner (2006) suggested that non-canonical constructions in English can serve both to mark information status and to structure the information flow of discourse. One such construction is preposing, where a phrasal constituent appears to the left of its canonical position, typically sentence-initially. But computational work on discourse has, to date, ignored non-canonical syntax. We take account of non-canonical syntax by providing quantitative evidence relating NP/PP preposing to discourse relations. The evidence comes from an LLM mask-filling task that compares the predictions when a mask is inserted between the arguments of an implicit inter-sentential discourse relation — first, when the right-hand argument (Arg2) starts with a preposed constituent, and again, when that constituent is in canonical (post-verbal) position. Results show that (1) the top-ranked mask-fillers in the preposed case agree more often with “gold” annotations in the Penn Discourse TreeBank than they do in the latter case, and (2) preposing in Arg2 can affect the distribution of discourse-relational senses.
2024.eacl-long.170
dong-etal-2024-syntactic
+
Can we obtain significant success in RST discourse parsing by using Large Language Models?
@@ -2023,6 +2179,7 @@
Recently, decoder-only pre-trained large language models (LLMs), with several tens of billions of parameters, have significantly impacted a wide range of natural language processing (NLP) tasks. While encoder-only or encoder-decoder pre-trained language models have already proved to be effective in discourse parsing, the extent to which LLMs can perform this task remains an open research question. Therefore, this paper explores how beneficial such LLMs are for Rhetorical Structure Theory (RST) discourse parsing. Here, the parsing process for both fundamental top-down and bottom-up strategies is converted into prompts, which LLMs can work with. We employ Llama 2 and fine-tune it with QLoRA, which reduces the number of parameters that need to be tuned. Experimental results on three benchmark datasets, RST-DT, Instr-DT, and the GUM corpus, demonstrate that Llama 2 with 70 billion parameters in the bottom-up strategy obtained state-of-the-art (SOTA) results with significant differences. Furthermore, our parsers demonstrated generalizability when evaluated on RST-DT, showing that, in spite of being trained with the GUM corpus, they obtained performance similar to that of existing parsers trained with RST-DT.
2024.eacl-long.171
maekawa-etal-2024-obtain
+
Ameli: Enhancing Multimodal Entity Linking with Fine-Grained Attributes
@@ -2038,6 +2195,7 @@
We propose attribute-aware multimodal entity linking, where the input consists of a mention described with a text paragraph and images, and the goal is to predict the corresponding target entity from a multimodal knowledge base (KB) where each entity is also accompanied by a text description, visual images, and a collection of attributes that present the meta-information of the entity in a structured format. To facilitate this research endeavor, we construct Ameli, encompassing a new multimodal entity linking benchmark dataset that contains 16,735 mentions described in text and associated with 30,472 images, and a multimodal knowledge base that covers 34,690 entities along with 177,873 entity images and 798,216 attributes. To establish baseline performance on Ameli, we experiment with several state-of-the-art architectures for multimodal entity linking and further propose a new approach that incorporates attributes of entities into disambiguation. Experimental results and extensive qualitative analysis demonstrate that extracting and understanding the attributes of mentions from their text descriptions and visual images play a vital role in multimodal entity linking. To the best of our knowledge, we are the first to integrate attributes in the multimodal entity linking task. The programs, model checkpoints, and the dataset are publicly available at https://github.com/VT-NLP/Ameli.
2024.eacl-long.172
yao-etal-2024-ameli
+
Generative Dense Retrieval: Memory Can Be a Burden
@@ -2054,6 +2212,7 @@
2024.eacl-long.173
2024.eacl-long.173.note.zip
yuan-etal-2024-generative
+
Backward Compatibility During Data Updates by Weight Interpolation
@@ -2078,6 +2237,7 @@
Red teaming is a common strategy for identifying weaknesses in generative language models (LMs) by producing adversarial prompts that trigger models to generate unsafe responses. Red teaming is instrumental for both model alignment and evaluation, but is labor-intensive and difficult to scale when done by humans. In this paper, we present Gradient-Based Red Teaming (GBRT), a novel red teaming method for automatically generating diverse prompts that are likely to cause an LM to output unsafe responses. GBRT is a form of prompt learning, trained by scoring an LM response with a safety classifier and then backpropagating through the frozen safety classifier and LM to update the prompt. To improve the coherence of input prompts, we introduce two variants that add a realism loss and fine-tune a pretrained model to generate the prompts instead of learning the prompts directly. Our experiments show that GBRT is more effective at finding prompts that trigger an LM to generate unsafe responses than a strong reinforcement learning-based red teaming approach and works even when the LM has been fine-tuned to produce safer outputs.
2024.eacl-long.175
wichers-etal-2024-gradient
+
Do Moral Judgment and Reasoning Capability of LLMs Change with Language? A Study using the Multilingual Defining Issues Test
@@ -2089,6 +2249,7 @@
This paper explores the moral judgment and moral reasoning abilities exhibited by Large Language Models (LLMs) across languages through the Defining Issues Test. It is a well known fact that moral judgment depends on the language in which the question is asked. We extend the work of beyond English, to 5 new languages (Chinese, Hindi, Russian, Spanish and Swahili), and probe three LLMs – ChatGPT, GPT-4 and Llama2Chat-70B – that show substantial multilingual text processing and generation abilities. Our study shows that the moral reasoning ability for all models, as indicated by the post-conventional score, is substantially inferior for Hindi and Swahili, compared to Spanish, Russian, Chinese and English, while there is no clear trend for the performance of the latter four languages. The moral judgments too vary considerably by language.
2024.eacl-long.176
khandelwal-etal-2024-moral
+
Analyzing the Evaluation of Cross-Lingual Knowledge Transfer in Multilingual Language Models
@@ -2098,6 +2259,7 @@
Recent advances in training multilingual language models on large datasets seem to have shown promising results in knowledge transfer across languages and achieve high performance on downstream tasks. However, we question to what extent the current evaluation benchmarks and setups accurately measure zero-shot cross-lingual knowledge transfer. In this work, we challenge the assumption that high zero-shot performance on target tasks reflects high cross-lingual ability by introducing more challenging setups involving instances with multiple languages. Through extensive experiments and analysis, we show that the observed high performance of multilingual models can be largely attributed to factors not requiring the transfer of actual linguistic knowledge, such as task- and surface-level knowledge. More specifically, we observe that what has been transferred across languages is mostly data artifacts and biases, especially for low-resource languages. Our findings highlight the overlooked drawbacks of existing cross-lingual test data and evaluation setups, calling for a more nuanced understanding of the cross-lingual capabilities of multilingual models.
2024.eacl-long.177
rajaee-monz-2024-analyzing
+
Large-Scale Label Interpretation Learning for Few-Shot Named Entity Recognition
@@ -2108,6 +2270,7 @@
Few-shot named entity recognition (NER) detects named entities within text using only a few annotated examples. One promising line of research is to leverage natural language descriptions of each entity type: the common label PER might, for example, be verbalized as "person entity." In an initial label interpretation learning phase, the model learns to interpret such verbalized descriptions of entity types. In a subsequent few-shot tagset extension phase, this model is then given a description of a previously unseen entity type (such as "music album") and optionally a few training examples to perform few-shot NER for this type. In this paper, we systematically explore the impact of a strong semantic prior to interpret verbalizations of new entity types by massively scaling up the number and granularity of entity types used for label interpretation learning. To this end, we leverage an entity linking benchmark to create a dataset with orders of magnitude more distinct entity types and descriptions than currently used datasets. We find that this increased signal yields strong results in zero- and few-shot NER in in-domain, cross-domain, and even cross-lingual settings. Our findings indicate significant potential for improving few-shot NER through heuristic data-based optimization.
2024.eacl-long.178
golde-etal-2024-large
+
MLCopilot: Unleashing the Power of Large Language Models in Solving Machine Learning Tasks
@@ -2121,6 +2284,7 @@
2024.eacl-long.179
zhang-etal-2024-mlcopilot
Outstanding Paper Award
+
Text-Guided Image Clustering
@@ -2136,6 +2300,7 @@
2024.eacl-long.180
2024.eacl-long.180.software.zip
stephan-etal-2024-text
+
CCPrefix: Counterfactual Contrastive Prefix-Tuning for Many-Class Classification
@@ -2151,6 +2316,7 @@
2024.eacl-long.181.software.zip
2024.eacl-long.181.note.zip
li-etal-2024-ccprefix
+
@@ -2178,6 +2344,7 @@
The realm of Large Language Models (LLMs) is undergoing a continuous and dynamic transformation. These state-of-the-art LLMs showcase an impressive ability to craft narratives based on contextual cues, highlighting their skill in comprehending and producing text resembling human writing. However, this very capability carries a potential risk: LLMs may be inclined to create gossip when prompted with specific contexts. To mitigate this, we introduce a dataset named “French GossipPrompts” designed for identifying prompts that lead to the creation of gossipy content in the French language. This dataset employs binary classification, categorizing whether a given prompt generates gossip or not. The dataset comprises a total of 7253 individual prompts. We have developed classification models and achieved an accuracy of 89.95%.
2024.eacl-short.1
sathvik-etal-2024-french
+
More Discriminative Sentence Embeddings via Semantic Graph Smoothing
@@ -2200,6 +2367,7 @@
jeon-lee-2024-multi
This revision corrects a typo in Equation 1.
+
Mitigating Hallucinations and Off-target Machine Translation with Source-Contrastive and Language-Contrastive Decoding
@@ -2211,6 +2379,7 @@
2024.eacl-short.4
2024.eacl-short.4.software.zip
sennrich-etal-2024-mitigating
+
Injecting Wiktionary to improve token-level contextual representations using contrastive learning
@@ -2221,6 +2390,7 @@
While static word embeddings are blind to context, for lexical semantics tasks context is rather too present in contextual word embeddings, with vectors of same-meaning occurrences being too different (Ethayarajh, 2019). Fine-tuning pre-trained language models (PLMs) using contrastive learning was proposed, leveraging automatically self-augmented examples (Liu et al., 2021b). In this paper, we investigate how to inject a lexicon as an alternative source of supervision, using the English Wiktionary. We also test how dimensionality reduction impacts the resulting contextual word embeddings. We evaluate our approach on the Word-In-Context (WiC) task, in the unsupervised setting (not using the training set). We achieve a new SoTA result on the original WiC test set. We also propose two new WiC test sets on which our fine-tuning method achieves substantial improvements. We also observe improvements, although modest, for the semantic frame induction task. Although we experimented on English to allow comparison with related work, our method is adaptable to the many languages for which large Wiktionaries exist.
2024.eacl-short.5
mosolova-etal-2024-injecting
+
Multilingual Gradient Word-Order Typology from Universal Dependencies
@@ -2231,6 +2401,7 @@
While information from the field of linguistic typology has the potential to improve performance on NLP tasks, reliable typological data is a prerequisite. Existing typological databases, including WALS and Grambank, suffer from inconsistencies primarily caused by their categorical format. Furthermore, typological categorisations by definition differ significantly from the continuous nature of phenomena, as found in natural language corpora. In this paper, we introduce a new seed dataset made up of continuous-valued data, rather than categorical data, that can better reflect the variability of language. While this initial dataset focuses on word-order typology, we also present the methodology used to create the dataset, which can be easily adapted to generate data for a broader set of features and languages.
2024.eacl-short.6
baylor-etal-2024-multilingual
+
Evaluating the Factuality of Zero-shot Summarizers Across Varied Domains
@@ -2242,6 +2413,7 @@
Recent work has shown that large language models (LLMs) are capable of generating summaries zero-shot—i.e., without explicit supervision—that, under human assessment, are often comparable or even preferred to manually composed reference summaries. However, this prior work has focussed almost exclusively on evaluating news article summarization. How do zero-shot summarizers perform in other (potentially more specialized) domains? In this work we evaluate zero-shot generated summaries across specialized domains including biomedical articles and legal bills (in addition to standard news benchmarks for reference). We focus especially on the factuality of outputs. We acquire annotations from domain experts to identify inconsistencies in summaries and systematically categorize these errors. We analyze whether the prevalence of a given domain in the pretraining corpus affects extractiveness and faithfulness of generated summaries of articles in this domain. We release all collected annotations to facilitate additional research toward measuring and realizing factually accurate summarization, beyond news articles. (The dataset can be downloaded from https://anonymous.4open.science/r/zero_shot_faceval_domains-9B83.)
2024.eacl-short.7
ramprasad-etal-2024-evaluating
+
Leveraging Implicit Feedback from Deployment Data in Dialogue
@@ -2254,6 +2426,7 @@
We study improving social conversational agents by learning from natural dialogue between users and a deployed model, without extra annotations. To implicitly measure the quality of a machine-generated utterance, we leverage signals like user response length, sentiment and reaction of the future human utterances in the collected dialogue episodes. Our experiments use the publicly released deployment data from BlenderBot (Xu et al., 2023). Human evaluation indicates improvements in our new models over baseline responses; however, we find that some proxy signals can lead to more generations with undesirable properties as well. For example, optimizing for conversation length can lead to more controversial or unfriendly generations compared to the baseline, whereas optimizing for positive sentiment or reaction can decrease these behaviors.
2024.eacl-short.8
pang-etal-2024-leveraging
+
Characterizing the Confidence of Large Language Model-Based Automatic Evaluation Metrics
@@ -2264,6 +2437,7 @@
There has recently been a growing interest in using Large Language Models (LLMs) to evaluate NLP tasks automatically. Considerable research effort has been put into improving such systems towards achieving high correlations with human judgement. However, it is still unclear what level of correlation is good enough for practical applications of LLM-based automatic evaluation systems. This paper characterizes these LLM evaluators’ confidence in ranking candidate NLP models and develops a configurable Monte Carlo simulation method. We show that even automatic metrics with low correlation with human judgement can reach high-confidence rankings of candidate models with reasonable evaluation set sizes (100s of examples). Further, we describe tradeoff curves between the LLM evaluator performance (i.e., correlation with humans) and evaluation set size; loss in correlation can be compensated with modest increases in the evaluation set size. We validate our results on RoSE, a text summarization dataset, and find our estimates of confidence align with empirical observations. Code available at https://github.com/rickardstureborg/llm-eval-confidence
2024.eacl-short.9
stureborg-etal-2024-characterizing
+
Equipping Language Models with Tool Use Capability for Tabular Data Analysis in Finance
@@ -2274,6 +2448,7 @@
2024.eacl-short.10
2024.eacl-short.10.note.zip
theuma-shareghi-2024-equipping
+
Commonsense-augmented Memory Construction and Management in Long-term Conversations via Context-aware Persona Refinement
@@ -2287,6 +2462,7 @@
2024.eacl-short.11
2024.eacl-short.11.software.zip
kim-etal-2024-commonsense
+
Investigating the Potential of Task Arithmetic for Cross-Lingual Transfer
@@ -2308,6 +2484,7 @@
2024.eacl-short.13.software.zip
2024.eacl-short.13.note.zip
flores-cohan-2024-benefits
+
Evaluating Unsupervised Argument Aligners via Generation of Conclusions of Structured Scientific Abstracts
@@ -2320,6 +2497,7 @@
Scientific abstracts provide a concise summary of research findings, making them a valuable resource for extracting scientific arguments. In this study, we assess various unsupervised approaches for extracting arguments as aligned premise-conclusion pairs: semantic similarity, text perplexity, and mutual information. We aggregate structured abstracts from PubMed Central Open Access papers published in 2022 and evaluate the argument aligners in terms of the performance of language models that we fine-tune to generate the conclusions from the extracted premise given as input prompts. We find that mutual information outperforms the other measures on this task, suggesting that the reasoning process in scientific abstracts hinges mostly on linguistic constructs beyond simple textual similarity.
2024.eacl-short.14
gao-etal-2024-evaluating
+
Over-Reasoning and Redundant Calculation of Large Language Models
@@ -2330,6 +2508,7 @@
2024.eacl-short.15
2024.eacl-short.15.note.zip
chiang-lee-2024-reasoning
+
Multimodal Fallacy Classification in Political Debates
@@ -2340,6 +2519,7 @@
Recent advances in NLP suggest that some tasks, such as argument detection and relation classification, are better framed in a multimodal perspective. We propose multimodal argument mining for argumentative fallacy classification in political debates. To this end, we release the first corpus for multimodal fallacy classification. Our experiments show that the integration of the audio modality leads to superior classification performance. Our findings confirm that framing fallacy classification as a multimodal task is essential to capture paralinguistic aspects of fallacious arguments.
2024.eacl-short.16
mancini-etal-2024-multimodal
+
The Parrot Dilemma: Human-Labeled vs. LLM-augmented Data in Classification Tasks
@@ -2352,6 +2532,7 @@
2024.eacl-short.17
2024.eacl-short.17.software.zip
moller-etal-2024-parrot
+
Language Model Sentence Completion with a Parser-Driven Rhetorical Control Method
@@ -2361,6 +2542,7 @@
Controlled text generation (CTG) seeks to guide large language model (LLM) output so that the statistically generated text conforms to desired criteria. The current study presents a novel CTG algorithm that enforces adherence to specific rhetorical relations in an LLM sentence-completion context via a parser-driven decoding scheme that requires no model fine-tuning. The method is validated with both automatic and human evaluation.
2024.eacl-short.18
zingale-kalita-2024-language
+
“It’s how you do things that matters”: Attending to Process to Better Serve Indigenous Communities with Language Technologies
@@ -2371,6 +2553,7 @@
Indigenous languages are historically under-served by Natural Language Processing (NLP) technologies, but this is changing for some languages with the recent scaling of large multilingual models and an increased focus by the NLP community on endangered languages. This position paper explores ethical considerations in building NLP technologies for Indigenous languages, based on the premise that such projects should primarily serve Indigenous communities. We report on interviews with 17 researchers working in or with Aboriginal and/or Torres Strait Islander communities on language technology projects in Australia. Drawing on insights from the interviews, we recommend practices for NLP researchers to increase attention to the process of engagements with Indigenous communities, rather than focusing only on decontextualised artefacts.
2024.eacl-short.19
cooper-etal-2024-things
+
Source Identification in Abstractive Summarization
@@ -2382,6 +2565,7 @@
2024.eacl-short.20.software.zip
2024.eacl-short.20.note.zip
suhara-alikaniotis-2024-source
+
From Partial to Strictly Incremental Constituent Parsing
@@ -2392,6 +2576,7 @@
We study incremental constituent parsers to assess their capacity to output trees based on prefix representations alone. Guided by strictly left-to-right generative language models and tree-decoding modules, we build parsers that adhere to a strong definition of incrementality across languages. This builds upon work that asserted incrementality, but that mostly only enforced it on either the encoder or the decoder. Finally, we conduct an analysis against non-incremental and partially incremental models.
2024.eacl-short.21
ezquerro-etal-2024-partial
+
Predict the Next Word: <Humans exhibit uncertainty in this task and language models _____>
@@ -2403,6 +2588,7 @@
ilia-aziz-2024-predict
Figure 3 was plotting variances instead of standard deviations. A relevant footnote was also added to explain this.
+
A Prompt Response to the Demand for Automatic Gender-Neutral Translation
@@ -2415,6 +2601,7 @@
Gender-neutral translation (GNT) that avoids biased and undue binary assumptions is a pivotal challenge for the creation of more inclusive translation technologies. Advancements for this task in Machine Translation (MT), however, are hindered by the lack of dedicated parallel data, which are necessary to adapt MT systems to satisfy neutral constraints. For such a scenario, large language models offer hitherto unforeseen possibilities, as they come with the distinct advantage of being versatile in various (sub)tasks when provided with explicit instructions. In this paper, we explore this potential to automate GNT by comparing MT with the popular GPT-4 model. Through extensive manual analyses, our study empirically reveals the inherent limitations of current MT systems in generating GNTs and provides valuable insights into the potential and challenges associated with prompting for neutrality.
2024.eacl-short.23
savoldi-etal-2024-prompt
+
Interpreting Predictive Probabilities: Model Confidence or Human Label Variation?
@@ -2426,6 +2613,7 @@
With the rise of increasingly powerful and user-facing NLP systems, there is growing interest in assessing whether they have a good _representation of uncertainty_ by evaluating the quality of their predictive distribution over outcomes. We identify two main perspectives that drive starkly different evaluation protocols. The first treats predictive probability as an indication of model confidence; the second as an indication of human label variation. We discuss their merits and limitations, and take the position that both are crucial for trustworthy and fair NLP systems, but that exploiting a single predictive distribution is limiting. We recommend tools and highlight exciting directions towards models with disentangled representations of uncertainty about predictions and uncertainty about human labels.
2024.eacl-short.24
baan-etal-2024-interpreting
+
Smaller Language Models are Better Zero-shot Machine-Generated Text Detectors
@@ -2449,6 +2637,7 @@
We address the task of machine translation (MT) from an extremely low-resource language (ELRL) to English by leveraging cross-lingual transfer from a *closely-related* high-resource language (HRL). The development of an MT system for an ELRL is challenging because these languages typically lack parallel corpora and monolingual corpora, and their representations are absent from large multilingual language models. Many ELRLs share lexical similarities with some HRLs, which presents a novel modeling opportunity. However, existing subword-based neural MT models do not explicitly harness this lexical similarity, as they only implicitly align the HRL and ELRL latent embedding spaces. To overcome this limitation, we propose CharSpan, a novel approach based on character-span noise augmentation in the training data of the HRL. This serves as a regularization technique, making the model more robust to lexical divergences between the HRL and ELRL, thus facilitating effective cross-lingual transfer. Our method significantly outperformed strong baselines in zero-shot settings on closely related HRL and ELRL pairs from three diverse language families, emerging as the state-of-the-art model for ELRLs.
2024.eacl-short.26
maurya-etal-2024-charspan
+
Robust Neural Machine Translation for Abugidas by Glyph Perturbation
@@ -2460,6 +2649,7 @@
Neural machine translation (NMT) systems are vulnerable when trained on limited data. This is a common scenario in low-resource tasks in the real world. To increase robustness, one solution is to intentionally add realistic noise in the training phase. Noise simulation using text perturbation has been proven effective in writing systems that use Latin letters. In this study, we further explore perturbation techniques on more complex abugida writing systems, for which the visual similarity of complex glyphs is considered to capture the essential nature of these writing systems. Besides the generated noise, we propose a training strategy to improve robustness. We conducted experiments on six languages: Bengali, Hindi, Myanmar, Khmer, Lao, and Thai. By overcoming the introduced noise, we obtained non-degenerate NMT systems with improved robustness for low-resource tasks on abugida glyphs.
2024.eacl-short.27
kaing-etal-2024-robust
+
Translation Errors Significantly Impact Low-Resource Languages in Cross-Lingual Learning
@@ -2470,6 +2660,7 @@
Popular benchmarks (e.g., XNLI) used to evaluate cross-lingual language understanding consist of parallel versions of English evaluation sets in multiple target languages created with the help of professional translators. When creating such parallel data, it is critical to ensure high-quality translations for all target languages for an accurate characterization of cross-lingual transfer. In this work, we find that translation inconsistencies do exist and interestingly they disproportionally impact low-resource languages in XNLI. To identify such inconsistencies, we propose measuring the gap in performance between zero-shot evaluations on the human-translated and machine-translated target text across multiple target languages; relatively large gaps are indicative of translation errors. We also corroborate that translation errors exist for two target languages, namely Hindi and Urdu, by doing a manual reannotation of human-translated test instances in these two languages and finding poor agreement with the original English labels these instances were supposed to inherit.
2024.eacl-short.28
agrawal-etal-2024-translation
+
Less is More for Long Document Summary Evaluation by LLMs
@@ -2482,6 +2673,7 @@
Large Language Models (LLMs) have shown promising performance in summary evaluation tasks, yet they face challenges such as high computational costs and the Lost-in-the-Middle problem where important information in the middle of long documents is often overlooked. To address these issues, this paper introduces a novel approach, Extract-then-Evaluate, which involves extracting key sentences from a long source document and then evaluating the summary by prompting LLMs. The results reveal that the proposed method not only significantly reduces evaluation costs but also exhibits a higher correlation with human evaluations. Furthermore, we provide practical recommendations for optimal document length and sentence extraction methods, contributing to the development of cost-effective yet more accurate methods for LLM-based text generation evaluation.
2024.eacl-short.29
wu-etal-2024-less
+
Leveraging ChatGPT in Pharmacovigilance Event Extraction: An Empirical Study
@@ -2493,6 +2685,7 @@
With the advent of large language models (LLMs), there has been growing interest in exploring their potential for medical applications. This research aims to investigate the ability of LLMs, specifically ChatGPT, in the context of pharmacovigilance event extraction, whose main goal is to identify and extract adverse events or potential therapeutic events from textual medical sources. We conduct extensive experiments to assess the performance of ChatGPT in the pharmacovigilance event extraction task, employing various prompts and demonstration selection strategies. The findings show that while ChatGPT demonstrates reasonable performance with appropriate demonstration selection strategies, it still falls short compared to fully fine-tuned small models. Additionally, we explore the potential of leveraging ChatGPT for data augmentation. However, our investigation reveals that the inclusion of synthesized data into fine-tuning may lead to a decrease in performance, possibly attributed to noise in the ChatGPT-generated labels. To mitigate this, we explore different filtering strategies and find that, with the proper approach, more stable performance can be achieved, although constant improvement remains elusive.
2024.eacl-short.30
sun-etal-2024-leveraging
+
A Comparative Analysis of Conversational Large Language Models in Knowledge-Based Text Generation
@@ -2505,6 +2698,7 @@
2024.eacl-short.31
2024.eacl-short.31.note.zip
schneider-etal-2024-comparative
+
Extreme Fine-tuning: A Novel and Fast Fine-tuning Approach for Text Classification
@@ -2516,6 +2710,7 @@
Although fine-tuning a pre-trained model with a conventional approach has been shown to be effective in various downstream tasks, previous work has used only backpropagation to fine-tune the model, which consumes a massive amount of computational resources and time. We propose Extreme Fine-Tuning (EFT), a novel approach for fine-tuning a pre-trained model effectively and efficiently. EFT uses backpropagation for brief fine-tuning and an iterative extreme learning machine for training a classifier. We applied EFT to four text classification datasets, MELD, IEMOCAP, IMDb, and AG News, and compared its performance with state-of-the-art (SOTA) approaches. The results indicate that EFT noticeably outperformed the other approaches in training-time measurement with comparable model performance. We will release our code at https://github.com/up-33/extreme-fine-tuning.
2024.eacl-short.32
jiaramaneepinit-etal-2024-extreme
+
Flow Matching for Conditional Text Generation in a Few Sampling Steps
@@ -2544,6 +2739,7 @@
2024.eacl-short.34
2024.eacl-short.34.software.zip
lei-etal-2024-corpus
+
Defending Against Disinformation Attacks in Open-Domain Question Answering
@@ -2556,6 +2752,7 @@
Recent work in open-domain question answering (ODQA) has shown that adversarial poisoning of the search collection can cause large drops in accuracy for production systems. However, little to no work has proposed methods to defend against these attacks. To do so, we rely on the intuition that redundant information often exists in large corpora. To find it, we introduce a method that uses query augmentation to search for a diverse set of passages that could answer the original question but are less likely to have been poisoned. We integrate these new passages into the model through the design of a novel confidence method, comparing the predicted answer to its appearance in the retrieved contexts (what we call Confidence from Answer Redundancy, i.e. CAR). Together these methods allow for a simple but effective way to defend against poisoning attacks that provides gains of nearly 20% exact match across varying levels of data poisoning/knowledge conflicts.
2024.eacl-short.35
weller-etal-2024-defending
+
Sentence Representations via Gaussian Embedding
@@ -2567,6 +2764,7 @@
Recent progress in sentence embedding, which represents a sentence’s meaning as a point in a vector space, has achieved high performance on several tasks such as the semantic textual similarity (STS) task. However, a sentence representation cannot adequately express the diverse information that sentences contain: for example, such representations cannot naturally handle asymmetric relationships between sentences. This paper proposes GaussCSE, a Gaussian-distribution-based contrastive learning framework for sentence embedding that can handle asymmetric inter-sentential relations, as well as a similarity measure for identifying entailment relations. Our experiments show that GaussCSE achieves performance comparable to that of previous methods on natural language inference (NLI) tasks, and that it can estimate the direction of entailment relations, which is difficult with point representations.
2024.eacl-short.36
yoda-etal-2024-sentence
+
STORiCo: Storytelling TTS for Hindi with Character Voice Modulation
@@ -2578,6 +2776,7 @@
We present a new Hindi text-to-speech (TTS) dataset and demonstrate its utility for the expressive synthesis of children’s audio stories. The dataset comprises narration by a single female speaker who modifies her voice to produce different story characters. Annotation for dialogue identification, character labelling, and character attribution are provided, all of which are expected to facilitate the learning of character voice and speaking styles. Experiments are conducted using different versions of the annotated dataset that enable training a multi-speaker TTS model on the single-speaker data. Subjective tests show that the multi-speaker model improves expressiveness and character voice consistency compared to the baseline single-speaker TTS. With the multi-speaker model, objective evaluations show comparable word error rates, better speaker voice consistency, and higher correlations with ground-truth emotion attributes. We release a new 16.8 hours storytelling speech dataset in Hindi and propose effective solutions for expressive TTS with narrator voice modulation and character voice consistency.
2024.eacl-short.37
tankala-etal-2024-storico
+
Rethinking Loss Functions for Fact Verification
@@ -2588,6 +2787,7 @@
We explore loss functions for fact verification in the FEVER shared task. While the cross-entropy loss is a standard objective for training verdict predictors, it fails to capture the heterogeneity among the FEVER verdict classes. In this paper, we develop two task-specific objectives tailored to FEVER. Experimental results confirm that the proposed objective functions outperform the standard cross-entropy. Performance is further improved when these objectives are combined with simple class weighting, which effectively overcomes the imbalance in the training data. The source code is available (https://github.com/yuta-mukobara/RLF-KGAT).
2024.eacl-short.38
mukobara-etal-2024-rethinking
+
A Dataset for Metaphor Detection in Early Medieval Hebrew Poetry
@@ -2600,6 +2800,7 @@
There is a large volume of late antique and medieval Hebrew texts. They represent a crucial linguistic and cultural bridge between Biblical and modern Hebrew. Poetry is prominent in these texts and one of its main characteristics is the frequent use of metaphor. Distinguishing figurative and literal language use is a major task for scholars of the Humanities, especially in the fields of literature, linguistics, and hermeneutics. This paper presents a new, challenging dataset of late antique and medieval Hebrew poetry with expert annotations of metaphor, as well as some baseline results, which we hope will facilitate further research in this area.
2024.eacl-short.39
toker-etal-2024-dataset
+
SOCIALITE-LLAMA: An Instruction-Tuned Model for Social Scientific Tasks
@@ -2618,6 +2819,7 @@
dey-etal-2024-socialite
Added a sponsor.
+
Pre-Training Methods for Question Reranking
@@ -2640,6 +2842,7 @@
Most works on transformers trained with the Masked Language Modeling (MLM) objective use the original BERT model’s fixed masking rate of 15%. We propose to instead dynamically schedule the masking rate throughout training. We find that linearly decreasing the masking rate over the course of pretraining improves average GLUE accuracy by up to 0.46% and 0.25% in BERT-base and BERT-large, respectively, compared to fixed rate baselines. These gains come from exposure to both high and low masking rate regimes, providing benefits from both settings. Our results demonstrate that masking rate scheduling is a simple way to improve the quality of masked language models, achieving up to a 1.89x speedup in pretraining for BERT-base as well as a Pareto improvement for BERT-large.
2024.eacl-short.42
ankner-etal-2024-dynamic
+
@@ -2670,6 +2873,7 @@
In this paper we introduce TextBI, a multimodal generic dashboard designed to present multidimensional text annotations on large volumes of multilingual social media data. This tool focuses on four core dimensions: spatial, temporal, thematic, and personal, and also supports additional enrichment data such as sentiment and engagement. Multiple visualization modes are offered, including frequency, movement, and association. This dashboard addresses the challenge of facilitating the interpretation of NLP annotations by visualizing them in a user-friendly, interactive interface catering to two categories of users: (1) domain stakeholders and (2) NLP researchers. We conducted experiments within the domain of tourism leveraging data from X (formerly Twitter) and incorporating requirements from tourism offices. Our approach, TextBI, represents a significant advancement in the field of visualizing NLP annotations by integrating and blending features from a variety of Business Intelligence, Geographical Information Systems and NLP tools. A demonstration video is also provided: https://youtu.be/x714RKvo9Cg
2024.eacl-demo.1
masson-etal-2024-textbi
+
kNN-BOX: A Unified Framework for Nearest Neighbor Generation
@@ -2707,6 +2911,7 @@
pyTLEX is an implementation of the TimeLine EXtraction algorithm (TLEX; Finlayson et al., 2021) that enables users to work with TimeML annotations and perform advanced temporal analysis, offering a comprehensive suite of features. TimeML is a standardized markup language for temporal information in text. pyTLEX allows users to parse TimeML annotations, construct TimeML graphs, and execute the TLEX algorithm to effect complete timeline extraction. In contrast to previous implementations (i.e., jTLEX for Java), pyTLEX sets itself apart with a range of advanced features. It introduces a React-based visualization system, enhancing the exploration of temporal data and the comprehension of temporal connections within textual information. Furthermore, pyTLEX incorporates an algorithm for increasing connectivity in temporal graphs, which identifies graph disconnectivity and recommends links based on temporal reasoning, thus enhancing the coherence of the graph representation. Additionally, pyTLEX includes a built-in validation algorithm, ensuring compliance with TimeML annotation guidelines, which is essential for maintaining data quality and reliability. pyTLEX equips researchers and developers with an extensive toolkit for temporal analysis, and its testing across various datasets validates its accuracy and reliability.
2024.eacl-demo.4
singh-etal-2024-pytlex
+
DepressMind: A Depression Surveillance System for Social Media Analysis
@@ -2729,6 +2934,7 @@
Many European citizens become targets of the Kremlin propaganda campaigns, aiming to minimise public support for Ukraine, foster a climate of mistrust and disunity, and shape elections (Meister, 2022). To address this challenge, we developed “Check News in 1 Click”, the first NLP-empowered pro-Kremlin propaganda detection application available in 7 languages, which provides the lay user with feedback on their news, and explains manipulative linguistic features and keywords. We conducted a user study, analysed user entries and models’ behaviour paired with questionnaire answers, and investigated the advantages and disadvantages of the proposed interpretative solution.
2024.eacl-demo.6
solopova-etal-2024-check
+
NESTLE: a No-Code Tool for Statistical Analysis of Legal Corpus
@@ -2740,6 +2946,7 @@
The statistical analysis of a large-scale legal corpus can provide valuable legal insights. For such analysis one needs to (1) select a subset of the corpus using document retrieval tools, (2) structure text using information extraction (IE) systems, and (3) visualize the data for the statistical analysis. Each process demands either specialized tools or programming skills, whereas no comprehensive unified “no-code” tools have been available. Here we provide NESTLE, a no-code tool for large-scale statistical analysis of legal corpus. Powered by a Large Language Model (LLM) and the internal custom end-to-end IE system, NESTLE can extract any type of information that has not been predefined in the IE system, opening up the possibility of unlimited customizable statistical analysis of the corpus without writing a single line of code. We validate our system on 15 Korean precedent IE tasks and 3 legal text classification tasks from LexGLUE. The comprehensive experiments reveal that NESTLE can achieve GPT-4-comparable performance by training the internal IE module with 4 human-labeled and 192 LLM-labeled examples.
2024.eacl-demo.7
cho-etal-2024-nestle
+
Multi-party Multimodal Conversations Between Patients, Their Companions, and a Social Robot in a Hospital Memory Clinic
@@ -2756,6 +2963,7 @@
2024.eacl-demo.8
addlesee-etal-2024-multi
Best Demo Award
+
ScamSpot: Fighting Financial Fraud in Instagram Comments
@@ -2765,6 +2973,7 @@
The long-standing problem of spam and fraudulent messages in the comment sections of Instagram pages in the financial sector claims new victims every day. Instagram’s current spam filter proves inadequate, and existing research approaches are primarily confined to theoretical concepts. Practical implementations with evaluated results are missing. To solve this problem, we propose ScamSpot, a comprehensive system that includes a browser extension, a fine-tuned BERT model and a REST API. This approach ensures public accessibility of our results for Instagram users using the Chrome browser. Furthermore, we conduct a data annotation study, shedding light on the reasons and causes of the problem and evaluate the system through user feedback and comparison with existing models. ScamSpot is an open-source project and is publicly available at https://scamspot.github.io/.
2024.eacl-demo.9
erben-waldis-2024-scamspot
+
NarrativePlay: Interactive Narrative Understanding
@@ -2779,6 +2988,7 @@
In this paper, we introduce NarrativePlay, a novel system that allows users to role-play a fictional character and interact with other characters in narratives in an immersive environment. We leverage Large Language Models (LLMs) to generate human-like responses, guided by personality traits extracted from narratives. The system incorporates auto-generated visual display of narrative settings, character portraits, and character speech, greatly enhancing the user experience. Our approach eschews predefined sandboxes, focusing instead on main storyline events from the perspective of a user-selected character. NarrativePlay has been evaluated on two types of narratives, detective and adventure stories, where users can either explore the world or increase affinity with other characters through conversations.
2024.eacl-demo.10
zhao-etal-2024-narrativeplay
+
DP-NMT: Scalable Differentially Private Machine Translation
@@ -2792,6 +3002,7 @@
Neural machine translation (NMT) is a widely popular text generation task, yet there is a considerable research gap in the development of privacy-preserving NMT models, despite significant data privacy concerns for NMT systems. Differentially private stochastic gradient descent (DP-SGD) is a popular method for training machine learning models with concrete privacy guarantees; however, the implementation specifics of training a model with DP-SGD are not always clarified in existing models, with differing software libraries used and code bases not always being public, leading to reproducibility issues. To tackle this, we introduce DP-NMT, an open-source framework for carrying out research on privacy-preserving NMT with DP-SGD, bringing together numerous models, datasets, and evaluation metrics in one systematic software package. Our goal is to provide a platform for researchers to advance the development of privacy-preserving NMT systems, keeping the specific details of the DP-SGD algorithm transparent and intuitive to implement. We run a set of experiments on datasets from both general and privacy-related domains to demonstrate our framework in use. We make our framework publicly available and welcome feedback from the community.
2024.eacl-demo.11
igamberdiev-etal-2024-dp
+
AnnoPlot: Interactive Visualizations of Text Annotations
@@ -2816,6 +3027,7 @@
Spatial information in text enables understanding of the geographical context and relationships within text, supporting better decision-making across various domains such as disease surveillance, disaster management and other location-based services. Therefore, it is crucial to understand the precise geographical context for location-sensitive applications. In response to this necessity, we introduce the GeospaCy software tool, designed for the extraction and georeferencing of spatial information present in textual data. GeospaCy fulfils two primary objectives: 1) Geoparsing, which involves extracting spatial expressions, encompassing place names and associated spatial relations within the text data, and 2) Geocoding, which facilitates the assignment of geographical coordinates to the spatial expressions extracted during the Geoparsing task. Geoparsing is evaluated with a disease news article dataset consisting of event information, whereas a qualitative evaluation of geographical coordinates (polygons/geometries) of spatial expressions is performed by end-users for the Geocoding task.
2024.eacl-demo.13
mehtab-alam-etal-2024-geospacy
+
MAMMOTH: Massively Multilingual Modular Open Translation @ Helsinki
@@ -2849,6 +3061,7 @@
We present the DURel tool implementing the annotation of semantic proximity between word uses into an online, open source interface. The tool supports standardized human annotation as well as computational annotation, building on recent advances with Word-in-Context models. Annotator judgments are clustered with automatic graph clustering techniques and visualized for analysis. This allows measuring word senses with simple and intuitive micro-task judgments between use pairs, requiring minimal preparation efforts. The tool offers additional functionalities to compare the agreement between annotators to guarantee the inter-subjectivity of the obtained judgments and to calculate summary statistics over the annotated data, giving insights into sense frequency distributions, semantic variation or changes of senses over time.
2024.eacl-demo.15
schlechtweg-etal-2024-durel
+
RAGAs: Automated Evaluation of Retrieval Augmented Generation
@@ -2860,6 +3073,7 @@
We introduce RAGAs (Retrieval Augmented Generation Assessment), a framework for reference-free evaluation of Retrieval Augmented Generation (RAG) pipelines. RAGAs is available at https://github.com/explodinggradients/ragas. RAG systems are composed of a retrieval and an LLM based generation module. They provide LLMs with knowledge from a reference textual database, enabling them to act as a natural language layer between a user and textual databases, thus reducing the risk of hallucinations. Evaluating RAG architectures is challenging due to several dimensions to consider: the ability of the retrieval system to identify relevant and focused context passages, the ability of the LLM to exploit such passages faithfully, and the quality of the generation itself. With RAGAs, we introduce a suite of metrics that can evaluate these different dimensions without relying on ground truth human annotations. We posit that such a framework can contribute crucially to faster evaluation cycles of RAG architectures, which is especially important given the fast adoption of LLMs.
2024.eacl-demo.16
es-etal-2024-ragas
+
NeuroPrompts: An Adaptive Framework to Optimize Prompts for Text-to-Image Generation
@@ -2870,6 +3084,7 @@
Despite impressive recent advances in text-to-image diffusion models, obtaining high-quality images often requires prompt engineering by humans who have developed expertise in using them. In this work, we present NeuroPrompts, an adaptive framework that automatically enhances a user’s prompt to improve the quality of generations produced by text-to-image models. Our framework utilizes constrained text decoding with a pre-trained language model that has been adapted to generate prompts similar to those produced by human prompt engineers. This approach enables higher-quality text-to-image generations and provides user control over stylistic features via constraint set specification. We demonstrate the utility of our framework by creating an interactive application for prompt enhancement and image generation using Stable Diffusion. Additionally, we conduct experiments utilizing a large dataset of human-engineered prompts for text-to-image generation and show that our approach automatically produces enhanced prompts that result in superior image quality. We make our code, a screencast video demo and a live demo instance of NeuroPrompts publicly available.
2024.eacl-demo.17
rosenman-etal-2024-neuroprompts
+
MEGAnno+: A Human-LLM Collaborative Annotation System
@@ -2893,6 +3108,7 @@
This paper presents a novel Cross-document Abstract Meaning Representation (X-AMR) annotation tool designed for annotating key corpus-level event semantics. Leveraging machine assistance through the Prodigy Annotation Tool, we enhance the user experience, ensuring ease and efficiency in the annotation process. Through empirical analyses, we demonstrate the effectiveness of our tool in augmenting an existing event corpus, highlighting its advantages when integrated with GPT-4. Code and annotations: https://anonymous.4open.science/r/xamr-9ED0. Demo: https://youtu.be/TuirftxciNE. Live link: https://tinyurl.com/mrxmafwh
2024.eacl-demo.19
ahmed-etal-2024-x
+
DocChecker: Bootstrapping Code Large Language Model for Detecting and Resolving Code-Comment Inconsistencies
@@ -2903,6 +3119,7 @@
Comments in source code are crucial for developers to understand the purpose of the code and to use it correctly. However, keeping comments aligned with the evolving codebase poses a significant challenge. With increasing interest in automated solutions to identify and rectify discrepancies between code and its associated comments, most existing methods rely heavily on heuristic rules. This paper introduces DocChecker, a language model-based framework adept at detecting inconsistencies between code and comments and capable of generating synthetic comments. This functionality allows DocChecker to identify and rectify cases where comments do not accurately represent the code they describe. The efficacy of DocChecker is demonstrated using the Just-In-Time and CodeXGlue datasets in various scenarios. Notably, DocChecker sets a new benchmark in the Inconsistency Code-Comment Detection (ICCD) task, achieving 72.3% accuracy, and scoring 33.64 in BLEU-4 on the code summarization task. These results surpass other Large Language Models (LLMs), including GPT 3.5 and CodeLlama. DocChecker is accessible for use and evaluation. It can be found on https://github.com/FSoft-AI4Code/DocChecker and at http://4.193.50.237:5000/. For a more comprehensive understanding of its functionality, a demonstration video is available on https://youtu.be/FqnPmd531xw.
2024.eacl-demo.20
dau-etal-2024-docchecker
+
TL;DR Progress: Multi-faceted Literature Exploration in Text Summarization
@@ -2913,6 +3130,7 @@
This paper presents TL;DR Progress, a new tool for exploring the literature on neural text summarization. It organizes 514 papers based on a comprehensive annotation scheme for text summarization approaches and enables fine-grained, faceted search. Each paper was manually annotated to capture aspects such as evaluation metrics, quality dimensions, learning paradigms, challenges addressed, datasets, and document domains. In addition, a succinct indicative summary is provided for each paper, describing contextual factors, issues, and proposed solutions. The tool is available at https://www.tldr-progress.de, and a demo video at https://youtu.be/uCVRGFvXUj8
2024.eacl-demo.21
syed-etal-2024-tl
+
FRAPPE: FRAming, Persuasion, and Propaganda Explorer
@@ -2923,7 +3141,7 @@
LaraHassanAlexandria University
MohamedEl ZeftawyAlexandria University
OmarEl HerraouiNYU Abu Dhabi
- OsamaAfzalMohamed bin Zayed University of Artificial Intelligence
+ OsamaMohammed AfzalMohamed bin Zayed University of Artificial Intelligence
QishengLiaoMohamed bin Zayed University of Artificial Intelligence
TarekMahmoudMohamed bin Zayed University of Artificial Intelligence
ZainMuhammad MujahidMohamed bin Zayed University of Artificial Intelligence
@@ -2938,6 +3156,7 @@
The abundance of news sources and the urgent demand for reliable information have led to serious concerns about the threat of misleading information. In this paper, we present FRAPPE, a FRAming, Persuasion, and Propaganda Explorer system. FRAPPE goes beyond conventional news analysis of articles and unveils the intricate linguistic techniques used to shape readers’ opinions and emotions. Our system allows users not only to analyze individual articles for their genre, framings, and use of persuasion techniques, but also to draw comparisons between the strategies of persuasion and framing adopted by a diverse pool of news outlets and countries across multiple languages for different topics, thus providing a comprehensive understanding of how information is presented and manipulated. FRAPPE is publicly accessible at https://frappe.streamlit.app/ and a video explaining our system is available at https://www.youtube.com/watch?v=3RlTfSVnZmk
2024.eacl-demo.22
sajwani-etal-2024-frappe
+
LLMeBench: A Flexible Framework for Accelerating LLMs Benchmarking
@@ -2955,6 +3174,7 @@
The recent development and success of Large Language Models (LLMs) necessitate an evaluation of their performance across diverse NLP tasks in different languages. Although several frameworks have been developed and made publicly available, their customization capabilities for specific tasks and datasets are often complex for different users. In this study, we introduce the LLMeBench framework, which can be seamlessly customized to evaluate LLMs for any NLP task, regardless of language. The framework features generic dataset loaders, several model providers, and pre-implements most standard evaluation metrics. It supports in-context learning with zero- and few-shot settings. A specific dataset and task can be evaluated for a given LLM in less than 20 lines of code while allowing full flexibility to extend the framework for custom datasets, models, or tasks. The framework has been tested on 31 unique NLP tasks using 53 publicly available datasets within 90 experimental setups, involving approximately 296K data points. We open-sourced LLMeBench for the community (https://github.com/qcri/LLMeBench/) and a video demonstrating the framework is available online (https://youtu.be/9cC2m_abk3A).
2024.eacl-demo.23
dalvi-etal-2024-llmebench
+
Sig-Networks Toolkit: Signature Networks for Longitudinal Language Modelling
@@ -2969,6 +3189,7 @@
We present an open-source, pip-installable toolkit, Sig-Networks, the first of its kind for longitudinal language modelling. A central focus is the incorporation of Signature-based Neural Network models, which have recently shown success in temporal tasks. We apply and extend published research providing a full suite of signature-based models. Their components can be used as PyTorch building blocks in future architectures. Sig-Networks enables task-agnostic dataset plug-in, seamless preprocessing for sequential data, parameter flexibility, and automated tuning across a range of models. We examine signature networks under three different NLP tasks of varying temporal granularity: counselling conversations, rumour stance switch and mood changes in social media threads, showing SOTA performance in all three, and provide guidance for future tasks. We release the Toolkit as a PyTorch package with an introductory video, Git repositories for preprocessing and modelling including sample notebooks on the modeled NLP tasks.
2024.eacl-demo.24
tseriotou-etal-2024-sig
+
@@ -2999,6 +3220,7 @@
Text data augmentation is a complex problem due to the discrete nature of sentences. Although rule-based augmentation methods are widely adopted in real-world applications because of their simplicity, they suffer from potential semantic damage. Previous researchers have suggested easy data augmentation with soft labels (softEDA), employing label smoothing to mitigate this problem. However, finding the best factor for each model and dataset is challenging; therefore, using softEDA in real-world applications is still difficult. In this paper, we propose adapting AutoAugment to solve this problem. The experimental results suggest that the proposed method can boost existing augmentation methods and that rule-based methods can enhance cutting-edge pretrained language models. We offer the source code.
2024.eacl-srw.1
choi-etal-2024-autoaugment
+
Generating Diverse Translation with Perturbed kNN-MT
@@ -3010,6 +3232,7 @@
Generating multiple translation candidates would enable users to choose the one that satisfies their needs. Although there has been work on diversified generation, there exists room for improving the diversity mainly because the previous methods do not address the overcorrection problem—the model underestimates a prediction that is largely different from the training data, even if that prediction is likely. This paper proposes methods that generate more diverse translations by introducing perturbed k-nearest neighbor machine translation (kNN-MT). Our methods expand the search space of kNN-MT and help incorporate diverse words into candidates by addressing the overcorrection problem. Our experiments show that the proposed methods drastically improve candidate diversity and control the degree of diversity by tuning the perturbation’s magnitude.
2024.eacl-srw.2
nishida-etal-2024-generating
+
The KIND Dataset: A Social Collaboration Approach for Nuanced Dialect Data Collection
@@ -3026,6 +3249,7 @@
Nuanced dialects are linguistic variants that pose several challenges for NLP models and techniques. One of the main challenges is the limited amount of datasets to enable extensive research and experimentation. We propose an approach for efficiently collecting nuanced dialectal datasets that are not only of high quality, but are versatile enough to be multipurpose as well. To test our approach we collect the KIND corpus, which is a collection of fine-grained Arabic dialect data. The data consists of short texts, and unlike many nuanced dialectal datasets, it is curated manually through social collaboration efforts as opposed to being crawled from social media. The collaborative approach is incentivized through educational gamification and competitions, with the community itself benefiting from the open-source dataset. Our approach aims to: (1) cover dialects from under-represented groups and fine-grained dialectal varieties, (2) provide aligned parallel corpora for translation between Modern Standard Arabic (MSA) and multiple dialects to enable translation and comparison studies, and (3) promote innovative approaches for nuanced dialect data collection. We explain the steps for the competition as well as the resulting datasets and the competing data collection systems. The KIND dataset is shared with the research community.
2024.eacl-srw.3
yamani-etal-2024-kind
+
Can Stanza be Used for Part-of-Speech Tagging Historical Polish?
@@ -3034,6 +3258,7 @@
The goal of this paper is to evaluate the performance of Stanza, a part-of-speech (POS) tagger developed for modern Polish, on historical text to assess its possible use for automating the annotation of other historical texts. While the issue of the reliability of utilizing POS taggers on historical data has been previously discussed, most of the research focuses on languages whose grammar differs from Polish, meaning that their results need not be fully applicable in this case. The evaluation of Stanza is conducted on two sets of 10,286 and 3,270 manually annotated tokens from a piece of historical Polish writing (1899), and the errors are analyzed qualitatively and quantitatively. The results show a good performance of the tagger, especially when it comes to Universal Part-of-Speech (UPOS) tags, which is promising for utilizing the tagger for automatic annotation in larger projects, and pinpoint some common features of misclassified tokens.
2024.eacl-srw.4
szawerna-2024-stanza
+
Toward Zero-Shot Instruction Following
@@ -3043,6 +3268,7 @@
This work proposes a challenging yet more realistic setting for zero-shot cross-task generalization: zero-shot instruction following, presuming the existence of a paragraph-style task definition while no demonstrations exist. To better learn the task supervision from the definition, we propose two strategies: first, to automatically find out the critical sentences in the definition; second, a ranking objective to force the model to generate the gold outputs with higher probabilities when those critical parts are highlighted in the definition. The joint efforts of the two strategies yield state-of-the-art performance on the Super-NaturalInstructions. Our code is available on GitHub.
2024.eacl-srw.5
lou-yin-2024-toward
+
UnMASKed: Quantifying Gender Biases in Masked Language Models through Linguistically Informed Job Market Prompts
@@ -3051,6 +3277,7 @@
Language models (LMs) have become pivotal in the realm of technological advancements. While their capabilities are vast and transformative, they often include societal biases encoded in the human-produced datasets used for their training. This research delves into the inherent biases present in masked language models (MLMs), with a specific focus on gender biases. This study evaluated six prominent models: BERT, RoBERTa, DistilBERT, BERT-multilingual, XLM-RoBERTa, and DistilBERT-multilingual. The methodology employed a novel dataset, bifurcated into two subsets: one containing prompts that encouraged models to generate subject pronouns in English and the other requiring models to return the probabilities of verbs, adverbs, and adjectives linked to the prompts’ gender pronouns. The analysis reveals stereotypical gender alignment of all models, with multilingual variants showing comparatively reduced biases.
2024.eacl-srw.6
parra-2024-unmasked
+
Distribution Shifts Are Bottlenecks: Extensive Evaluation for Grounding Language Models to Knowledge Bases
@@ -3060,6 +3287,7 @@
Grounding language models (LMs) to knowledge bases (KBs) helps to obtain rich and accurate facts. However, it remains challenging because of the enormous size, complex structure, and partial observability of KBs. One reason is that current benchmarks fail to reflect robustness challenges and fairly evaluate models. This paper analyzes whether these robustness challenges arise from distribution shifts, including environmental, linguistic, and modal aspects. This affects the ability of LMs to cope with unseen schema, adapt to language variations, and perform few-shot learning. Thus, the paper proposes extensive evaluation protocols and conducts experiments to demonstrate that, despite utilizing our proposed data augmentation method, both advanced small and large language models exhibit poor robustness in these aspects. We conclude that current LMs are too fragile to navigate in complex environments due to distribution shifts. This underscores the need for future research focusing on data collection, evaluation protocols, and learning paradigms.
2024.eacl-srw.7
shu-yu-2024-distribution
+
AttriSage: Product Attribute Value Extraction Using Graph Neural Networks
@@ -3073,6 +3301,7 @@
Extracting the attribute value of a product from the given product description is essential for e-commerce functions like product recommendations, search, and information retrieval, and is therefore key to understanding products in e-commerce; greater accuracy certainly gives any retailer the edge. The burdensome aspect of this problem lies in the diversity of the products and their attributes and values. Existing solutions typically employ large language models or sequence-tagging approaches to capture the context of a given product description and extract attribute values. However, they do so with limited accuracy, which serves as the underlying motivation to explore a more comprehensive solution. Through this paper, we present a novel approach for attribute value extraction from product descriptions, leveraging graphs and graph neural networks. Our proposed method demonstrates improvements in attribute value extraction accuracy compared to the baseline sequence-tagging approaches.
2024.eacl-srw.8
potta-etal-2024-attrisage
+
HypoTermQA: Hypothetical Terms Dataset for Benchmarking Hallucination Tendency of LLMs
@@ -3082,6 +3311,7 @@
Hallucinations pose a significant challenge to the reliability and alignment of Large Language Models (LLMs), limiting their widespread acceptance beyond chatbot applications. Despite ongoing efforts, hallucinations remain a prevalent challenge in LLMs. The detection of hallucinations itself is also a formidable task, frequently requiring manual labeling or constrained evaluations. This paper introduces an automated scalable framework that combines benchmarking LLMs’ hallucination tendencies with efficient hallucination detection. We leverage LLMs to generate challenging tasks related to hypothetical phenomena, subsequently employing them as agents for efficient hallucination detection. The framework is domain-agnostic, allowing the use of any language model for benchmark creation or evaluation in any domain. We introduce the publicly available HypoTermQA Benchmarking Dataset, on which state-of-the-art models’ performance ranged between 3% and 11%, and evaluator agents demonstrated a 6% error rate in hallucination prediction. The proposed framework provides opportunities to test and improve LLMs. Additionally, it has the potential to generate benchmarking datasets tailored to specific domains, such as law, health, and finance.
2024.eacl-srw.9
uluoglakci-temizel-2024-hypotermqa
+
Arabic Synonym BERT-based Adversarial Examples for Text Classification
@@ -3093,6 +3323,7 @@
Text classification systems have been proven vulnerable to adversarial text examples, modified versions of the original text examples that are often unnoticed by human eyes, yet can force text classification models to alter their classification. Often, research works quantifying the impact of adversarial text attacks have been applied only to models trained in English. In this paper, we introduce the first word-level study of adversarial attacks in Arabic. Specifically, we use a synonym (word-level) attack using a Masked Language Modeling (MLM) task with a BERT model in a black-box setting to assess the robustness of the state-of-the-art text classification models to adversarial attacks in Arabic. To evaluate the grammatical and semantic similarities of the newly produced adversarial examples using our synonym BERT-based attack, we invite four human evaluators to assess and compare the produced adversarial examples with their original examples. We also study the transferability of these newly produced Arabic adversarial examples to various models and investigate the effectiveness of defense mechanisms against these adversarial examples on the BERT models. We find that fine-tuned BERT models were more susceptible to our synonym attacks than the other Deep Neural Networks (DNN) models like WordCNN and WordLSTM we trained. We also find that fine-tuned BERT models were more susceptible to transferred attacks. We, lastly, find that fine-tuned BERT models successfully regain at least 2% in accuracy after applying adversarial training as an initial defense mechanism.
2024.eacl-srw.10
alshahrani-etal-2024-arabic
+
A Hypothesis-Driven Framework for the Analysis of Self-Rationalising Models
@@ -3102,6 +3333,7 @@
The self-rationalising capabilities of LLMs are appealing because the generated explanations can give insights into the plausibility of the predictions. However, how faithful the explanations are to the predictions is questionable, raising the need to explore the patterns behind them further. To this end, we propose a hypothesis-driven statistical framework. We use a Bayesian network to implement a hypothesis about how a task (in our example, natural language inference) is solved, and its internal states are translated into natural language with templates. Those explanations are then compared to LLM-generated free-text explanations using automatic and human evaluations. This allows us to judge how similar the LLM’s and the Bayesian network’s decision processes are. We demonstrate the usage of our framework with an example hypothesis and two realisations in Bayesian networks. The resulting models do not exhibit a strong similarity to GPT-3.5. We discuss the implications of this as well as the framework’s potential to approximate LLM decisions better in future work.
2024.eacl-srw.11
braun-kunz-2024-hypothesis
+
Align before Attend: Aligning Visual and Textual Features for Multimodal Hateful Content Detection
@@ -3113,6 +3345,7 @@
Multimodal hateful content detection is a challenging task that requires complex reasoning across visual and textual modalities. Therefore, creating a meaningful multimodal representation that effectively captures the interplay between visual and textual features through intermediate fusion is critical. Conventional fusion techniques are unable to attend to the modality-specific features effectively. Moreover, most studies exclusively concentrated on English and overlooked other low-resource languages. This paper proposes a context-aware attention framework for multimodal hateful content detection and assesses it for both English and non-English languages. The proposed approach incorporates an attention layer to meaningfully align the visual and textual features. This alignment enables selective focus on modality-specific features before fusing them. We evaluate the proposed approach on two benchmark hateful meme datasets, viz. MUTE (Bengali code-mixed) and MultiOFF (English). Evaluation results demonstrate our proposed approach’s effectiveness with F1-scores of 69.7% and 70.3% for the MUTE and MultiOFF datasets. The scores show approximately 2.5% and 3.2% performance improvement over the state-of-the-art systems on these datasets. Our implementation is available at https://github.com/eftekhar-hossain/Bengali-Hateful-Memes.
2024.eacl-srw.12
hossain-etal-2024-align
+
Topic-guided Example Selection for Domain Adaptation in LLM-based Machine Translation
@@ -3122,6 +3355,7 @@
Current machine translation (MT) systems perform well in the domains on which they were trained, but adaptation to unseen domains remains a challenge. Rather than fine-tuning on domain data or modifying the architecture for training, an alternative approach exploits large language models (LLMs), which are performant across NLP tasks especially when presented with in-context examples. We focus on adapting a pre-trained LLM to a domain at inference through in-context example selection. For MT, examples are usually randomly selected from a development set. Some more recent methods though select using the more intuitive basis of test source similarity. We employ topic models to select examples based on abstract semantic relationships below the level of a domain. We test the relevance of these statistical models and use them to select informative examples even for out-of-domain inputs, experimenting on 7 diverse domains and 11 language pairs of differing resourcedness. Our method outperforms baselines on challenging multilingual out-of-domain tests, though it does not match performance with strong baselines for the in-language setting. We find that adding few-shot examples and related keywords consistently improves translation quality, that example diversity must be balanced with source similarity, and that our pipeline is overly restrictive for example selection when a targeted development set is available.
2024.eacl-srw.13
aycock-bawden-2024-topic
+
Reforging : A Method for Constructing a Linguistically Valid Japanese CCG Treebank
@@ -3132,6 +3366,7 @@
The linguistic validity of Combinatory Categorial Grammar (CCG) parsing results relies heavily on treebanks for training and evaluation, so the treebank construction is crucial. Yet the current Japanese CCG treebank is known to have inaccuracies in its analyses of Japanese syntactic structures, including passive and causative constructions. While ABCTreebank, a treebank for ABC grammar, has been made to improve the analysis, particularly of argument structures, it lacks the detailed syntactic features required for Japanese CCG. In contrast, the Japanese CCG parser, lightblue, efficiently provides detailed syntactic features, but it does not accurately capture argument structures. We propose a method to generate a linguistically valid Japanese CCG treebank with detailed information by combining the strengths of ABCTreebank and lightblue. We develop an algorithm that filters lightblue’s lexical items using ABCTreebank, effectively converting lightblue output into a linguistically valid CCG treebank. To evaluate our treebank, we manually evaluate CCG syntactic structures and semantic representations and analyze conversion rates.
2024.eacl-srw.14
tomita-etal-2024-reforging
+
Thesis Proposal: Detecting Agency Attribution
@@ -3141,6 +3376,7 @@
We explore computational methods for perceived agency attribution in natural language data. We consider ‘agency’ as the freedom and capacity to act, and the corresponding Natural Language Processing (NLP) task involves automatically detecting attributions of agency to entities in text. Our theoretical framework draws on semantic frame analysis, role labelling and related techniques. In initial experiments, we focus on the perceived agency of AI systems. To achieve this, we analyse a dataset of English-language news coverage of AI-related topics, published within one year surrounding the release of the Large Language Model-based service ChatGPT, a milestone in the general public’s awareness of AI. Building on this, we propose a schema to annotate a dataset for agency attribution and formulate additional research questions to answer by applying NLP models.
2024.eacl-srw.15
ryazanov-bjorklund-2024-thesis
+
A Thesis Proposal ClaimInspector Framework: A Hybrid Approach to Data Annotation using Fact-Checked Claims and LLMs
@@ -3149,6 +3385,7 @@
This thesis explores the challenges and limitations encountered in automated fact-checking processes, with a specific emphasis on data annotation in the context of misinformation. Despite the widespread presence of misinformation in multiple formats and across various channels, current efforts concentrate narrowly on textual claims sourced mainly from Twitter, resulting in datasets with considerably limited scope. Furthermore, the absence of automated control measures, coupled with the reliance on human annotation, which is very limited, increases the risk of noisy data within these datasets. This thesis proposal examines the existing methods, elucidates their limitations and explores the potential integration of claim detection subtasks and Large Language Models to mitigate these issues. It introduces ClaimInspector, a novel framework designed for a systematic collection of multimodal data from the internet. By implementing this framework, this thesis will propose a dataset comprising fact-checks alongside the corresponding claims made by politicians. Overall, this thesis aims to enhance the accuracy and efficiency of annotation processes, thereby contributing to automated fact-checking efforts.
2024.eacl-srw.16
bozkurt-2024-thesis
+
Large Language Models for Mathematical Reasoning: Progresses and Challenges
@@ -3162,6 +3399,7 @@
Mathematical reasoning serves as a cornerstone for assessing the fundamental cognitive capabilities of human intelligence. In recent times, there has been a notable surge in the development of Large Language Models (LLMs) geared towards the automated resolution of mathematical problems. However, the landscape of mathematical problem types is vast and varied, with LLM-oriented techniques undergoing evaluation across diverse datasets and settings. This diversity makes it challenging to discern the true advancements and obstacles within this burgeoning field. This survey endeavors to address four pivotal dimensions: i) a comprehensive exploration of the various mathematical problems and their corresponding datasets that have been investigated; ii) an examination of the spectrum of LLM-oriented techniques that have been proposed for mathematical problem-solving; iii) an overview of factors and concerns affecting LLMs in solving math; and iv) an elucidation of the persisting challenges within this domain. To the best of our knowledge, this survey stands as one of the first extensive examinations of the landscape of LLMs in the realm of mathematics, providing a holistic perspective on the current state, accomplishments, and future challenges in this rapidly evolving field.
2024.eacl-srw.17
ahn-etal-2024-large
+
Representation and Generation of Machine Learning Test Functions
@@ -3171,6 +3409,7 @@
Writing tests for machine learning (ML) code is a crucial step towards ensuring the correctness and reliability of ML software. At the same time, Large Language Models (LLMs) have been adopted at a rapid pace for various code generation tasks, making it a natural choice for many developers who need to write ML tests. However, the implications of using these models, and how the LLM-generated tests differ from human-written ones, are relatively unexplored. In this work, we examine the use of LLMs to extract representations of ML source code and tests in order to understand the semantic relationships between human-written test functions and LLM-generated ones, and annotate a set of LLM-generated tests for several important qualities including usefulness, documentation, and correctness. We find that programmers prefer LLM-generated tests to those selected using retrieval-based methods, and in some cases, to those written by other humans.
2024.eacl-srw.18
hassine-wilson-2024-representation
+
The Generative AI Paradox in Evaluation: “What It Can Solve, It May Not Evaluate”
@@ -3182,6 +3421,7 @@
This paper explores the assumption that Large Language Models (LLMs) skilled in generation tasks are equally adept as evaluators. We assess the performance of three LLMs and one open-source LM in Question-Answering (QA) and evaluation tasks using the TriviaQA (Joshi et al., 2017) dataset. Results indicate a significant disparity, with LLMs exhibiting lower performance in evaluation tasks compared to generation tasks. Intriguingly, we discover instances of unfaithful evaluation where models accurately evaluate answers in areas where they lack competence, underscoring the need to examine the faithfulness and trustworthiness of LLMs as evaluators. This study contributes to the understanding of “the Generative AI Paradox” (West et al., 2023), highlighting a need to explore the correlation between generative excellence and evaluation proficiency, and the necessity to scrutinize the faithfulness aspect in model evaluations.
2024.eacl-srw.19
oh-etal-2024-generative
+
Generative Data Augmentation using LLMs improves Distributional Robustness in Question Answering
@@ -3191,6 +3431,7 @@
Robustness in Natural Language Processing continues to be a pertinent issue, where state of the art models under-perform under naturally shifted distributions. In the context of Question Answering, work on domain adaptation methods continues to be a growing body of research. However, very little attention has been given to the notion of domain generalization under natural distribution shifts, where the target domain is unknown. With drastic improvements in the quality and access to generative models, we answer the question: How do generated datasets influence the performance of QA models under natural distribution shifts? We perform experiments on 4 different datasets under varying amounts of distribution shift, and analyze how “in-the-wild” generation can help achieve domain generalization. We take a two-step generation approach, generating both contexts and QA pairs to augment existing datasets. Through our experiments, we demonstrate how augmenting reading comprehension datasets with generated data leads to better robustness towards natural distribution shifts.
2024.eacl-srw.20
chowdhury-chadha-2024-generative
+
Japanese-English Sentence Translation Exercises Dataset for Automatic Grading
@@ -3204,6 +3445,7 @@
This paper proposes the task of automatic assessment of Sentence Translation Exercises (STEs), which have been used in the early stage of L2 language learning. We formalize the task as grading student responses for each rubric criterion pre-specified by the educators. We then create a dataset for STE between Japanese and English including 21 questions, along with a total of 3,498 student responses (167 on average). The answer responses were collected from students and crowd workers. Using this dataset, we demonstrate the performance of baselines including a fine-tuned BERT model and GPT-3.5 with few-shot learning. Experimental results showed that the baseline model with fine-tuned BERT was able to classify correct responses with approximately 90% in F_1, but only less than 80% for incorrect responses. Furthermore, GPT-3.5 with few-shot learning shows a poorer result than the BERT model, indicating that our newly proposed task presents a challenging issue, even for the state-of-the-art large language model.
2024.eacl-srw.21
miura-etal-2024-japanese
+
The Impact of Integration Step on Integrated Gradients
@@ -3215,6 +3457,7 @@
Integrated Gradients (IG) serve as a potent tool for explaining the internal structure of a language model. The calculation of IG requires numerical integration, wherein the number of steps serves as a critical hyperparameter. The step count can drastically alter the results, inducing considerable errors in interpretability. To scrutinize the effect of step variation on IG, we measured the difference between theoretical and observed IG totals for each step amount. Our findings indicate that the ideal number of steps to maintain minimal error varies from instance to instance. Consequently, we advocate for customizing the step count for each instance. Our study is the first to quantitatively analyze the variation of IG values with the number of steps.
2024.eacl-srw.22
makino-etal-2024-impact
+
GesNavi: Gesture-guided Outdoor Vision-and-Language Navigation
@@ -3226,6 +3469,7 @@
Vision-and-Language Navigation (VLN) task involves navigating mobility using linguistic commands and has application in developing interfaces for autonomous mobility. In reality, natural human communication also encompasses non-verbal cues like hand gestures and gaze. These gesture-guided instructions have been explored in Human-Robot Interaction systems for effective interaction, particularly in object-referring expressions. However, a notable gap exists in tackling gesture-based demonstrative expressions in outdoor VLN task. To address this, we introduce a novel dataset for gesture-guided outdoor VLN instructions with demonstrative expressions, designed with a focus on complex instructions requiring multi-hop reasoning between the multiple input modalities. In addition, our work also includes a comprehensive analysis of the collected data and a comparative evaluation against the existing datasets.
2024.eacl-srw.23
jain-etal-2024-gesnavi
+
Can docstring reformulation with an LLM improve code generation?
@@ -3236,6 +3480,7 @@
Generating code is an important application of Large Language Models (LLMs) and the task of function completion is one of the core open challenges in this context. Existing approaches focus on either training, fine-tuning or prompting LLMs to generate better outputs given the same input. We propose a novel and complementary approach: to optimize part of the input, the docstring (summary of a function’s purpose and usage), via reformulation with an LLM, in order to improve code generation. We develop two baseline methods for optimizing code generation via docstring reformulation and test them on the original HumanEval benchmark and multiple curated variants which are made more challenging by realistically worsening the docstrings. Our results show that, when operating on docstrings reformulated by an LLM instead of the original (or worsened) inputs, the performance of a number of open-source LLMs does not change significantly. This finding demonstrates an unexpected robustness of current open-source LLMs to the details of the docstrings. We conclude by examining a series of questions, accompanied by in-depth analyses, pertaining to the sensitivity of current open-source LLMs to the details in the docstrings, the potential for improvement via docstring reformulation and the limitations of the methods employed in this work.
2024.eacl-srw.24
dainese-etal-2024-docstring
+
Benchmarking Diffusion Models for Machine Translation
@@ -3246,6 +3491,7 @@
Diffusion models have recently shown great potential on many generative tasks. In this work, we explore diffusion models for machine translation (MT). We adapt two prominent diffusion-based text generation models, Diffusion-LM and DiffuSeq, to perform machine translation. As the diffusion models generate non-autoregressively (NAR), we draw parallels to NAR machine translation models. With a comparison to conventional Transformer-based translation models, as well as to the Levenshtein Transformer, an established NAR MT model, we show that the multimodality problem that limits NAR machine translation performance is also a challenge to diffusion models. We demonstrate that knowledge distillation from an autoregressive model improves the performance of diffusion-based MT. A thorough analysis on the translation quality of inputs of different lengths shows that the diffusion models struggle more on long-range dependencies than other models.
2024.eacl-srw.25
demirag-etal-2024-benchmarking
+
Forged-GAN-BERT: Authorship Attribution for LLM-Generated Forged Novels
@@ -3259,6 +3505,7 @@
The advancement of generative Large Language Models (LLMs), capable of producing human-like texts, introduces challenges related to the authenticity of the text documents. This requires exploring potential forgery scenarios within the context of authorship attribution, especially in the literary domain. Particularly, two aspects of doubted authorship may arise in novels, as a novel may be imposed by a renowned author or include a copied writing style of a well-known novel. To address these concerns, we introduce Forged-GAN-BERT, a modified GAN-BERT-based model to improve the classification of forged novels in two data-augmentation aspects: via the Forged Novels Generator (i.e., ChatGPT) and the generator in GAN. Compared to other transformer-based models, the proposed Forged-GAN-BERT model demonstrates an improved performance with F1 scores of 0.97 and 0.71 for identifying forged novels in single-author and multi-author classification settings. Additionally, we explore different prompt categories for generating the forged novels to analyse the quality of the generated texts using different similarity distance measures, including ROUGE-1, Jaccard Similarity, Overlap Confident, and Cosine Similarity.
2024.eacl-srw.26
silva-etal-2024-forged
+
Thesis Proposal: Detecting Empathy Using Multimodal Language Model
@@ -3271,6 +3518,7 @@
Empathy is crucial in numerous social interactions, including human-robot, patient-doctor, teacher-student, and customer-call centre conversations. Despite its importance, empathy detection in videos continues to be a challenging task because of the subjective nature of empathy and often remains under-explored. Existing studies have relied on scripted or semi-scripted interactions in text-, audio-, or video-only settings that fail to capture the complexities and nuances of real-life interactions. This PhD research aims to fill these gaps by developing a multimodal language model (MMLM) that detects empathy in audiovisual data. In addition to leveraging existing datasets, the proposed study involves collecting real-life interaction video and audio. This study will leverage optimisation techniques like neural architecture search to deliver an optimised small-scale MMLM. Successful implementation of this project has significant implications in enhancing the quality of social interactions as it enables real-time measurement of empathy and thus provides potential avenues for training for better empathy in interactions.
2024.eacl-srw.27
hasan-etal-2024-thesis
+
Toward Sentiment Aware Semantic Change Analysis
@@ -3280,6 +3528,7 @@
This student paper explores the potential of augmenting computational models of semantic change with sentiment information. It tests the efficacy of this approach on the English SemEval of Lexical Semantic Change and its associated historical corpora. We first establish the feasibility of our approach by demonstrating that existing models extract reliable sentiment information from historical corpora, and then validate that words that underwent semantic change also show greater sentiment change in comparison to historically stable words. We then integrate sentiment information into standard models of semantic change for individual words, and test if this can improve the overall performance of the latter, showing mixed results. This research contributes to our understanding of language change by providing the first attempt to enrich standard models of semantic change with additional information. It taps into the multifaceted nature of language change, that should not be reduced only to binary or scalar report of change, but adds additional dimensions to this change, sentiment being only one of these. As such, this student paper suggests novel directions for future work in integrating additional, more nuanced information of change and interpretation for finer-grained semantic change analysis.
2024.eacl-srw.28
goworek-dubossarsky-2024-toward
+
Dynamic Task-Oriented Dialogue: A Comparative Study of Llama-2 and Bert in Slot Value Generation
@@ -3290,6 +3539,7 @@
Recent advancements in instruction-based language models have demonstrated exceptional performance across various natural language processing tasks. We present a comprehensive analysis of the performance of two open-source language models, BERT and Llama-2, in the context of dynamic task-oriented dialogues. Focusing on the Restaurant domain and utilizing the MultiWOZ 2.4 dataset, our investigation centers on the models’ ability to generate predictions for masked slot values within text. The dynamic aspect is introduced through simulated domain changes, mirroring real-world scenarios where new slot values are incrementally added to a domain over time. This study contributes to the understanding of instruction-based models’ effectiveness in dynamic natural language understanding tasks when compared to traditional language models and emphasizes the significance of open-source, reproducible models in advancing research within the academic community.
2024.eacl-srw.29
labruna-etal-2024-dynamic
+
@@ -3319,6 +3569,7 @@
Languages change constantly over time, influenced by social, technological, cultural and political factors that affect how people express themselves. In particular, words can undergo the process of semantic change, which can be subtle and significantly impact the interpretation of texts. For example, the word terrific used to mean ‘causing terror’ and was as such synonymous to terrifying. Nowadays, speakers use the word in the sense of ‘excessive’ and even ‘amazing’. In Historical Linguistics, tools and methods have been developed to analyse this phenomenon, including systematic categorisations of the types of change, the causes and the mechanisms underlying the different types of change. However, traditional linguistic methods, while informative, are often based on small, carefully curated samples. Thanks to the availability of both large diachronic corpora, the computational means to model word meaning unsupervised, and evaluation benchmarks, we are seeing an increasing interest in the computational modelling of semantic change. This is evidenced by the increasing number of publications in this new domain as well as the organisation of initiatives and events related to this topic, such as four editions of the International Workshop on Computational Approaches to Historical Language Change LChange, and several evaluation campaigns (Schlechtweg et al., 2020a; Basile et al., 2020b; Kutuzov et al.; Zamora-Reina et al., 2022).
2024.eacl-tutorials.1
cassotti-etal-2024-computational
+
Item Response Theory for Natural Language Processing
@@ -3330,6 +3581,7 @@
This tutorial will introduce the NLP community to Item Response Theory (IRT; Baker 2001). IRT is a method from the field of psychometrics for model and dataset assessment. IRT has been used for decades to build test sets for human subjects and estimate latent characteristics of dataset examples. Recently, there has been an uptick in work applying IRT to tasks in NLP. It is our goal to introduce the wider NLP community to IRT and show its benefits for a number of NLP tasks. From this tutorial, we hope to encourage wider adoption of IRT among NLP researchers.
2024.eacl-tutorials.2
lalor-etal-2024-item
+
Language + Molecules
@@ -3340,6 +3592,7 @@
Climate change, access to food and water, pandemics–the world faces an enormous number of problems in the coming decades on scales of complexity never-before-seen. To address these issues, development of scientific solutions which are scalable, flexible, and inexpensive is critical. Over the last couple years, considerable interest has arisen for applying natural language-driven solutions to these problems. Particularly, the chemistry field is poised to be substantially accelerated by language+molecule models. This tutorial is designed to provide an introduction to this area of research. It requires no knowledge outside mainstream NLP, and it will enable participants to begin exploring relevant research. By discussing cutting-edge work, we will highlight the key roles language can fill for 1) abstract, compositional control of generative models, 2) bridging different biochemical modalities, 3) planning experimental procedures, and 4) broadening access to computational approaches. Beyond this, language models have also seen considerable success when applied to proteins or molecule structures, which can be considered as ‘exotic’ languages, and computational linguistics researchers’ expertise can be highly valuable for these impactful, possibly life-saving tasks.
2024.eacl-tutorials.3
edwards-etal-2024-language
+
Transformer-specific Interpretability
@@ -3363,6 +3616,7 @@
The recent breakthroughs in Artificial Intelligence (AI) can be attributed to the remarkable performance of Large Language Models (LLMs) across a spectrum of research areas (e.g., machine translation, question-answering, automatic speech recognition, text-to-speech generation) and application domains (e.g., business, law, healthcare, education, and psychology). The success of these LLMs largely depends on specific training techniques, most notably instruction tuning, RLHF, and subsequent prompting to achieve the desired output. As the development of such LLMs continues to increase in both closed and open settings, evaluation has become crucial for understanding their generalization capabilities across different tasks, modalities, languages, and dialects. This evaluation process is tightly coupled with prompting, which plays a key role in obtaining better outputs. There have been attempts to evaluate such models focusing on diverse tasks, languages, and dialects, which suggests that the capabilities of LLMs are still limited to medium-to-low-resource languages due to the lack of representative datasets. The tutorial offers an overview of this emerging research area. We explore the capabilities of LLMs in terms of their performance, zero- and few-shot settings, fine-tuning, instruction tuning, and closed vs. open models with a special emphasis on low-resource settings. In addition to LLMs for standard NLP tasks, we will focus on speech and multimodality.
2024.eacl-tutorials.5
alam-etal-2024-llms
+
diff --git a/data/xml/2024.ecnlp.xml b/data/xml/2024.ecnlp.xml
new file mode 100644
index 0000000000..bced3e2451
--- /dev/null
+++ b/data/xml/2024.ecnlp.xml
@@ -0,0 +1,200 @@
+
+
+
+
+ Proceedings of the Seventh Workshop on e-Commerce and NLP @ LREC-COLING 2024
+ ShervinMalmasi
+ BesnikFetahu
+ NicolaUeffing
+ OlegRokhlenko
+ EugeneAgichtein
+ IdoGuy
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.ecnlp-1
+ ecnlp
+ ws
+
+
+ 2024.ecnlp-1.0
+ ecnlp-2024-e
+
+
+ Learning Reasons for Product Returns on E-Commerce
+ MiriamFarber
+ SlavaNovgorodov
+ IdoGuy
+ 1–7
+ In the rapidly evolving landscape of e-commerce, product returns have become a significant economic burden for businesses, where the reasons for returns may vary from wrong sizing and defective products to simply no longer needing the purchased product. This paper presents, to the best of our knowledge, the first comprehensive study of the complexities of product returns across a variety of e-commerce domains, focusing on the task of predicting the return reason. We propose a supervised approach for predicting return likelihood and the underlying return reason. We test our approach over a real-world dataset from a large e-commerce platform.
+ 2024.ecnlp-1.1
+ farber-etal-2024-learning
+
+
+ Towards Multi-Modal Co-Reference Resolution in Conversational Shopping Agents
+ SamuelOsebe
+ PrashanWanigasekara
+ ThomasGueudre
+ ThanhTran
+ RahulSharma
+ FanYang
+ QianHu
+ WeitongRuan
+ EmreBarut
+ ChengweiSu
+ 8–18
+ The context of modern smart voice assistants is often multi-modal, where images, audio and video content are consumed by users simultaneously. In such a setup, co-reference resolution is especially challenging, and runs across modalities and dialogue turns. We explore the problem of multi-modal co-reference resolution in multi-turn dialogues and quantify the performance of multi-modal LLMs on a specially curated dataset of long, image-interleaved conversations between a voice assistant and human in a shopping use case. We propose a custom architecture for multi-modal embedding alignment using a novel parameter augmentation technique. Our proposed Parameter Augmented LLM approach shows a 4.9% absolute F1 improvement above a cross-attention baseline while reducing the number of parameters being trained by 4x.
+ 2024.ecnlp-1.2
+ osebe-etal-2024-towards
+
+
+ Efficient and Interpretable Information Retrieval for Product Question Answering with Heterogeneous Data
+ BiplobBiswas
+ RajivRamnath
+ 19–28
+ Expansion-enhanced sparse lexical representation improves information retrieval (IR) by minimizing vocabulary mismatch problems during lexical matching. In this paper, we explore the potential of jointly learning dense semantic representation and combining it with the lexical one for ranking candidate information. We present a hybrid information retrieval mechanism that maximizes lexical and semantic matching while minimizing their shortcomings. Our architecture consists of dual hybrid encoders that independently encode queries and information elements. Each encoder jointly learns a dense semantic representation and a sparse lexical representation augmented by a learnable term expansion of the corresponding text through contrastive learning. We demonstrate the efficacy of our model in single-stage ranking of a benchmark product question-answering dataset containing the typical heterogeneous information available on online product pages. Our evaluation demonstrates that our hybrid approach outperforms independently trained retrievers by 10.95% (sparse) and 2.7% (dense) in MRR@5 score. Moreover, our model offers better interpretability and performs comparably to state-of-the-art cross-encoders while reducing response time by 30% (latency) and cutting computational load by approximately 38% (FLOPs).
+ 2024.ecnlp-1.3
+ biswas-ramnath-2024-efficient
+
+
+ Hallucination Detection in LLM-enriched Product Listings
+ LingJiang
+ KeerJiang
+ XiaoyuChu
+ SaaranshGulati
+ PulkitGarg
+ 29–39
+ E-commerce faces persistent challenges with data quality issue of product listings. Recent advances in Large Language Models (LLMs) offer a promising avenue for automated product listing enrichment. However, LLMs are prone to hallucinations, which we define as the generation of content that is unfaithful to the source input. This poses significant risks in customer-facing applications. Hallucination detection is particularly challenging in the vast e-commerce domain, where billions of products are sold. In this paper, we propose a two-phase approach for detecting hallucinations in LLM-enriched product listings. The first phase prioritizes recall through cost-effective unsupervised techniques. The second phase maximizes precision by leveraging LLMs to validate candidate hallucinations detected in phase one. The first phase significantly reduces the inference space and enables the resource-intensive methods in the second phase to scale effectively. Experiments on two real-world datasets demonstrated that our approach achieved satisfactory recall on unstructured product attributes with suboptimal precision, primarily due to the inherent ambiguity of unstructured attributes and the presence of common sense reasoning. This highlights the necessity for a refined approach to distinguish between common sense and hallucination. On structured attributes with clearly defined hallucinations, our approach effectively detected hallucinations with precision and recall surpassing the targeted level.
+ 2024.ecnlp-1.4
+ jiang-etal-2024-hallucination
+
+
+ Self-Improving Customer Review Response Generation Based on LLMs
+ GuyAzov
+ TatianaPelc
+ AdiFledel Alon
+ GilaKamhi
+ 40–57
+ Previous studies have demonstrated that proactive interaction with user reviews has a positive impact on the perception of app users and encourages them to submit revised ratings. Nevertheless, developers encounter challenges in managing a high volume of reviews, particularly in the case of popular apps with a substantial influx of daily reviews. Consequently, there is a demand for automated solutions aimed at streamlining the process of responding to user reviews. To address this, we have developed a new system for generating automatic responses by leveraging user-contributed documents with the help of retrieval-augmented generation (RAG) and advanced Large Language Models (LLMs). Our solution, named SCRABLE, represents an adaptive customer review response automation that enhances itself with self-optimizing prompts and a judging mechanism based on LLMs. Additionally, we introduce an automatic scoring mechanism that mimics the role of a human evaluator to assess the quality of responses generated in customer review domains. Extensive experiments and analyses conducted on real-world datasets reveal that our method is effective in producing high-quality responses, yielding improvement of more than 8.5% compared to the baseline. Further validation through manual examination of the generated responses underscores the efficacy of our proposed system.
+ 2024.ecnlp-1.5
+ azov-etal-2024-self
+
+
+ Don’t Just Translate, Summarize Too: Cross-lingual Product Title Generation in E-commerce
+ BryanZhang
+ TaichiNakatani
+ Daniel VidalHussey
+ StephanWalter
+ LilingTan
+ 58–64
+ Making product titles informative and concise is vital to delighting e-commerce customers. Recent advances have successfully applied monolingual product title summarization to shorten lengthy product titles. This paper explores the cross-lingual product title generation task that summarizes and translates the source language product title to a shortened product title in the target language. Our main contributions are as follows, (i) we investigate the optimal product title length within the scope of e-commerce localization, (ii) we introduce a simple yet effective data filtering technique to train a length-aware machine translation system and compare it to a publicly available LLM, (iii) we propose an automatic approach to validate experimental results using an open-source LLM without human input and show that these evaluation results are consistent with human preferences.
+ 2024.ecnlp-1.6
+ zhang-etal-2024-dont
+
+
+ Turkish Typo Correction for E-Commerce Search Engines
+ ElifOral
+ KorayMancuhan
+ Hüseyin VarolErdem
+ Pınar EceHatipoglu
+ 65–73
+ Typo correction is a challenging problem when developed for morphologically rich languages. The existing approaches in the literature are successful mainly for English, leaving the problem open for such languages. This creates an issue, because typo correction is a critical component in practice for many systems such as search engines. In particular, the search engines of e-commerce platforms rely heavily on typo correction for product relevancy. A badly performing typo corrector could return very few relevant products when a user is looking for a product on an e-commerce platform, resulting in a significant revenue decrease. For the first time in the literature, this paper proposes a modern typo corrector for a morphologically rich language, Turkish, which is integrated into the search engine of one of the leading e-commerce platforms in Turkey, Hepsiburada. Our thorough experiments show that this new typo corrector performs very successfully in practice, outperforming the existing Turkish-specific propositions in the literature, even when applied outside the context of search engines.
+ 2024.ecnlp-1.7
+ oral-etal-2024-turkish
+
+
+ Detecting AI-enhanced Opinion Spambots: a study on LLM-generated Hotel Reviews
+ DavideBuscaldi
+ VijiniLiyanage
+ 74–78
+ Opinion spamming is the posting of fake opinions or reviews to promote or discredit target products, services, or individuals. The concern surrounding this activity has grown steadily, especially because of the development of automated bots for this purpose (“spambots”). Nowadays, Large Language Models (LLMs) have proved their ability to generate text that is almost indistinguishable from human-written text. Therefore, there is a growing concern regarding the use of these models for malicious purposes, among them opinion spamming. In this paper, we carry out a study on LLM-generated reviews, in particular hotel reviews, as we chose the well-known Opinion Spam corpus by Myle Ott as the seed for our dataset. We generated a set of fake reviews with various models and applied different classification algorithms to verify how difficult it is to detect this kind of generated content. The results show that by providing enough training data, it is not difficult to detect the fake reviews generated by such models, as they tend to associate the aspects in the reviews with the same attributes.
+ 2024.ecnlp-1.8
+ buscaldi-liyanage-2024-detecting
+
+
+ Assessing Image-Captioning Models: A Novel Framework Integrating Statistical Analysis and Metric Patterns
+ QiaomuLi
+ YingXie
+ NinaGrundlingh
+ Varsha RaniChawan
+ CodyWang
+ 79–87
+ In this study, we present a novel evaluation framework for image-captioning models that integrates statistical analysis with common evaluation metrics, utilizing two popular datasets, FashionGen and Amazon, with contrasting dataset variation to evaluate four models: Video-LLaVa, BLIP, CoCa and ViT-GPT2. Our approach not only reveals the comparative strengths of models, offering insights into their adaptability and applicability in real-world scenarios, but also contributes to the field by providing a comprehensive evaluation method that considers both statistical significance and practical relevance to guide the selection of models for specific applications. Specifically, we propose Rank Score as a new evaluation metric that is designed for e-commerce image search applications and employ CLIP Score to quantify dataset variation to offer a holistic view of model performance.
+ 2024.ecnlp-1.9
+ li-etal-2024-assessing
+
+
+ Frogs into princes: A generative model to understand the success of product descriptions
+ TakehiroTakayanagi
+ BrunoCharron
+ MarcoVisentini-Scarzanella
+ 88–96
+ In the dynamic marketplace, vendors continuously seek innovative ideas for new products and ways to improve existing ones. These ideas can be uncovered by analyzing text data, such as product descriptions and customer reviews. However, the ever-increasing volume of text data poses a challenge in extracting meaningful insights. Therefore, this study addresses the challenge of extracting actionable insights from the growing volume of text data, with a specific focus on product descriptions. To this end, we investigate two primary research questions: the predictive power of product descriptions for product success, and the capability of style transfer to highlight the successful factors of these descriptions. In response to the first question, our findings validate that product descriptions are indeed reliable indicators of product success. Addressing our second question, we propose a Successful Style Transfer Variational Autoencoder (SST-VAE), a VAE-based language model designed for effective successful style transfer. Qualitative analysis indicates that the SST-VAE effectively enables successful style transfer conditional on a given label. In addition, case studies suggest that the proposed approach could be useful in gaining insights about product success, by highlighting key factors that may contribute to their success. On the other hand, our approach confronts issues such as hallucinations and the need for factual accuracy. These challenges underscore the necessity for continued research in the field of e-commerce natural language processing.
+ 2024.ecnlp-1.10
+ takayanagi-etal-2024-frogs
+
+
+ STA: Self-controlled Text Augmentation for Improving Text Classifications
+ CongcongWang
+ GonzaloFiz Pontiveros
+ StevenDerby
+ TriKurniawan Wijaya
+ 97–114
+ Despite recent advancements in Machine Learning, many tasks still involve working in low-data regimes which can make solving natural language problems difficult. Recently, a number of text augmentation techniques have emerged in the field of Natural Language Processing (NLP) which can enrich the training data with new examples, though they are not without their caveats. For instance, simple rule-based heuristic methods are effective, but lack variation in semantic content and syntactic structure with respect to the original text. On the other hand, more complex deep learning approaches can cause extreme shifts in the intrinsic meaning of the text and introduce unwanted noise into the training data. To more reliably control the quality of the augmented examples, we introduce a state-of-the-art approach for Self-Controlled Text Augmentation (STA). Our approach tightly controls the generation process by introducing a self-checking procedure to ensure that generated examples retain the semantic content of the original text. Experimental results on multiple benchmarking datasets demonstrate that STA substantially outperforms existing state-of-the-art techniques, whilst qualitative analysis reveals that the generated examples are both lexically diverse and semantically reliable.
+ 2024.ecnlp-1.11
+ wang-etal-2024-sta
+
+
+ Multi-word Term Embeddings Improve Lexical Product Retrieval
+ ViktorShcherbakov
+ FedorKrasnov
+ 115–124
+ Product search is uniquely different from search for documents, Internet resources or vacancies, therefore it requires the development of specialized search systems. The present work describes the H1 embedding model, designed for an offline term indexing of product descriptions at e-commerce platforms. The model is compared to other state-of-the-art (SoTA) embedding models within the framework of a hybrid product search system that incorporates the advantages of lexical methods for product retrieval and semantic embedding-based methods. We propose an approach to building semantically rich term vocabularies for search indexes. Compared to other production semantic models, H1 paired with the proposed approach stands out due to its ability to process multi-word product terms as one token. As an example, for search queries “new balance shoes”, “gloria jeans kids wear” the brand entity will be represented as one token - “new balance”, “gloria jeans”. This results in an increased precision of the system without affecting the recall. The hybrid search system with the proposed model scores mAP@12 = 56.1% and R@1k = 86.6% on the WANDS public dataset, beating other SoTA analogues.
+ 2024.ecnlp-1.12
+ shcherbakov-krasnov-2024-multi
+
+
+ Explicit Attribute Extraction in e-Commerce Search
+ RobynLoughnane
+ JiaxinLiu
+ ZhilinChen
+ ZhiqiWang
+ JosephGiroux
+ TianchuanDu
+ BenjaminSchroeder
+ WeiyiSun
+ 125–135
+ This paper presents a model architecture and training pipeline for attribute value extraction from search queries. The model uses weak labels generated from customer interactions to train a transformer-based NER model. A two-stage normalization process is then applied to deal with the problem of a large label space: first, the model output is normalized onto common generic attribute values, then it is mapped onto a larger range of actual product attribute values. This approach lets us successfully apply a transformer-based NER model to the extraction of a broad range of attribute values in a real-time production environment for e-commerce applications, contrary to previous research. In an online test, we demonstrate business value by integrating the model into a system for semantic product retrieval and ranking.
+ 2024.ecnlp-1.13
+ loughnane-etal-2024-explicit
+
+ Added references.
+
+
+ TAAL: Target-Aware Active Learning
+ KunalKotian
+ IndranilBhattacharya
+ ShikharGupta
+ KaushikPavani
+ NavalBhandari
+ SunnyDasgupta
+ 136–144
+ Pool-based active learning techniques have had success producing multi-class classifiers that achieve high accuracy with fewer labels compared to random labeling. However, in an industrial setting where we often have class-level business targets to achieve (e.g., 95% recall at 95% precision for each class), active learning techniques continue to acquire labels for classes that have already met their targets, thus consuming unnecessary manual annotations. We address this problem by proposing a framework called Target-Aware Active Learning that converts any active learning query strategy into its target-aware variant by leveraging the gap between each class’ current estimated accuracy and its corresponding business target. We show empirically that target-aware variants of state-of-the-art active learning techniques achieve business targets faster on 2 open-source image classification datasets and 2 proprietary product classification datasets.
+ 2024.ecnlp-1.14
+ kotian-etal-2024-taal
+
+
+ Cluster Language Model for Improved E-Commerce Retrieval and Ranking: Leveraging Query Similarity and Fine-Tuning for Personalized Results
+ DuleepRathgamage Don
+ YingXie
+ LeYu
+ SimonHughes
+ YunZhu
+ 145–153
+ This paper proposes a novel method to improve the accuracy of product search in e-commerce by utilizing a cluster language model. The method aims to address the limitations of the bi-encoder architecture while maintaining a minimal additional training burden. The approach involves labeling top products for each query, generating semantically similar query clusters using the K-Means clustering algorithm, and fine-tuning a global language model into cluster language models on individual clusters. The parameters of each cluster language model are fine-tuned to learn local manifolds in the feature space efficiently, capturing the nuances of various query types within each cluster. The inference is performed by assigning a new query to its respective cluster and utilizing the corresponding cluster language model for retrieval. The proposed method results in more accurate and personalized retrieval results, offering a superior alternative to the popular bi-encoder based retrieval models in semantic search.
+ 2024.ecnlp-1.15
+ rathgamage-don-etal-2024-cluster
+
+
+
diff --git a/data/xml/2024.eurali.xml b/data/xml/2024.eurali.xml
new file mode 100644
index 0000000000..d73763ae17
--- /dev/null
+++ b/data/xml/2024.eurali.xml
@@ -0,0 +1,108 @@
+
+
+
+
+ Proceedings of the 2nd Workshop on Resources and Technologies for Indigenous, Endangered and Lesser-resourced Languages in Eurasia (EURALI) @ LREC-COLING 2024
+ Atul Kr.Ojha
+ SinaAhmadi
+ SilvieCinková
+ TheodorusFransen
+ Chao-HongLiu
+ John P.McCrae
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.eurali-1
+ eurali
+ ws
+
+
+ 2024.eurali-1.0
+ eurali-2024-resources
+
+
+ Low-Resource Machine Translation through Retrieval-Augmented LLM Prompting: A Study on the Mambai Language
+ RaphaëlMerx
+ AsoMahmudi
+ KatrinaLangford
+ Leo Albertode Araujo
+ EkaterinaVylomova
+ 1–11
+ This study explores the use of large language models (LLMs) for translating English into Mambai, a low-resource Austronesian language spoken in Timor-Leste, with approximately 200,000 native speakers. Leveraging a novel corpus derived from a Mambai language manual and additional sentences translated by a native speaker, we examine the efficacy of few-shot LLM prompting for machine translation (MT) in this low-resource context. Our methodology involves the strategic selection of parallel sentences and dictionary entries for prompting, aiming to enhance translation accuracy, using open-source and proprietary LLMs (LlaMa 2 70b, Mixtral 8x7B, GPT-4). We find that including dictionary entries in prompts and a mix of sentences retrieved through TF-IDF and semantic embeddings significantly improves translation quality. However, translation accuracy varies between test sets, highlighting the importance of diverse corpora for evaluating low-resource MT. This research provides insights into few-shot LLM prompting for low-resource MT, and makes available an initial corpus for the Mambai language.
+ 2024.eurali-1.1
+ merx-etal-2024-low
+
+
+ Improved Neural Word Segmentation for Standard Tibetan
+ Collin J.Brown
+ 12–17
+ As Tibetan is traditionally not written with word delimiters, various means of word segmentation are necessary to prepare data for downstream tasks. Neural word segmentation has proven a successful means of parsing Tibetan text, but current performance lags behind that of neural word segmenters in other languages, such as Chinese or Japanese, and even behind languages with relatively similar orthographic structures, such as Vietnamese or Thai. We apply methods that have proven useful for these latter two languages, in addition to Classical Tibetan, toward the development of a neural word segmenter with the goal of raising the peak performance of Tibetan neural word segmentation to a level comparable to that reached for orthographically similar languages.
+ 2024.eurali-1.2
+ brown-2024-improved
+
+
+ Open Text Collections as a Resource for Doing NLP with Eurasian Languages
+ SebastianNordhoff
+ ChristianDöhler
+ MandanaSeyfeddinipur
+ 18–23
+ The Open Text Collections project establishes a high-quality publication channel for interlinear glossed text from endangered languages. Text collections will be made available in an open interoperable format and as a more traditional book publication. The project addresses a variety of audiences, e.g., community members, typological linguists, anthropologists, NLP practitioners.
+ 2024.eurali-1.3
+ nordhoff-etal-2024-open
+
+
+ The Extraction and Fine-grained Classification of Written Cantonese Materials through Linguistic Feature Detection
+ Chaak-mingLau
+ MingfeiLau
+ Ann Wai HuenTo
+ 24–29
+ This paper presents a linguistically-informed, non-machine-learning tool for classifying Written Cantonese, Standard Written Chinese, and the intermediate varieties used by Cantonese-speaking users from Hong Kong, which are often grouped into a single “Traditional Chinese” label. Our approach addresses the lack of textual materials for Cantonese NLP, a consequence of a lower sociolinguistic status of Written Cantonese and the interchangeable use of these varieties by users without sufficient language labeling. The tool utilizes key strings and quotation markers, which can be reduced to string operations, to effectively extract Written Cantonese sentences and documents from materials mixed with Standard Written Chinese. This allows for the flexible and efficient extraction of high-quality Cantonese data from large datasets, catering to specific classification needs. This implementation ensures that the tool can process large amounts of data at a low cost by bypassing model-inferencing, which is particularly significant for marginalized languages. The tool also aims to provide a baseline measure for future classification systems, and the approach may be applicable to other low-resource regional or diglossic languages.
+ 2024.eurali-1.4
+ lau-etal-2024-extraction
+
+
+ Neural Mining of Persian Short Argumentative Texts
+ MohammadYeghaneh Abkenar
+ ManfredStede
+ 30–35
+ Argumentation mining (AM) is concerned with extracting arguments from texts and classifying the elements (e.g., claim and premise) and relations between them, as well as creating an argumentative structure. A significant hurdle to research in this area for the Persian language is the lack of annotated Persian language corpora. This paper introduces the first argument-annotated corpus in Persian and thereby the possibility of expanding argumentation mining to this low-resource language. The starting point is the English argumentative microtext corpus (AMT) (Peldszus and Stede, 2015), and we built the Persian variant by machine translation (MT) and careful post-editing of the output. We call this corpus Persian argumentative microtext (PAMT). Moreover, we present the first results for Argumentative Discourse Unit (ADU) classification for Persian, which is considered to be one of the main fundamental subtasks of argumentation mining. We adopted span categorization using the deep learning model of spaCy Version 3.0 (a CNN model on top of Bloom embedding with attention) on the corpus for determining argumentative units and their type (claim vs. premise).
+ 2024.eurali-1.5
+ yeghaneh-abkenar-stede-2024-neural
+
+
+ Endangered Language Preservation: A Model for Automatic Speech Recognition Based on Khroskyabs Data
+ RuiyaoLi
+ YunfanLai
+ 36–40
+ This is a report on an Automatic Speech Recognition (ASR) experiment conducted using the Khroskyabs data. With the impact of information technology development and globalization challenges on linguistic diversity, this study focuses on the preservation crisis of the endangered Gyalrongic language, particularly the Khroskyabs language. We used Automatic Speech Recognition technology and the Wav2Vec2 model to transcribe the Khroskyabs language. Despite challenges such as data scarcity and the language’s complex morphology, preliminary results show promising character accuracy from the model. Additionally, the linguist has also given relatively high evaluations to the transcription results of our model. Therefore, the experimental and evaluation results demonstrate the high practicality of our model. At the same time, the results also reveal issues with high word error rates, so we plan to augment our existing dataset with additional Khroskyabs data in our further studies. This study provides insights and methodologies for using Automatic Speech Recognition to transcribe and protect Khroskyabs, and we hope that this can contribute to the preservation efforts of other endangered languages.
+ 2024.eurali-1.6
+ li-lai-2024-endangered
+
+
+ This Word Mean What: Constructing a Singlish Dictionary with ChatGPT
+ Siew YengChow
+ Chang-UkShin
+ FrancisBond
+ 41–50
+ Despite the magnitude of recent progress in natural language processing and multilingual language modeling research, the vast majority of NLP research is focused on English and other major languages. This is because recent NLP research is mainly data-driven, and there is more data for resource-rich languages. In particular, Large Language Models (LLM) make use of large unlabeled datasets, a resource that many languages do not have. In this project, we built a new, open-sourced dictionary of Singlish, a contact variety that contains features from English and other local languages and is syntactically, phonologically and lexically distinct from Standard English (Tan, 2010). First, a list of Singlish words was extracted from various online sources. Then, using an open ChatGPT LLM API, the description, including the definition, part of speech, pronunciation and examples, was produced. These were then refined through post-processing carried out by a native speaker. The dictionary currently has 1,783 entries and is published under the CC-BY-SA license. The project was carried out with the intention of facilitating future Singlish research and other applications, as the accumulation and management of language resources will be of great help in promoting research on the language in the future.
+ 2024.eurali-1.7
+ chow-etal-2024-word
+
+
+ An Evaluation of Language Models for Hyperpartisan Ideology Detection in Persian Twitter
+ SaharOmidi Shayegan
+ IsarNejadgholi
+ KellinPelrine
+ HaoYu
+ SachaLevy
+ ZacharyYang
+ Jean-FrançoisGodbout
+ ReihanehRabbany
+ 51–62
+ Large Language Models (LLMs) have shown significant promise in various tasks, including identifying the political beliefs of English-speaking social media users from their posts. However, assessing LLMs for this task in non-English languages remains unexplored. In this work, we ask to what extent LLMs can predict the political ideologies of users in Persian social media. To answer this question, we first acknowledge that political parties are not well-defined among Persian users, and therefore, we simplify the task to a much simpler task of hyperpartisan ideology detection. We create a new benchmark and show the potential and limitations of both open-source and commercial LLMs in classifying the hyper-partisan ideologies of users. We compare these models with smaller fine-tuned models, both on the Persian language (ParsBERT) and translated data (RoBERTa), showing that they considerably outperform generative LLMs in this task. We further demonstrate that the performance of the generative LLMs degrades when classifying users based on their tweets instead of their bios and even when tweets are added as additional information, whereas the smaller fine-tuned models are robust and achieve similar performance for all classes. This study is a first step toward political ideology detection in Persian Twitter, with implications for future research to understand the dynamics of ideologies in Persian social media.
+ 2024.eurali-1.8
+ omidi-shayegan-etal-2024-evaluation
+
+
+
diff --git a/data/xml/2024.findings.xml b/data/xml/2024.findings.xml
index fdd20ff196..9d08a16455 100644
--- a/data/xml/2024.findings.xml
+++ b/data/xml/2024.findings.xml
@@ -27,10 +27,13 @@
HengJiUniversity of Illinois, Urbana-Champaign
1-16
Fine-grained few-shot entity extraction in the chemical domain faces two unique challenges. First, compared with entity extraction tasks in the general domain, sentences from chemical papers usually contain more entities. Moreover, entity extraction models usually have difficulty extracting entities of long-tailed types. In this paper, we propose Chem-FINESE, a novel sequence-to-sequence (seq2seq) based few-shot entity extraction approach, to address these two challenges. Our Chem-FINESE has two components: a seq2seq entity extractor to extract named entities from the input sentence and a seq2seq self-validation module to reconstruct the original input sentence from extracted entities. Inspired by the fact that a good entity extraction system needs to extract entities faithfully, our new self-validation module leverages entity extraction results to reconstruct the original input sentence. Besides, we design a new contrastive loss to reduce excessive copying during the extraction process. Finally, we release ChemNER+, a new fine-grained chemical entity extraction dataset that is annotated by domain experts with the ChemNER schema. Experiments in few-shot settings with both ChemNER+ and CHEMET datasets show that our newly proposed framework has contributed up to 8.26% and 6.84% absolute F1-score gains respectively.
- 2024.findings-eacl.1
+ 2024.findings-eacl.1
2024.findings-eacl.1.software.zip
2024.findings-eacl.1.note.zip
wang-etal-2024-chem
+
+
+ Fix two typos in equations 2 and 4.
GPTs Are Multilingual Annotators for Sequence Generation Tasks
@@ -44,6 +47,7 @@
2024.findings-eacl.2.software.zip
2024.findings-eacl.2.note.zip
choi-etal-2024-gpts
+
Next Visit Diagnosis Prediction via Medical Code-Centric Multimodal Contrastive EHR Modelling with Hierarchical Regularisation
@@ -52,6 +56,7 @@
Predicting next visit diagnosis using Electronic Health Records (EHR) is an essential task in healthcare, critical for devising proactive future plans for both healthcare providers and patients. Nonetheless, many preceding studies have not sufficiently addressed the heterogeneous and hierarchical characteristics inherent in EHR data, inevitably leading to sub-optimal performance. To this end, we propose NECHO, a novel medical code-centric multimodal contrastive EHR learning framework with hierarchical regularisation. First, we integrate multifaceted information encompassing medical codes, demographics, and clinical notes using a tailored network design and a pair of bimodal contrastive losses, all of which pivot around a medical codes representation. We also regularise modality-specific encoders using parental-level information in the medical ontology to learn the hierarchical structure of EHR data. A series of experiments on MIMIC-III data demonstrates the effectiveness of our approach.
2024.findings-eacl.3
koo-2024-next
+
FlexiQA: Leveraging LLM’s Evaluation Capabilities for Flexible Knowledge Selection in Open-domain Question Answering
@@ -62,6 +67,7 @@
Nowadays, large language models (LLMs) have demonstrated their ability to be a powerful knowledge generator in the generate-then-read paradigm for open-domain question answering (ODQA). However, this new paradigm mainly suffers from “hallucination” and struggles to handle time-sensitive issues because of its expensive knowledge update costs. On the other hand, retrieve-then-read, as a traditional paradigm, is more limited by the relevance of acquired knowledge to the given question. In order to combine the strengths of both paradigms, and overcome their respective shortcomings, we design a new pipeline called “FlexiQA”, in which we utilize the diverse evaluation capabilities of LLMs to select knowledge effectively and flexibly. First, given a question, we prompt a LLM as a discriminator to identify whether it is time-sensitive. For time-sensitive questions, we follow the retrieve-then-read paradigm to obtain the answer. For non-time-sensitive questions, we further prompt the LLM as an evaluator to select a better document from two perspectives: factuality and relevance. Based on the selected document, we leverage a reader to get the final answer. We conduct extensive experiments on three widely-used ODQA benchmarks, and the experimental results fully confirm the effectiveness of our approach.
2024.findings-eacl.4
chen-etal-2024-flexiqa
+
Hyper-BTS Dataset: Scalability and Enhanced Analysis of Back TranScription (BTS) for ASR Post-Processing
@@ -77,6 +83,7 @@
The recent advancements in the realm of Automatic Speech Recognition (ASR) post-processing have been primarily driven by sequence-to-sequence paradigms. Despite their effectiveness, these methods often demand substantial amounts of data, necessitating the expensive recruitment of phonetic transcription experts to rectify the erroneous outputs of ASR systems, thereby creating the desired training data. Back TranScription (BTS) alleviates this issue by generating ASR inputs from clean text via a Text-to-Speech (TTS) system. While initial studies on BTS exhibited promise, they were constrained by a limited dataset of just 200,000 sentence pairs, leaving the scalability of this method in question. In this study, we delve into the potential scalability of BTS. We introduce the “Hyper-BTS” dataset, a corpus approximately five times larger than that utilized in prior research. Additionally, we present innovative criteria for categorizing error types within ASR post-processing. This not only facilitates a more comprehensive qualitative analysis, which was absent in preceding studies, but also enhances the understanding of ASR error patterns. Our empirical results, both quantitative and qualitative, suggest that the enlarged scale of the Hyper-BTS dataset sufficiently addresses a vast majority of the ASR error categories. We make the Hyper-BTS dataset publicly available.
2024.findings-eacl.5
park-etal-2024-hyper
+
ParrotTTS: Text-to-speech synthesis exploiting disentangled self-supervised representations
@@ -90,6 +97,7 @@
We present ParrotTTS, a modularized text-to-speech synthesis model leveraging disentangled self-supervised speech representations. It can train a multi-speaker variant effectively using transcripts from a single speaker. ParrotTTS adapts to a new language in a low resource setup and generalizes to languages not seen while training the self-supervised backbone. Moreover, without training on bilingual or parallel examples, ParrotTTS can transfer voices across languages while preserving the speaker-specific characteristics, e.g., synthesizing fluent Hindi speech using a French speaker’s voice and accent. We present extensive results in monolingual and multi-lingual scenarios. ParrotTTS outperforms state-of-the-art multi-lingual text-to-speech (TTS) models using only a fraction of paired data as the latter. Speech samples from ParrotTTS and code can be found at https://parrot-tts.github.io/tts/
2024.findings-eacl.6
shah-etal-2024-parrottts
+
NavHint: Vision and Language Navigation Agent with a Hint Generator
@@ -100,6 +108,7 @@
The existing work on vision and language navigation mainly relies on navigation-related losses to establish the connection between vision and language modalities, neglecting aspects of helping the navigation agent build a deep understanding of the visual environment. In our work, we provide indirect supervision to the navigation agent through a hint generator that provides detailed visual descriptions. The hint generator assists the navigation agent in developing a global understanding of the visual environment. It directs the agent’s attention toward related navigation details, including the relevant sub-instruction, potential challenges in recognition and ambiguities in grounding, and the targeted viewpoint description. To train the hint generator, we construct a synthetic dataset based on landmarks in the instructions and visible and distinctive objects in the visual environment. We evaluate our method on the R2R and R4R datasets and achieve state-of-the-art results on several metrics. The experimental results demonstrate that generating hints not only enhances the navigation performance but also helps improve the agent’s interpretability.
2024.findings-eacl.7
zhang-etal-2024-navhint
+
Text or Image? What is More Important in Cross-Domain Generalization Capabilities of Hate Meme Detection Models?
@@ -121,6 +130,7 @@
2024.findings-eacl.9
2024.findings-eacl.9.software.tgz
goot-2024-still
+
A Methodology for Generative Spelling Correction via Natural Spelling Errors Emulation across Multiple Domains and Languages
@@ -148,6 +158,7 @@
Fine-tuning large language models is becoming ever more impractical due to their rapidly-growing scale. This motivates the use of parameter-efficient adaptation methods such as prompt tuning (PT), which adds a small number of tunable embeddings to an otherwise frozen model, and in-context learning (ICL), in which demonstrations of the task are provided to the model in natural language without any additional training. Recently, (CITATION) propose “instruction prompt tuning” (IPT), which combines PT with ICL by concatenating a natural language demonstration with learned prompt embeddings. While all of these methods have proven effective on different tasks, how they interact with each other remains unexplored. In this paper, we empirically study when and how in-context examples improve prompt tuning by measuring the effectiveness of ICL, PT, and IPT on five text generation tasks with multiple base language models. We observe that (1) IPT does not always outperform PT, and in fact requires the in-context demonstration to be semantically similar to the test input to yield improvements; (2) PT is unstable and exhibits high variance, but combining PT and ICL (into IPT) consistently reduces variance across all five tasks; and (3) prompts learned for a specific source task via PT exhibit positive transfer when paired with in-context examples of a different target task. Our results offer actionable insights on choosing a suitable parameter-efficient adaptation method for a given task.
2024.findings-eacl.11
sun-etal-2024-context
+
Large Language Models for Psycholinguistic Plausibility Pretesting
@@ -160,6 +171,7 @@
2024.findings-eacl.12.software.zip
2024.findings-eacl.12.note.zip
amouyal-etal-2024-large
+
Modeling Aspect Sentiment Coherency via Local Sentiment Aggregation
@@ -171,6 +183,7 @@
2024.findings-eacl.13.software.zip
2024.findings-eacl.13.note.zip
yang-li-2024-modeling
+
An Examination of the Robustness of Reference-Free Image Captioning Evaluation Metrics
@@ -180,6 +193,7 @@
Recently, reference-free metrics such as CLIPScore (Hessel et al., 2021), UMIC (Lee et al., 2021), and PAC-S (Sarto et al., 2023) have been proposed for automatic reference-free evaluation of image captions. Our focus lies in evaluating the robustness of these metrics in scenarios that require distinguishing between two captions with high lexical overlap but very different meanings. Our findings reveal that despite their high correlation with human judgments, CLIPScore, UMIC, and PAC-S struggle to identify fine-grained errors. While all metrics exhibit strong sensitivity to visual grounding errors, their sensitivity to caption implausibility errors is limited. Furthermore, we found that all metrics are sensitive to variations in the size of image-relevant objects mentioned in the caption, while CLIPScore and PAC-S are also sensitive to the number of mentions of image-relevant objects in the caption. Regarding linguistic aspects of a caption, all metrics show weak comprehension of negation, and CLIPScore and PAC-S are insensitive to the structure of the caption to a great extent. We hope our findings will guide further improvements in reference-free evaluation of image captioning.
2024.findings-eacl.14
ahmadi-agrawal-2024-examination
+
Barriers to Effective Evaluation of Simultaneous Interpretation
@@ -193,6 +207,7 @@
Simultaneous interpretation is an especially challenging form of translation because it requires converting speech from one language to another in real-time. Though prior work has relied on out-of-the-box machine translation metrics to evaluate interpretation data, we hypothesize that strategies common in high-quality human interpretations, such as summarization, may not be handled well by standard machine translation metrics. In this work, we examine both qualitatively and quantitatively four potential barriers to evaluation of interpretation: disfluency, summarization, paraphrasing, and segmentation. Our experiments reveal that, while some machine translation metrics correlate fairly well with human judgments of interpretation quality, much work is still needed to account for strategies of interpretation during evaluation. As a first step to address this, we develop a fine-tuned model for interpretation evaluation, and achieve better correlation with human judgments than the state-of-the-art machine translation metrics.
2024.findings-eacl.15
wein-etal-2024-barriers
+
Inconsistent dialogue responses and how to recover from them
@@ -227,6 +242,7 @@
2024.findings-eacl.18
2024.findings-eacl.18.software.zip
benedek-wolf-2024-prilora
+
Revamping Multilingual Agreement Bidirectionally via Switched Back-translation for Multilingual Neural Machine Translation
@@ -251,6 +267,7 @@
Recent multilingual pretrained language models (mPLMs) have been shown to encode strong language-specific signals, which are not explicitly provided during pretraining. It remains an open question whether it is feasible to employ mPLMs to measure language similarity, and subsequently use the similarity results to select source languages for boosting cross-lingual transfer. To investigate this, we propose mPLM-Sim, a language similarity measure that induces the similarities across languages from mPLMs using multi-parallel corpora. Our study shows that mPLM-Sim exhibits moderately high correlations with linguistic similarity measures, such as lexicostatistics, genealogical language family, and geographical sprachbund. We also conduct a case study on languages with low correlation and observe that mPLM-Sim yields more accurate similarity results. Additionally, we find that similarity results vary across different mPLMs and different layers within an mPLM. We further investigate whether mPLM-Sim is effective for zero-shot cross-lingual transfer by conducting experiments on both low-level syntactic tasks and high-level semantic tasks. The experimental results demonstrate that mPLM-Sim is capable of selecting better source languages than linguistic measures, resulting in a 1%-2% improvement in zero-shot cross-lingual transfer performance.
2024.findings-eacl.20
lin-etal-2024-mplm
+
OYXOY: A Modern NLP Test Suite for Modern Greek
@@ -282,6 +299,7 @@
2024.findings-eacl.22
2024.findings-eacl.22.software.zip
bowen-etal-2024-comprehensive
+
Towards efficient self-supervised representation learning in speech processing
@@ -314,6 +332,7 @@
2024.findings-eacl.25
2024.findings-eacl.25.note.tgz
nguyen-etal-2024-noise
+
Large Language Models for Scientific Information Extraction: An Empirical Study for Virology
@@ -326,6 +345,7 @@
2024.findings-eacl.26.software.zip
2024.findings-eacl.26.note.zip
shamsabadi-etal-2024-large
+
Re3val: Reinforced and Reranked Generative Retrieval
@@ -338,6 +358,7 @@
Generative retrieval models encode pointers to information in a corpus as an index within the model’s parameters. These models serve as part of a larger pipeline, where retrieved information conditions generation for knowledge-intensive NLP tasks. However, we identify two limitations: first, generative retrieval does not account for contextual information; second, the retrieval cannot be tuned for the downstream readers, as decoding the page title is a non-differentiable operation. This paper introduces Re3val, trained with generative reranking and reinforcement learning using limited data. Re3val leverages context acquired via Dense Passage Retrieval to rerank the retrieved page titles and utilizes REINFORCE to maximize rewards generated by constrained decoding. Additionally, we generate questions from our pre-training dataset to mitigate epistemic uncertainty and bridge the domain gap between the pre-training and fine-tuning datasets. Subsequently, we extract and rerank contexts from the KILT database using the reranked page titles. Upon grounding the top five reranked contexts, Re3val achieves the top KILT scores compared to all other generative retrieval models across five KILT datasets.
2024.findings-eacl.27
song-etal-2024-re3val
+
Entity Linking in the Job Market Domain
@@ -348,6 +369,7 @@
In Natural Language Processing, entity linking (EL) has centered around Wikipedia, yet remains underexplored for the job market domain. Disambiguating skill mentions can help us get insight into the current labor market demands. In this work, we are the first to explore EL in this domain, specifically targeting the linkage of occupational skills to the ESCO taxonomy (le Vrang et al., 2014). Previous efforts linked coarse-grained (full) sentences to a corresponding ESCO skill. In this work, we link more fine-grained span-level mentions of skills. We tune two high-performing neural EL models, a bi-encoder (Wu et al., 2020) and an autoregressive model (Cao et al., 2021), on a synthetically generated mention–skill pair dataset and evaluate them on a human-annotated skill-linking benchmark. Our findings reveal that both models are capable of linking implicit mentions of skills to their correct taxonomy counterparts. Empirically, BLINK outperforms GENRE in strict evaluation, but GENRE performs better in loose evaluation (accuracy@k).
2024.findings-eacl.28
zhang-etal-2024-entity
+
(Chat)GPT v BERT: Dawn of Justice for Semantic Change Detection
@@ -369,6 +391,7 @@
Thanks to the recent progress in vision-language modeling and the evolving nature of news consumption, the tasks of automatic summarization and headline generation based on multimodal news articles have been gaining popularity. One of the limitations of the current approaches is caused by the commonly used sophisticated modular architectures built upon hierarchical cross-modal encoders and modality-specific decoders, which restrict the model’s applicability to specific data modalities – once trained on, e.g., text+video pairs, there is no straightforward way to apply the model to text+image or text-only data. In this work, we propose a unified task formulation that utilizes a simple encoder-decoder model to generate headlines from uni- and multi-modal news articles. This model is trained jointly on data of several modalities and extends the textual decoder to handle the multimodal output.
2024.findings-eacl.30
krubinski-pecina-2024-towards
+
On the Relationship between Sentence Analogy Identification and Sentence Structure Encoding in Large Language Models
@@ -383,6 +406,7 @@
The ability of Large Language Models (LLMs) to encode syntactic and semantic structures of language is well examined in NLP. Additionally, analogy identification, in the form of word analogies, has been extensively studied in the last decade of language modeling literature. In this work we specifically look at how LLMs’ abilities to capture sentence analogies (sentences that convey analogous meaning to each other) vary with LLMs’ abilities to encode syntactic and semantic structures of sentences. Through our analysis, we find that LLMs’ ability to identify sentence analogies is positively correlated with their ability to encode syntactic and semantic structures of sentences. Specifically, we find that the LLMs which capture syntactic structures better also have higher abilities in identifying sentence analogies.
2024.findings-eacl.31
wijesiriwardene-etal-2024-relationship
+
Contextualization Distillation from Large Language Model for Knowledge Graph Completion
@@ -396,6 +420,7 @@
li-etal-2024-contextualization
This revision corrects the citation display problem in the Appendix.
+
Differentially Private Natural Language Models: Recent Advances and Future Directions
@@ -407,6 +432,7 @@
Recent developments in deep learning have led to great success in various natural language processing (NLP) tasks. However, these applications may involve data that contain sensitive information. Therefore, how to achieve good performance while also protecting the privacy of sensitive data is a crucial challenge in NLP. To preserve privacy, Differential Privacy (DP), which can prevent reconstruction attacks and protect against potential side knowledge, is becoming a de facto technique for private data analysis. In recent years, NLP in DP models (DP-NLP) has been studied from different perspectives, which deserves a comprehensive review. In this paper, we provide the first systematic review of recent advances in DP deep learning models in NLP. In particular, we first discuss some differences and additional challenges of DP-NLP compared with the standard DP deep learning. Then, we investigate some existing work on DP-NLP and present its recent developments from three aspects: gradient perturbation based methods, embedding vector perturbation based methods, and ensemble model based methods. We also discuss some challenges and future directions.
2024.findings-eacl.33
hu-etal-2024-differentially
+
Learning to Compare Financial Reports for Financial Forecasting
@@ -446,6 +472,7 @@
2024.findings-eacl.36
2024.findings-eacl.36.software.zip
cao-jiang-2024-knowledge
+
Simple Temperature Cool-down in Contrastive Framework for Unsupervised Sentence Representation Learning
@@ -456,6 +483,7 @@
In this paper, we propose a simple, tricky method to improve sentence representation of unsupervised contrastive learning. Even though contrastive learning has achieved great performances in both visual representation learning (VRL) and sentence representation learning (SRL) fields, we focus on the fact that there is a gap between the characteristics and training dynamics of VRL and SRL. We first examine the role of temperature to bridge the gap between VRL and SRL, and find some temperature-dependent elements in SRL; i.e., a higher temperature causes overfitting of the uniformity while improving the alignment in the earlier phase of training. Then, we design a temperature cool-down technique based on this observation, which helps PLMs to be more suitable for contrastive learning via preparation of a uniform representation space. Our experimental results on widely-utilized benchmarks demonstrate the effectiveness and extensibility of our method.
2024.findings-eacl.37
jeong-etal-2024-simple
+
Bootstrap Your Own PLM: Boosting Semantic Features of PLMs for Unsupervised Contrastive Learning
@@ -466,6 +494,7 @@
This paper aims to investigate the possibility of exploiting original semantic features of PLMs (pre-trained language models) during contrastive learning in the context of SRL (sentence representation learning). In the context of feature modification, we identified a method called IFM (implicit feature modification), which reduces the tendency of contrastive models for VRL (visual representation learning) to rely on feature-suppressing shortcut solutions. We observed that IFM did not work well for SRL, which may be due to differences between the nature of VRL and SRL. We propose BYOP, which boosts well-represented features, taking the opposite idea of IFM, under the assumption that SimCSE’s dropout-noise-based augmentation may be too simple to modify high-level semantic features, and that the features learned by PLMs are semantically meaningful and should be boosted, rather than removed. Extensive experiments lend credence to the logic of BYOP, which considers the nature of SRL.
2024.findings-eacl.38
jeong-etal-2024-bootstrap
+
Personalized Abstractive Summarization by Tri-agent Generation Pipeline
@@ -477,6 +506,7 @@
Tailoring outputs from large language models, like ChatGPT, to implicit user preferences remains a challenge despite their impressive generative capabilities. In this paper, we propose a tri-agent generation pipeline comprising a generator, an instructor, and an editor to enhance output personalization. The generator produces an initial output, the instructor automatically generates editing instructions based on user preferences, and the editor refines the output to align with those preferences. The inference-only large language model (ChatGPT) serves as both the generator and editor, with a smaller model acting as the instructor to guide output generation. We train the instructor using editor-steered reinforcement learning, leveraging feedback from a large-scale editor model to optimize instruction generation. Experimental results on two abstractive summarization datasets demonstrate the effectiveness of our approach in generating outputs that better meet user expectations.
2024.findings-eacl.39
xiao-etal-2024-personalized
+
Revisiting the Markov Property for Machine Translation
@@ -488,6 +518,7 @@
In this paper, we re-examine the Markov property in the context of neural machine translation. We design a Markov Autoregressive Transformer (MAT) and undertake a comprehensive assessment of its performance across four WMT benchmarks. Our findings indicate that MAT with an order larger than 4 can generate translations with quality on par with that of conventional autoregressive transformers. In addition, counter-intuitively, we also find that the advantages of utilizing a higher-order MAT do not specifically contribute to the translation of longer sentences.
2024.findings-eacl.40
du-etal-2024-revisiting
+
Reward Engineering for Generating Semi-structured Explanation
@@ -498,6 +529,7 @@
Semi-structured explanation depicts the implicit process of a reasoner with an explicit representation. This explanation highlights how available information in a specific query is utilised and supplemented with information a reasoner produces from its internal weights towards generating an answer. Despite the recent improvements in generative capabilities of language models, producing structured explanations to verify a model’s true reasoning capabilities remains a challenge. This issue is particularly pronounced for not-so-large LMs (e.g., FLAN-T5-XXL). In this work, we first underscore the limitations of supervised fine-tuning (SFT) in tackling this challenge, and then introduce a carefully crafted reward engineering method in reinforcement learning (RL) to better address this problem. We investigate multiple reward aggregation methods and provide a detailed discussion which sheds light on the promising potential of RL for future research. Our proposed method on two semi-structured explanation generation benchmarks (ExplaGraph and COPA-SSE) achieves new state-of-the-art results.
2024.findings-eacl.41
han-etal-2024-reward
+
Towards Context-Based Violence Detection: A Korean Crime Dialogue Dataset
@@ -520,6 +552,7 @@
Deriving meaningful sentence embeddings is crucial in capturing the semantic relationship between texts. Recent advances in building sentence embedding models have centered on replacing traditional human-generated text datasets with those generated by LLMs. However, the properties of these widely used LLM-generated texts remain largely unexplored. Here, we evaluate the quality of the LLM-generated texts from four perspectives (Positive Text Repetition, Length Difference Penalty, Positive Score Compactness, and Negative Text Implausibility) and find that there exists an inherent difference between human and LLM-generated datasets. To further enhance sentence embeddings using both human and LLM-generated datasets, we propose a novel loss function that incorporates Positive-Negative sample Augmentation (PNA) within the contrastive learning objective. Our results demonstrate that PNA effectively mitigates the sentence anisotropy problem in Wikipedia corpus (-7% compared to CLHAIF) and simultaneously improves the Spearman’s correlation in standard Semantic Textual Similarity (STS) tasks (+1.47% compared to CLHAIF).
2024.findings-eacl.43
an-etal-2024-capturing
+
Harmonizing Code-mixed Conversations: Personality-assisted Code-mixed Response Generation in Dialogues
@@ -529,6 +562,7 @@
Code-mixing, the blending of multiple languages within a single conversation, introduces a distinctive challenge, particularly in the context of response generation. Capturing the intricacies of code-mixing proves to be a formidable task, given the wide-ranging variations influenced by individual speaking styles and cultural backgrounds. In this study, we explore response generation within code-mixed conversations. We introduce a novel approach centered on harnessing the Big Five personality traits acquired in an unsupervised manner from the conversations to bolster the performance of response generation. These inferred personality attributes are seamlessly woven into the fabric of the dialogue context, using a novel fusion mechanism. It uses an effective two-step attention formulation to fuse the dialogue and personality information. This fusion not only enhances the contextual relevance of generated responses but also elevates the overall performance of the model. Our experimental results, grounded in a dataset comprising multi-party Hindi-English code-mix conversations, highlight the substantial advantages offered by personality-infused models over their conventional counterparts. This is evident in the increase observed in ROUGE and BLEU scores for the response generation task when the identified personality is seamlessly integrated into the dialogue context. Qualitative assessment for personality identification and response generation aligns well with our quantitative results.
2024.findings-eacl.44
kumar-chakraborty-2024-harmonizing
+
Morality is Non-Binary: Building a Pluralist Moral Sentence Embedding Space using Contrastive Learning
@@ -540,6 +574,7 @@
2024.findings-eacl.45
2024.findings-eacl.45.software.zip
park-etal-2024-morality
+
Prosody in Cascade and Direct Speech-to-Text Translation: a case study on Korean Wh-Phrases
@@ -551,6 +586,7 @@
Speech-to-Text Translation (S2TT) has typically been addressed with cascade systems, where speech recognition systems generate a transcription that is subsequently passed to a translation model. While there has been a growing interest in developing direct speech translation systems to avoid propagating errors and losing non-verbal content, prior work in direct S2TT has struggled to conclusively establish the advantages of integrating the acoustic signal directly into the translation process. This work proposes using contrastive evaluation to quantitatively measure the ability of direct S2TT systems to disambiguate utterances where prosody plays a crucial role. Specifically, we evaluated Korean-English translation systems on a test set containing wh-phrases, for which prosodic features are necessary to produce translations with the correct intent, whether it is a statement, a yes/no question, a wh-question, or another intent. Our results clearly demonstrate the value of direct translation systems over cascade translation models, with a notable 12.9% improvement in overall accuracy in ambiguous cases, along with up to a 15.6% increase in F1 scores for one of the major intent categories. To the best of our knowledge, this work stands as the first to provide quantitative evidence that direct S2TT models can effectively leverage prosody. The code for our evaluation is openly accessible and freely available for review and utilisation.
2024.findings-eacl.46
zhou-etal-2024-prosody
+
Exploring the Potential of ChatGPT on Sentence Level Relations: A Focus on Temporal, Causal, and Discourse Relations
@@ -565,6 +601,7 @@
This paper aims to quantitatively evaluate the performance of ChatGPT, an interactive large language model, on inter-sentential relations such as temporal relations, causal relations, and discourse relations. Given ChatGPT’s promising performance across various tasks, we proceed to carry out thorough evaluations on the whole test sets of 11 datasets, including temporal and causal relations, PDTB2.0-based, and dialogue-based discourse relations. To ensure the reliability of our findings, we employ three tailored prompt templates for each task, including the zero-shot prompt template, zero-shot prompt engineering (PE) template, and in-context learning (ICL) prompt template, to establish the initial baseline scores for all popular sentence-pair relation classification tasks for the first time. Through our study, we discover that ChatGPT exhibits exceptional proficiency in detecting and reasoning about causal relations, although it may not possess the same level of expertise in identifying the temporal order between two events. While it is capable of identifying the majority of discourse relations with existing explicit discourse connectives, the implicit discourse relation remains a formidable challenge. Concurrently, ChatGPT demonstrates subpar performance in the dialogue discourse parsing task that requires structural understanding in a dialogue before being aware of the discourse relation.
2024.findings-eacl.47
chan-etal-2024-exploring
+
Backtracing: Retrieving the Cause of the Query
@@ -577,6 +614,7 @@
Many online content portals allow users to ask questions to supplement their understanding (e.g., of lectures). While information retrieval (IR) systems may provide answers for such user queries, they do not directly assist content creators—such as lecturers who want to improve their content—in identifying segments that caused a user to ask those questions. We introduce the task of backtracing, in which systems retrieve the text segment that most likely caused a user query. We formalize three real-world domains for which backtracing is important in improving content delivery and communication: understanding the cause of (a) student confusion in the Lecture domain, (b) reader curiosity in the News Article domain, and (c) user emotion in the Conversation domain. We evaluate the zero-shot performance of popular information retrieval methods and language modeling methods, including bi-encoder, re-ranking and likelihood-based methods and ChatGPT. While traditional IR systems retrieve semantically relevant information (e.g., details on “projection matrices” for a query “does projecting multiple times still lead to the same point?”), they often miss the causally relevant context (e.g., the lecturer states “projecting twice gets me the same answer as one projection”). Our results show that there is room for improvement on backtracing and it requires new retrieval approaches. We hope our benchmark serves to improve future retrieval systems for backtracing, spawning systems that refine content generation and identify linguistic triggers influencing user queries.
2024.findings-eacl.48
wang-etal-2024-backtracing
+
Unsupervised Multilingual Dense Retrieval via Generative Pseudo Labeling
@@ -590,6 +628,7 @@
2024.findings-eacl.49
2024.findings-eacl.49.software.zip
huang-etal-2024-unsupervised
+
Investigating grammatical abstraction in language models using few-shot learning of novel noun gender
@@ -615,6 +654,7 @@
2024.findings-eacl.51.software.zip
2024.findings-eacl.51.note.zip
fang-etal-2024-fly
+
Style Vectors for Steering Generative Large Language Models
@@ -631,6 +671,7 @@
2024.findings-eacl.52
2024.findings-eacl.52.software.zip
konen-etal-2024-style
+
Consistent Joint Decision-Making with Heterogeneous Learning Models
@@ -640,6 +681,7 @@
This paper introduces a novel decision-making framework that promotes consistency among decisions made by diverse models while utilizing external knowledge. Leveraging the Integer Linear Programming (ILP) framework, we map predictions from various models into globally normalized and comparable values by incorporating information about decisions’ prior probability, confidence (uncertainty), and the models’ expected accuracy. Our empirical study demonstrates the superiority of our approach over conventional baselines on multiple datasets.
2024.findings-eacl.53
rajaby-faghihi-kordjamshidi-2024-consistent
+
Quantifying Association Capabilities of Large Language Models and Its Implications on Privacy Leakage
@@ -664,6 +706,7 @@
2024.findings-eacl.55
2024.findings-eacl.55.software.zip
masud-etal-2024-probing
+
Embible: Reconstruction of Ancient Hebrew and Aramaic Texts Using Transformers
@@ -676,6 +719,7 @@
Hebrew and Aramaic inscriptions serve as an essential source of information on the ancient history of the Near East. Unfortunately, some parts of the inscribed texts become illegible over time. Special experts, called epigraphists, use time-consuming manual procedures to estimate the missing content. This problem can be considered an extended masked language modeling task, where the damaged content can comprise single characters, character n-grams (partial words), single complete words, and multi-word n-grams. This study is the first attempt to apply the masked language modeling approach to corrupted inscriptions in Hebrew and Aramaic languages, both using the Hebrew alphabet consisting mostly of consonant symbols. In our experiments, we evaluate several transformer-based models, which are fine-tuned on the Biblical texts and tested on three different percentages of randomly masked parts in the testing corpus. For any masking percentage, the highest text completion accuracy is obtained with a novel ensemble of word and character prediction models.
2024.findings-eacl.56
fono-etal-2024-embible
+
Stateful Memory-Augmented Transformers for Efficient Dialogue Modeling
@@ -699,6 +743,7 @@
In this study, we present an investigation into the anisotropy dynamics and intrinsic dimension of embeddings in transformer architectures, focusing on the dichotomy between encoders and decoders. Our findings reveal that the anisotropy profile in transformer decoders exhibits a distinct bell-shaped curve, with the highest anisotropy concentrations in the middle layers. This pattern diverges from the more uniformly distributed anisotropy observed in encoders. In addition, we found that the intrinsic dimension of embeddings increases in the initial phases of training, indicating an expansion into higher-dimensional space. This fact is then followed by a compression phase towards the end of training with dimensionality decrease, suggesting a refinement into more compact representations. Our results provide fresh insights into the understanding of encoder and decoder embedding properties.
2024.findings-eacl.58
razzhigaev-etal-2024-shape
+
MEDs for PETs: Multilingual Euphemism Disambiguation for Potentially Euphemistic Terms
@@ -727,6 +772,7 @@
2024.findings-eacl.60
2024.findings-eacl.60.software.zip
feng-etal-2024-promptexplainer
+
Do-Not-Answer: Evaluating Safeguards in LLMs
@@ -739,6 +785,7 @@
With the rapid evolution of large language models (LLMs), new and hard-to-predict harmful capabilities are emerging. This requires developers to identify potential risks through the evaluation of “dangerous capabilities” in order to responsibly deploy LLMs. Here we aim to facilitate this process. In particular, we collect an open-source dataset to evaluate the safeguards in LLMs, to facilitate the deployment of safer open-source LLMs at a low cost. Our dataset is curated and filtered to consist only of instructions that responsible language models should not follow. We assess the responses of six popular LLMs to these instructions, and we find that simple BERT-style classifiers can achieve results that are comparable to GPT-4 on automatic safety evaluation. Our data and code are available at https://github.com/Libr-AI/do-not-answer
2024.findings-eacl.61
wang-etal-2024-answer
+
Do Language Models Know When They’re Hallucinating References?
@@ -752,6 +799,7 @@
2024.findings-eacl.62.software.zip
2024.findings-eacl.62.note.zip
agrawal-etal-2024-language
+
Bridging Cultural Nuances in Dialogue Agents through Cultural Value Surveys
@@ -782,6 +830,7 @@
Recent years have seen the rise of large language models (LLMs), where practitioners use task-specific prompts; this was shown to be effective for a variety of tasks. However, when applied to semantic textual similarity (STS) and natural language inference (NLI), the effectiveness of LLMs turns out to be limited by low-resource domain accuracy, model overconfidence, and difficulty in capturing the disagreements between human judgements. With this in mind, here we try to rethink STS and NLI in the era of LLMs. We first evaluate the performance of STS and NLI in the clinical/biomedical domain, and then we assess LLMs’ predictive confidence and their capability of capturing collective human opinions. We find that these old problems are still to be properly addressed in the era of LLMs.
2024.findings-eacl.65
wang-etal-2024-rethinking
+
Learning High-Quality and General-Purpose Phrase Representations
@@ -793,6 +842,7 @@
2024.findings-eacl.66
2024.findings-eacl.66.software.zip
chen-etal-2024-learning
+
Explaining Language Model Predictions with High-Impact Concepts
@@ -815,6 +865,7 @@
Recent research has revealed that machine learning models have a tendency to leverage spurious correlations that exist in the training set but may not hold true in general circumstances. For instance, a sentiment classifier may erroneously learn that the token “performances” is commonly associated with positive movie reviews. Relying on these spurious correlations degrades the classifier’s performance when it is deployed on out-of-distribution data. In this paper, we examine the implications of spurious correlations through a novel perspective called neighborhood analysis. The analysis uncovers how spurious correlations lead unrelated words to erroneously cluster together in the embedding space. Driven by the analysis, we design a metric to detect spurious tokens and also propose a family of regularization methods, NFL (doN’t Forget your Language), to mitigate spurious correlations in text classification. Experiments show that NFL can effectively prevent erroneous clusters and significantly improve the robustness of classifiers without auxiliary data. The code is publicly available at https://github.com/oscarchew/doNt-Forget-your-Language.
2024.findings-eacl.68
chew-etal-2024-understanding
+
On the Intractability to Synthesize Factual Inconsistencies in Summarization
@@ -840,6 +891,7 @@
This study focuses on media bias detection, crucial in today’s era of influential social media platforms shaping individual attitudes and opinions. In contrast to prior work that primarily relies on training specific models tailored to particular datasets, resulting in limited adaptability and subpar performance on out-of-domain data, we introduce a general bias detection framework, IndiVec, built upon large language models. IndiVec begins by constructing a fine-grained media bias database, leveraging the robust instruction-following capabilities of large language models and vector database techniques. When confronted with new input for bias detection, our framework automatically selects the most relevant indicator from the vector database and employs majority voting to determine the input’s bias label. IndiVec excels compared to previous methods due to its adaptability (demonstrating consistent performance across diverse datasets from various sources) and explainability (providing explicit top-k indicators to interpret bias predictions). Experimental results on four political bias datasets highlight IndiVec’s significant superiority over baselines. Furthermore, additional experiments and analysis provide profound insights into the framework’s effectiveness.
2024.findings-eacl.70
lin-etal-2024-indivec
+
Are Large Language Model-based Evaluators the Solution to Scaling Up Multilingual Evaluation?
@@ -855,6 +907,7 @@
Large Language Models (LLMs) excel in various Natural Language Processing (NLP) tasks, yet their evaluation, particularly in languages beyond the top 20, remains inadequate due to existing benchmarks and metrics limitations. Employing LLMs as evaluators to rank or score other models’ outputs emerges as a viable solution, addressing the constraints tied to human annotators and established benchmarks. In this study, we explore the potential of LLM-based evaluators in enhancing multilingual evaluation by calibrating them against 20K human judgments across three text-generation tasks, five metrics, and eight languages. Our analysis reveals a bias in LLM-based evaluators towards higher scores, underscoring the necessity of calibration with native speaker judgments, especially in low-resource and non-Latin script languages, to ensure accurate evaluation of LLM performance across diverse languages.
2024.findings-eacl.71
hada-etal-2024-large
+
Computational Morphology and Lexicography Modeling of Modern Standard Arabic Nominals
@@ -867,6 +920,7 @@
Modern Standard Arabic (MSA) nominals present many morphological and lexical modeling challenges that have not been consistently addressed previously. This paper attempts to define the space of such challenges, and leverage a recently proposed morphological framework to build a comprehensive and extensible model for MSA nominals. Our model design addresses the nominals’ intricate morphotactics, as well as their paradigmatic irregularities. Our implementation showcases enhanced accuracy and consistency compared to a commonly used MSA morphological analyzer and generator. We make our models publicly available.
2024.findings-eacl.72
khairallah-etal-2024-computational
+
Relabeling Minimal Training Subset to Flip a Prediction
@@ -877,6 +931,7 @@
When facing an unsatisfactory prediction from a machine learning model, users can be interested in investigating the underlying reasons and exploring the potential for reversing the outcome. We ask: To flip the prediction on a test point x_t, how do we identify the smallest training subset \mathcal{S}_t that we need to relabel? We propose an efficient algorithm to identify and relabel such a subset via an extended influence function for binary classification models with convex loss. We find that relabeling fewer than 2% of the training points can always flip a prediction. This mechanism can serve multiple purposes: (1) providing an approach to challenge a model prediction by altering training points; (2) evaluating model robustness with the cardinality of the subset (i.e., |\mathcal{S}_t|); we show that |\mathcal{S}_t| is highly related to the noise ratio in the training set and is correlated with but complementary to predicted probabilities; and (3) revealing training points that lead to group attribution bias. To the best of our knowledge, we are the first to investigate identifying and relabeling the minimal training subset required to flip a given prediction.
2024.findings-eacl.73
yang-etal-2024-relabeling
+
Why Generate When You Can Discriminate? A Novel Technique for Text Classification using Language Models
@@ -889,6 +944,7 @@
In this paper, we propose a novel two-step technique for text classification using autoregressive Language Models (LM). In the first step, a set of perplexity and log-likelihood based numeric features are elicited from an LM for a text instance to be classified. Then, in the second step, a classifier based on these features is trained to predict the final label. The classifier used is usually a simple machine learning classifier like Support Vector Machine (SVM) or Logistic Regression (LR), and it is trained using a small set of training examples. We believe our technique presents a whole new way of exploiting the available training instances, in addition to existing ways like fine-tuning LMs or in-context learning. Our approach stands out by eliminating the need for parameter updates in LMs, as required in fine-tuning, and does not impose limitations on the number of training examples that can be used when building prompts for in-context learning. We evaluate our technique across 5 different datasets and compare it with multiple competitive baselines.
2024.findings-eacl.74
pawar-etal-2024-generate
+
Autism Detection in Speech – A Survey
@@ -898,6 +954,7 @@
There has been a range of studies of how autism is displayed in voice, speech, and language. We analyse studies from the biomedical and psychological domains, as well as the NLP domain, in order to find linguistic, prosodic and acoustic cues. Our survey covers all three domains. We define autism and discuss which comorbidities might influence the correct detection of the disorder. We especially look at observations such as verbal and semantic fluency and prosodic features, but also disfluencies and speaking rate. We also show word-based approaches and describe machine learning and transformer-based approaches, both on the audio data as well as the transcripts. Lastly, we conclude that, while there is already a lot of research, female patients seem to be severely under-researched. Also, most NLP research focuses on traditional machine learning methods instead of transformers. Additionally, we were unable to find research combining features from both audio and transcripts.
2024.findings-eacl.75
probol-mieskes-2024-autism
+
Improving Multimodal Classification of Social Media Posts by Leveraging Image-Text Auxiliary Tasks
@@ -920,6 +977,7 @@
2024.findings-eacl.77
2024.findings-eacl.77.software.zip
holtermann-etal-2024-weight
+
IndiFoodVQA: Advancing Visual Question Answering and Reasoning with a Knowledge-Infused Synthetic Data Generation Pipeline
@@ -942,6 +1000,7 @@
2024.findings-eacl.79.software.zip
2024.findings-eacl.79.note.zip
zeng-zubiaga-2024-maple
+
Leveraging Open Information Extraction for More Robust Domain Transfer of Event Trigger Detection
@@ -953,6 +1012,7 @@
Event detection is a crucial information extraction task in many domains, such as Wikipedia or news. The task typically relies on trigger detection (TD) – identifying token spans in the text that evoke specific events. While the notion of triggers should ideally be universal across domains, domain transfer for TD from high- to low-resource domains results in significant performance drops. We address the problem of negative transfer in TD by coupling triggers between domains using subject-object relations obtained from a rule-based open information extraction (OIE) system. We demonstrate that OIE relations injected through multi-task training can act as mediators between triggers in different domains, enhancing zero- and few-shot TD domain transfer and reducing performance drops, in particular when transferring from a high-resource source domain (Wikipedia) to a low(er)-resource target domain (news). Additionally, we combine this improved transfer with masked language modeling on the target domain, observing further TD transfer gains. Finally, we demonstrate that the gains are robust to the choice of the OIE system.
2024.findings-eacl.80
dukic-etal-2024-leveraging
+
Exploring efficient zero-shot synthetic dataset generation for Information Retrieval
@@ -974,6 +1034,7 @@
Keyphrase extraction is the task of identifying a set of keyphrases present in a document that captures its most salient topics. Scientific domain-specific pre-training has led to achieving state-of-the-art keyphrase extraction performance with a majority of benchmarks being within the domain. In this work, we explore how to effectively enable the cross-domain generalization capabilities of such models without requiring the same scale of data. We primarily focus on the few-shot setting in non-scientific domain datasets such as OpenKP from the Web domain & StackEx from the StackExchange forum. We propose to leverage topic information intrinsically available in the data, to build a novel clustering-based sampling approach that facilitates selecting a few samples to label from the target domain facilitating building robust and performant models. This approach leads to large gains in performance of up to 26.35 points in F1 when compared to selecting few-shot samples uniformly at random. We also explore the setting where we have access to labeled data from the model’s pretraining domain corpora and perform gradual training which involves slowly folding in target domain data to the source domain data. Here we demonstrate further improvements in the model performance by up to 12.76 F1 points.
2024.findings-eacl.82
mishra-etal-2024-clustering
+
Random Smooth-based Certified Defense against Text Adversarial Attack
@@ -985,6 +1046,7 @@
Certified defense methods have demonstrated their effectiveness against textual adversarial examples, training models on the worst-case text generated by substituting words in original texts with synonyms. However, due to the discrete word embedding representations, the large search space hinders robust training efficiency, resulting in significant time consumption. To overcome this challenge, motivated by the observation that synonym embeddings lie a small distance apart, we propose to treat word substitution as a continuous perturbation on the word embedding representation. The proposed method, Text-RS, applies random smooth techniques to approximate the word substitution operation, offering a computationally efficient solution that outperforms conventional discrete methods and improves robustness in training. The evaluation results demonstrate its effectiveness in defending against multiple textual adversarial attacks.
2024.findings-eacl.83
zhang-etal-2024-random
+
Clarifying the Path to User Satisfaction: An Investigation into Clarification Usefulness
@@ -1011,6 +1073,7 @@
Cross-lingual transfer of language models trained on high-resource languages like English has been widely studied for many NLP tasks, but focus on conversational tasks has been rather limited. This is partly due to the high cost of obtaining non-English conversational data, which results in limited coverage. In this work, we introduce XSGD for cross-lingual alignment pretraining, a parallel and large-scale multilingual conversation dataset that we created by translating the English-only Schema-Guided Dialogue (SGD) dataset (Rastogi et al., 2020) into 105 other languages. XSGD contains about 330k utterances per language. To facilitate aligned cross-lingual representations, we develop an efficient prompt-tuning-based method for learning alignment prompts. We also investigate two different classifiers: NLI-based and vanilla classifiers, and test the cross-lingual capability enabled by the aligned prompts. We evaluate our model’s cross-lingual generalization capabilities on two conversation tasks: slot-filling and intent classification. Our results demonstrate the strong and efficient modeling ability of NLI-based classifiers and the large cross-lingual transfer improvements achieved by our aligned prompts, particularly in few-shot settings. We also conduct studies on large language models (LLMs) such as text-davinci-003 and ChatGPT in both zero- and few-shot settings. While LLMs exhibit impressive performance in English, their cross-lingual capabilities in other languages, particularly low-resource ones, are limited.
2024.findings-eacl.85
tu-etal-2024-efficiently
+
Correcting Language Model Outputs by Editing Salient Layers
@@ -1036,6 +1099,7 @@
Many approaches to Natural Language Processing tasks often treat them as single-step problems, where an agent receives an instruction, executes it, and is evaluated based on the final outcome. However, language is inherently interactive, as evidenced by the back-and-forth nature of human conversations. In light of this, we posit that human-AI collaboration should also be interactive, with humans monitoring the work of AI agents and providing feedback that the agent can understand and utilize. Further, the AI agent should be able to detect when it needs additional information and proactively ask for help. Enabling this scenario would lead to more natural, efficient, and engaging human-AI collaboration. In this paper, we investigate these directions using the challenging task established by the IGLU competition, an interactive grounded language understanding task in a MineCraft-like world. We delve into multiple types of help players can give to the AI to guide it and analyze the impact of this help on behavior, resulting in performance improvements and an end-to-end interactive system.
2024.findings-eacl.87
mehta-etal-2024-improving
+
Goodhart’s Law Applies to NLP’s Explanation Benchmarks
@@ -1047,6 +1111,7 @@
Despite the rising popularity of saliency-based explanations, the research community remains at an impasse, facing doubts concerning their purpose, efficacy, and tendency to contradict each other. Seeking to unite the community’s efforts around common goals, several recent works have proposed evaluation metrics. In this paper, we critically examine two sets of metrics: the ERASER metrics (comprehensiveness and sufficiency) and the EVAL-X metrics, focusing our inquiry on natural language processing. First, we show that we can inflate a model’s comprehensiveness and sufficiency scores dramatically without altering its predictions or explanations on in-distribution test inputs. Our strategy exploits the tendency for extracted explanations and their complements to be “out-of-support” relative to each other and in-distribution inputs. Next, we demonstrate that the EVAL-X metrics can be inflated arbitrarily by a simple method that encodes the label, even though EVAL-X is precisely motivated to address such exploits. Our results raise doubts about the ability of current metrics to guide explainability research, underscoring the need for a broader reassessment of what precisely these metrics are intended to capture.
2024.findings-eacl.88
hsia-etal-2024-goodharts
+
Syllable-level lyrics generation from melody exploiting character-level language model
@@ -1058,6 +1123,7 @@
The generation of lyrics tightly connected to accompanying melodies involves establishing a mapping between musical notes and syllables of lyrics. This process requires a deep understanding of music constraints and semantic patterns at syllable-level, word-level, and sentence-level semantic meanings. However, pre-trained language models specifically designed at the syllable level are publicly unavailable. To solve these challenging issues, we propose to exploit fine-tuning character-level language models for syllable-level lyrics generation from symbolic melody. In particular, our method aims to fine-tune a character-level pre-trained language model, allowing the incorporation of the language model’s linguistic knowledge into the beam search process of a syllable-level Transformer generator network. Moreover, by exploring ChatGPT-based evaluation of generated lyrics in addition to human subjective evaluation, we prove that our approach improves the coherence and correctness of generated lyrics, without the need to train expensive new language models.
2024.findings-eacl.89
zhang-etal-2024-syllable
+
Monolingual or Multilingual Instruction Tuning: Which Makes a Better Alpaca
@@ -1071,6 +1137,7 @@
Foundational large language models (LLMs) can be instruction-tuned to perform open-domain question answering, facilitating applications like chat assistants. While such efforts are often carried out in a single language, we empirically analyze cost-efficient strategies for multilingual scenarios. Our study employs the Alpaca dataset and machine translations of it to form multilingual data, which is then used to tune LLMs through either low-rank adaptation or full-parameter training. Under a controlled computation budget, comparisons show that multilingual tuning is on par or better than tuning a model for each language. Furthermore, multilingual tuning with downsampled data can be as powerful and more robust. Our findings serve as a guide for expanding language support through instruction tuning.
2024.findings-eacl.90
chen-etal-2024-monolingual
+
Prompt Perturbation Consistency Learning for Robust Language Models
@@ -1085,6 +1152,7 @@
Large language models (LLMs) have demonstrated impressive performance on a number of natural language processing tasks, such as question answering and text summarization. However, their performance on sequence labeling tasks such as intent classification and slot filling (IC-SF), which is a central component in personal assistant systems, lags significantly behind discriminative models. Furthermore, there is a lack of substantive research on robustness of LLMs to various perturbations in the input prompts. The contributions of this paper are three-fold. First, we show that fine-tuning sufficiently large LLMs can produce IC-SF performance comparable to discriminative models. Next, we systematically analyze the performance deterioration of those fine-tuned models due to three distinct yet relevant types of input perturbations - oronyms, synonyms, and paraphrasing. Finally, we propose an efficient mitigation approach, Prompt Perturbation Consistency Learning (PPCL), which works by regularizing the divergence between losses from clean and perturbed samples. Our experiments show that PPCL can recover on an average 59% and 69% of the performance drop for IC and SF tasks, respectively. Furthermore, PPCL beats data augmentation approach while using ten times fewer augmented data samples.
2024.findings-eacl.91
qiang-etal-2024-prompt
+
Enhancing Society-Undermining Disinformation Detection through Fine-Grained Sentiment Analysis Pre-Finetuning
@@ -1096,6 +1164,7 @@
In the era of the digital world, while freedom of speech has been flourishing, it has also paved the way for disinformation, causing detrimental effects on society. Legal and ethical criteria are insufficient to address this concern, thus necessitating technological intervention. This paper presents a novel method leveraging pre-finetuning concept for efficient detection and removal of disinformation that may undermine society, as deemed by judicial entities. We argue the importance of detecting this type of disinformation and validate our approach with real-world data derived from court orders. Following a study that highlighted four areas of interest for rumor analysis, our research proposes the integration of a fine-grained sentiment analysis task in the pre-finetuning phase of language models, using the GoEmotions dataset. Our experiments validate the effectiveness of our approach in enhancing performance significantly. Furthermore, we explore the application of our approach across different languages using multilingual language models, showing promising results. To our knowledge, this is the first study that investigates the role of sentiment analysis pre-finetuning in disinformation detection.
2024.findings-eacl.92
pan-etal-2024-enhancing
+
Minimal Distillation Schedule for Extreme Language Model Compression
@@ -1162,6 +1231,7 @@
The performance of NLP methods for severely under-resourced languages cannot currently hope to match the state of the art in NLP methods for well resourced languages. We explore the extent to which pretrained large language models (LLMs) can bridge this gap, via the example of data-to-text generation for Irish, Welsh, Breton and Maltese. We test LLMs on these under-resourced languages and English, in a range of scenarios. We find that LLMs easily set the state of the art for the under-resourced languages by substantial margins, as measured by both automatic and human evaluations. For all our languages, human evaluation shows on-a-par performance with humans for our best systems, but BLEU scores collapse compared to English, casting doubt on the metric’s suitability for evaluating non-task-specific systems. Overall, our results demonstrate the great potential of LLMs to bridge the performance gap for under-resourced languages.
2024.findings-eacl.98
lorandi-belz-2024-high
+
Antonym vs Synonym Distinction using InterlaCed Encoder NETworks (ICE-NET)
@@ -1193,6 +1263,7 @@
khiu-etal-2024-predicting
Include authors' email address.
+
Does CLIP Bind Concepts? Probing Compositionality in Large Image Models
@@ -1208,6 +1279,7 @@
2024.findings-eacl.101
2024.findings-eacl.101.software.zip
lewis-etal-2024-clip
+
Code-Switching and Back-Transliteration Using a Bilingual Model
@@ -1245,6 +1317,7 @@
2024.findings-eacl.104.software.zip
2024.findings-eacl.104.note.zip
he-etal-2024-reading
+
Unified Embeddings for Multimodal Retrieval via Frozen LLMs
@@ -1257,6 +1330,7 @@
In this work, we present Unified Embeddings for Multimodal Retrieval (UniMuR), a simple but effective approach that embeds multimodal inputs and retrieves visual and textual outputs via frozen Large Language Models (LLMs). Specifically, UniMuR jointly retrieves multimodal outputs via a unified multimodal embedding and applies dual alignment training to account for both visual and textual semantics. Thus, unlike previous approaches, UniMuR significantly reduces the LLM’s modality bias towards generating text-only outputs. Meanwhile, the proposed unified multimodal embedding mitigates the inconsistency between visual and textual outputs and provides coherent multimodal outputs. Furthermore, benefiting from the joint training of visual and textual semantics, UniMuR also achieves strong image/text retrieval ability. Compared to existing approaches, UniMuR achieves better zero-shot multimodal response retrieval performance on MMDialog, improving the overall R@1 by 6.5% while boosting the image retrieval rate and having better cross-modal consistency on multimodal outputs. UniMuR also achieves 2.4% and 3.9% improvement on context-based image retrieval tasks on MMDialog and VisDial respectively when compared to previous approaches, validating its generalization ability across multiple tasks.
2024.findings-eacl.105
wang-etal-2024-unified
+
Assessing the Portability of Parameter Matrices Trained by Parameter-Efficient Finetuning Methods
@@ -1275,6 +1349,7 @@
Sentence-level attacks craft adversarial sentences that are synonymous with correctly-classified sentences but are misclassified by text classifiers. Under the black-box setting, classifiers are only accessible through their feedback to queried inputs, which is predominantly available in the form of class probabilities. Even though utilizing class probabilities results in stronger attacks, due to the challenges of using them for sentence-level attacks, existing attacks use either no feedback or only the class labels. Overcoming these challenges, we develop a novel algorithm that uses class probabilities for black-box sentence-level attacks, investigate the effect of using class probabilities on the attack’s success, and examine whether it is worthwhile or practical for black-box sentence-level attacks to use class probabilities. We conduct extensive evaluations of the proposed attack, comparing it with baselines across various classifiers and benchmark datasets.
2024.findings-eacl.107
moraffah-liu-2024-exploiting
+
Learning Label Hierarchy with Supervised Contrastive Learning
@@ -1285,6 +1360,7 @@
Supervised contrastive learning (SCL) frameworks treat each class as independent and thus consider all classes to be equally important. This neglects the common scenario in which a label hierarchy exists, where fine-grained classes under the same category show more similarity than very different ones. This paper introduces a family of Label-Aware SCL methods (LA-SCL) that incorporates hierarchical information into SCL by leveraging similarities between classes, resulting in a more well-structured and discriminative feature space. This is achieved by first adjusting the distance between instances based on measures of the proximity of their classes with the scaled instance-instance-wise contrastive. An additional instance-center-wise contrastive is introduced to move within-class examples closer to their centers, which are represented by a set of learnable label parameters. The learned label parameters can be directly used as a nearest neighbor classifier without further finetuning. In this way, a better feature representation is generated with improvements in intra-cluster compactness and inter-cluster separation. Experiments on three datasets show that the proposed LA-SCL works well on text classification of distinguishing a single label among multi-labels, outperforming the baseline supervised approaches. Our code is publicly available ^1.
2024.findings-eacl.108
lian-etal-2024-learning
+
GrounDial: Human-norm Grounded Safe Dialog Response Generation
@@ -1323,6 +1399,7 @@
2024.findings-eacl.111
2024.findings-eacl.111.note.zip
das-etal-2024-low
+
Teaching Probabilistic Logical Reasoning to Transformers
@@ -1335,6 +1412,7 @@
2024.findings-eacl.112.software.zip
2024.findings-eacl.112.note.zip
nafar-etal-2024-teaching
+
On Measuring Context Utilization in Document-Level MT Systems
@@ -1344,6 +1422,7 @@
Document-level translation models are usually evaluated using general metrics such as BLEU, which are not informative about the benefits of context. Current work on context-aware evaluation, such as contrastive methods, only measures translation accuracy on words that need context for disambiguation. Such measures cannot reveal whether the translation model uses the correct supporting context. We propose to complement accuracy-based evaluation with measures of context utilization. We find that perturbation-based analysis (comparing models’ performance when provided with correct versus random context) is an effective measure of overall context utilization. For a finer-grained phenomenon-specific evaluation, we propose to measure how much the supporting context contributes to handling context-dependent discourse phenomena. We show that automatically-annotated supporting context gives similar conclusions to human-annotated context and can be used as an alternative for cases where human annotations are not available. Finally, we highlight the importance of using discourse-rich datasets when assessing context utilization.
2024.findings-eacl.113
mohammed-niculae-2024-measuring
+
Solving NLP Problems through Human-System Collaboration: A Discussion-based Approach
@@ -1365,6 +1444,7 @@
Recently, encoder-only pre-trained models such as BERT have been successfully applied in automated essay scoring (AES) to predict a single overall score. However, studies have yet to explore these models in multi-trait AES, possibly due to the inefficiency of replicating BERT-based models for each trait. Breaking away from the existing sole use of *encoder*, we propose an autoregressive prediction of multi-trait scores (ArTS), incorporating a *decoding* process by leveraging the pre-trained T5. Unlike prior regression or classification methods, we redefine AES as a score-generation task, allowing a single model to predict multiple scores. During decoding, the subsequent trait prediction can benefit by conditioning on the preceding trait scores. Experimental results proved the efficacy of ArTS, showing over 5% average improvements in both prompts and traits.
2024.findings-eacl.115
do-etal-2024-autoregressive
+
CMA-R: Causal Mediation Analysis for Explaining Rumour Detection
@@ -1384,6 +1464,7 @@
Terminology-constrained NMT systems facilitate the forced translation of domain-specific vocabulary. A notable method in this context is the “copy-and-inflect” approach, which appends the target term lemmas of constraints to their corresponding source terms in the input sentence. In this work, we propose a novel adaptation of the “copy-and-inflect” method, referred to as “morph-masking”. Our method involves masking the source terms of the constraints from the input sentence while retaining essential grammatical information. Our approach is based on the hypothesis that “copy-and-inflect” systems have access to both source and target terms, allowing them to generate the correct surface form of the constraint by either translating the source term itself or properly inflecting the target term lemma. Through extensive validation of our method in two translation directions with different levels of source morphological complexity, Basque to Spanish and English to German, we have demonstrated that “morph-masking” is capable of providing a harder constraint signal, resulting in a notable improvement over the “copy-and-inflect” method (up to 38% in term accuracy), especially in challenging constraint scenarios.
2024.findings-eacl.117
corral-saralegi-2024-morphology
+
Improving Backchannel Prediction Leveraging Sequential and Attentive Context Awareness
@@ -1396,6 +1477,7 @@
2024.findings-eacl.118
2024.findings-eacl.118.software.zip
park-etal-2024-improving
+
SENSE-LM : A Synergy between a Language Model and Sensorimotor Representations for Auditory and Olfactory Information Extraction
@@ -1407,6 +1489,7 @@
The five human senses – vision, taste, smell, hearing, and touch – are key concepts that shape human perception of the world. The extraction of sensory references (i.e., expressions that evoke the presence of a sensory experience) in textual corpora is a challenge of high interest, with many applications in various areas. In this paper, we propose SENSE-LM, an information extraction system tailored for the discovery of sensory references in large collections of textual documents. Based on the novel idea of combining the strength of large language models and linguistic resources such as sensorimotor norms, it addresses the task of sensory information extraction at a coarse-grained (sentence binary classification) and fine-grained (sensory term extraction) level. Our evaluation of SENSE-LM for two sensory functions, Olfaction and Audition, and comparison with state-of-the-art methods emphasize a significant leap forward in automating these complex tasks.
2024.findings-eacl.119
boscher-etal-2024-sense
+
Analyzing the Role of Part-of-Speech in Code-Switching: A Corpus-Based Study
@@ -1416,6 +1499,7 @@
Code-switching (CS) is a common linguistic phenomenon wherein speakers fluidly transition between languages in conversation. While the cognitive processes driving CS remain a complex domain, earlier investigations have shed light on its multifaceted triggers. This study delves into the influence of Part-of-Speech (POS) on the propensity of bilinguals to engage in CS, employing a comprehensive analysis of Spanish-English and Mandarin-English corpora. Compared with prior research, our findings not only affirm the existence of a statistically significant connection between POS and the likelihood of CS across language pairs, but notably find this relationship exhibits its maximum strength in proximity to CS instances, progressively diminishing as tokens distance themselves from these CS points.
2024.findings-eacl.120
chi-bell-2024-analyzing
+
In-Contextual Gender Bias Suppression for Large Language Models
@@ -1426,6 +1510,7 @@
Despite their impressive performance in a wide range of NLP tasks, Large Language Models (LLMs) have been reported to encode worrying levels of gender biases. Prior work has proposed debiasing methods that require human labelled examples, data augmentation and fine-tuning of LLMs, which are computationally costly. Moreover, one might not even have access to the model parameters for performing debiasing, such as in the case of closed LLMs like GPT-4. To address this challenge, we propose bias suppression that prevents biased generations of LLMs by simply providing textual preambles constructed from manually designed templates and real-world statistics, without access to model parameters. We show that, using the CrowsPairs dataset, our textual preambles covering counterfactual statements can suppress gender biases in English LLMs such as LLaMA2. Moreover, we find that gender-neutral descriptions of gender-biased objects can also suppress their gender biases. Finally, we show that bias suppression has an acceptable adverse effect on downstream task performance with HellaSwag and COPA.
2024.findings-eacl.121
oba-etal-2024-contextual
+
Parameter-Efficient Fine-Tuning: Is There An Optimal Subset of Parameters to Tune?
@@ -1435,6 +1520,7 @@
The ever-growing size of pretrained language models (PLM) presents a significant challenge for efficiently fine-tuning and deploying these models for diverse sets of tasks within memory-constrained environments. In light of this, recent research has illuminated the possibility of selectively updating only a small subset of a model’s parameters during the fine-tuning process. Since no new parameters or modules are added, these methods retain the inference speed of the original model and come at no additional computational cost. However, an open question pertains to which subset of parameters should best be tuned to maximize task performance and generalizability. To investigate, this paper presents comprehensive experiments covering a large spectrum of subset selection strategies. We comparatively evaluate their impact on model performance as well as the resulting model’s capability to generalize to different tasks. Surprisingly, we find that the gains achieved in performance by elaborate selection strategies are, at best, marginal when compared to the outcomes obtained by tuning a random selection of parameter subsets. Our experiments also indicate that selection-based tuning impairs generalizability to new tasks.
2024.findings-eacl.122
ploner-akbik-2024-parameter
+
Contextualized Topic Coherence Metrics
@@ -1449,6 +1535,7 @@
2024.findings-eacl.123
2024.findings-eacl.123.software.zip
rahimi-etal-2024-contextualized
+
ProMISe: A Proactive Multi-turn Dialogue Dataset for Information-seeking Intent Resolution
@@ -1491,6 +1578,7 @@
Context-aware Machine Translation aims to improve translations of sentences by incorporating surrounding sentences as context. Towards this task, two main architectures have been applied, namely single-encoder (based on concatenation) and multi-encoder models. In this study, we show that a special case of multi-encoder architecture, where the latent representation of the source sentence is cached and reused as the context in the next step, achieves higher accuracy on the contrastive datasets (where the models have to rank the correct translation among the provided sentences) and comparable BLEU and COMET scores as the single- and multi-encoder approaches. Furthermore, we investigate the application of Sequence Shortening to the cached representations. We test three pooling-based shortening techniques and introduce two novel methods - Latent Grouping and Latent Selecting, where the network learns to group tokens or selects the tokens to be cached as context. Our experiments show that the two methods achieve competitive BLEU and COMET scores and accuracies on the contrastive datasets to the other tested methods while potentially allowing for higher interpretability and reducing the growth of memory requirements with increased context size.
2024.findings-eacl.127
maka-etal-2024-sequence
+
Jigsaw Pieces of Meaning: Modeling Discourse Coherence with Informed Negative Sample Synthesis
@@ -1509,6 +1597,7 @@
Quantifying uncertainty in automatically generated text is important for letting humans check potential hallucinations and making systems more reliable. Conformal prediction is an attractive framework to provide predictions imbued with statistical guarantees, however, its application to text generation is challenging since any i.i.d. assumptions are not realistic. In this paper, we bridge this gap by leveraging recent results on *non-exchangeable* conformal prediction, which still ensures bounds on coverage. The result, *non-exchangeable conformal nucleus sampling*, is a novel extension of the conformal prediction framework to generation based on nearest neighbors. Our method can be used post-hoc for an arbitrary model without extra training and supplies token-level, calibrated prediction sets equipped with statistical guarantees. Experiments in machine translation and language modeling show encouraging results in generation quality. By also producing tighter prediction sets with good coverage, we thus give a more theoretically principled way to perform sampling with conformal guarantees.
2024.findings-eacl.129
ulmer-etal-2024-non
+
Evidentiality-aware Retrieval for Overcoming Abstractiveness in Open-Domain Question Answering
@@ -1523,6 +1612,7 @@
The long-standing goal of dense retrievers in abstractive open-domain question answering (ODQA) tasks is to learn to capture evidence passages among relevant passages for any given query, such that the reader produces factually correct outputs from evidence passages. One of the key challenges is the insufficient amount of training data with the supervision of the answerability of the passages. Recent studies rely on iterative pipelines to annotate answerability using signals from the reader, but their high computational costs hamper practical applications. In this paper, we instead focus on a data-driven approach and propose Evidentiality-Aware Dense Passage Retrieval (EADPR), which leverages synthetic distractor samples to learn to discriminate evidence passages from distractors. We conduct extensive experiments to validate the effectiveness of our proposed method on multiple abstractive ODQA tasks.
2024.findings-eacl.130
song-etal-2024-evidentiality
+
Self-training Strategies for Sentiment Analysis: An Empirical Study
@@ -1535,6 +1625,7 @@
Sentiment analysis is a crucial task in natural language processing that involves identifying and extracting subjective sentiment from text. Self-training has recently emerged as an economical and efficient technique for developing sentiment analysis models by leveraging a small amount of labeled data and a large amount of unlabeled data. However, given a set of training data, how to utilize them to conduct self-training makes a significant difference in the final performance of the model. We refer to this methodology as the self-training strategy. In this paper, we present an empirical study of various self-training strategies for sentiment analysis. First, we investigate the influence of the self-training strategy and hyper-parameters on the performance of traditional small language models (SLMs) in various few-shot settings. Second, we also explore the feasibility of leveraging large language models (LLMs) to help self-training. We propose and empirically compare several self-training strategies with the intervention of LLMs. Extensive experiments are conducted on three real-world sentiment analysis datasets.
2024.findings-eacl.131
liu-etal-2024-self
+
Language is All a Graph Needs
@@ -1573,6 +1664,7 @@
Using large language models (LMs) for query or document expansion can improve generalization in information retrieval. However, it is unknown whether these techniques are universally beneficial or only effective in specific settings, such as for particular retrieval models, dataset domains, or query types. To answer this, we conduct the first comprehensive analysis of LM-based expansion. We find that there exists a strong negative correlation between retriever performance and gains from expansion: expansion improves scores for weaker models, but generally harms stronger models. We show this trend holds across a set of eleven expansion techniques, twelve datasets with diverse distribution shifts, and twenty-four retrieval models. Through qualitative error analysis, we hypothesize that although expansions provide extra information (potentially improving recall), they add additional noise that makes it difficult to discern between the top relevant documents (thus introducing false positives). Our results suggest the following recipe: use expansions for weaker models or when the target dataset significantly differs from the training corpus in format; otherwise, avoid expansions to keep the relevance signal clear.
2024.findings-eacl.134
weller-etal-2024-generative
+
Can Large Language Models Understand Context?
@@ -1589,6 +1681,7 @@
Understanding context is key to understanding human language, an ability which Large Language Models (LLMs) have been increasingly seen to demonstrate to an impressive extent. However, though the evaluation of LLMs encompasses various domains within the realm of Natural Language Processing, limited attention has been paid to probing their linguistic capability of understanding contextual features. This paper introduces a context understanding benchmark by adapting existing datasets to suit the evaluation of generative models. This benchmark comprises four distinct tasks and nine datasets, all featuring prompts designed to assess the models’ ability to understand context. First, we evaluate the performance of LLMs under the in-context learning pretraining scenario. Experimental results indicate that pre-trained dense models struggle with understanding more nuanced contextual features when compared to state-of-the-art fine-tuned models. Second, as LLM compression holds growing significance in both research and real-world applications, we assess the context understanding of quantized models under in-context-learning settings. We find that 3-bit post-training quantization leads to varying degrees of performance reduction on our benchmark. We conduct an extensive analysis of these scenarios to substantiate our experimental results.
2024.findings-eacl.135
zhu-etal-2024-large
+
Let’s Negotiate! A Survey of Negotiation Dialogue Systems
@@ -1637,6 +1730,7 @@
In the months since its release, ChatGPT and its underlying model, GPT3.5, have garnered massive attention, due to their potent mix of capability and accessibility. While a niche industry of papers has emerged examining the scope of capabilities these models possess, language — whether natural or stylized like code — has been the vehicle to exchange information with the network. Drawing inspiration from the multi-modal knowledge we’d expect an agent with true understanding to possess, we examine GPT3.5’s aptitude for visual tasks, where the inputs feature ASCII-art without overt distillation into a lingual summary. In particular, we scrutinize its performance on carefully designed image recognition and generation tasks. An extended version of this write-up is available at: https://arxiv.org/abs/2307.16806
2024.findings-eacl.139
bayani-2024-testing
+
Cross-lingual Editing in Multilingual Language Models
@@ -1647,6 +1741,7 @@
The training of large language models (LLMs) necessitates substantial data and computational resources, and updating outdated LLMs entails significant efforts and resources. While numerous model editing techniques (METs) have emerged to efficiently update model outputs without retraining, their effectiveness in multilingual LLMs, where knowledge is stored in diverse languages, remains an underexplored research area. This research paper introduces the cross-lingual model editing (XME) paradigm, wherein a fact is edited in one language, and the subsequent update propagation is observed across other languages. To investigate the XME paradigm, we conducted experiments using BLOOM, mBERT, and XLM-RoBERTa across two writing scripts: Latin (English, French, and Spanish) and Indic (Hindi, Gujarati, and Bengali). The results reveal notable performance limitations of state-of-the-art METs under the XME setting, mainly when the languages involved belong to two distinct script families. These findings highlight the need for further research and development of XME techniques to address these challenges. For more comprehensive information, the dataset used in this research and the associated code are publicly available at the following [URL](https://github.com/lingo-iitgn/XME).
2024.findings-eacl.140
beniwal-etal-2024-cross
+
Sorted LLaMA: Unlocking the Potential of Intermediate Layers of Large Language Models for Dynamic Inference
@@ -1660,6 +1755,7 @@
Large language models (LLMs) have revolutionized natural language processing (NLP) by excelling at understanding and generating human-like text. However, their widespread deployment can be prohibitively expensive. SortedNet is a recent training technique for enabling dynamic inference by leveraging the modularity in networks and sorting sub-models based on computation/accuracy in a nested manner. We extend SortedNet to generative NLP tasks, making large language models dynamic without any Pre-Training and by only replacing Standard Fine-Tuning (SFT) with Sorted Fine-Tuning (SoFT). Our approach boosts model efficiency, eliminating the need for multiple models for various scenarios during inference. We show that this approach can unlock the potential of intermediate layers of transformers in generating the target output. Our sub-models remain integral components of the original model, minimizing storage requirements and transition costs between different computational/latency budgets. The efficacy of our proposed method was demonstrated by applying it to tune LLaMA 2 13B on the Stanford Alpaca dataset for instruction following and TriviaQA for closed-book question answering. Our results show the superior performance of sub-models in comparison to Standard Fine-Tuning and SFT+ICT (Early-Exit), all achieved with very efficient tuning and without additional memory usage during inference.
2024.findings-eacl.141
kavehzadeh-etal-2024-sorted
+
AccentFold: A Journey through African Accents for Zero-Shot ASR Adaptation to Target Accents
@@ -1683,6 +1779,7 @@
Long prompts present a significant challenge for practical LLM-based systems that need to operate with low latency and limited resources. We investigate prompt compression for zero-shot dialogue systems that learn to use unseen APIs directly in-context from their documentation, which may take up hundreds of prompt tokens per API. We start from a recently introduced approach (Mu et al., 2023) that learns to compress the prompt into a few “gist token” activations during finetuning. However, this simple idea is ineffective in compressing API documentation, resulting in low accuracy compared to the baseline using an uncompressed prompt. In this work, we introduce two major improvements. First, we specialize gist tokens for different hierarchies within an API: we use one \mathrm{Gist}_{\mathrm{arg}} token for compressing an argument and one \mathrm{Gist}_{\mathrm{value}} token for compressing an acceptable value of a categorical argument. We then dynamically reveal \mathrm{Gist}_{\mathrm{value}} tokens only when they are needed. Second, we add a reconstruction loss to predict the API documentation from the gist tokens. On multiple API-calling tasks, our proposed system keeps the simplicity, efficiency, and large compression factor (20x on SGD) of the gist token approach while achieving significantly better accuracy.
2024.findings-eacl.143
jiang-etal-2024-hierarchical
+
Fine-tuning CLIP Text Encoders with Two-step Paraphrasing
@@ -1711,6 +1808,7 @@
2024.findings-eacl.145
2024.findings-eacl.145.software.zip
moon-etal-2024-generative
+
Dive into the Chasm: Probing the Gap between In- and Cross-Topic Generalization
@@ -1721,6 +1819,7 @@
Pre-trained language models (PLMs) perform well in In-Topic setups, where training and testing data come from the same topics. However, they face challenges in Cross-Topic scenarios where testing data is derived from distinct topics. This paper analyzes various PLMs with three probing-based experiments to better understand the reasons behind such generalization gaps. For the first time, we demonstrate that the extent of these generalization gaps and the sensitivity to token-level interventions vary significantly across PLMs. By evaluating large language models (LLMs), we show the usefulness of our analysis for these recent models. Overall, we observe that diverse pre-training objectives and architectural regularization contribute to more robust PLMs and mitigate generalization gaps. Our research contributes to a deeper understanding and comparison of language models across different generalization scenarios.
2024.findings-eacl.146
waldis-etal-2024-dive
+
LLM-GEm: Large Language Model-Guided Prediction of People’s Empathy Levels towards Newspaper Article
@@ -1736,6 +1835,7 @@
hasan-etal-2024-llm
Minor updates.
+
ICE-Score: Instructing Large Language Models to Evaluate Code
@@ -1759,6 +1859,7 @@
2024.findings-eacl.149
2024.findings-eacl.149.note.zip
kim-etal-2024-crese
+
BMX: Boosting Natural Language Generation Metrics with Explainability
@@ -1771,6 +1872,7 @@
2024.findings-eacl.150.software.zip
2024.findings-eacl.150.note.zip
leiter-etal-2024-bmx
+
Joint Inference of Retrieval and Generation for Passage Re-ranking
@@ -1782,6 +1884,7 @@
2024.findings-eacl.151
2024.findings-eacl.151.software.zip
fang-etal-2024-joint
+
DialogStudio: Towards Richest and Most Diverse Unified Dataset Collection for Conversational AI
@@ -1813,6 +1916,7 @@
Linguistic features make a strong contribution to the automatic assessment of text readability (ARA). They have been one of the anchors between the computational and theoretical models. With developments in the ARA field, research has moved to Deep Learning (DL). In an attempt to reconcile the mixed results reported in this context, we present a systematic comparison of 6 hybrid approaches along with standard Machine Learning and DL approaches, on 4 corpora (different languages and target audiences). The various experiments clearly highlighted two rather simple hybridization methods (soft label and simple concatenation). They also appear to be the most robust on smaller datasets and across various tasks and languages. This study stands out as the first to systematically compare different architectures and approaches to feature hybridization in DL, as well as comparing performance in terms of two languages and two target audiences of the text, which leads to a clearer pattern of results.
2024.findings-eacl.153
wilkens-etal-2024-exploring
+
Establishing degrees of closeness between audio recordings along different dimensions using large-scale cross-lingual models
@@ -1825,6 +1929,7 @@
In the highly constrained context of low-resource language studies, we explore vector representations of speech from a pretrained model to determine their level of abstraction with regard to the audio signal. We propose a new unsupervised method using ABX tests on audio recordings with carefully curated metadata to shed light on the type of information present in the representations. ABX tests determine whether the representations computed by a multilingual speech model encode a given characteristic. Three experiments are devised: one on room acoustics aspects, one on linguistic genre, and one on phonetic aspects. The results confirm that the representations extracted from recordings with different linguistic/extra-linguistic characteristics differ along the same lines. Embedding more audio signal in one vector better discriminates extra-linguistic characteristics, whereas shorter snippets are better to distinguish segmental information. The method is fully unsupervised, potentially opening new research avenues for comparative work on under-documented languages.
2024.findings-eacl.154
fily-etal-2024-establishing
+
The Queen of England is not England’s Queen: On the Lack of Factual Coherency in PLMs
@@ -1835,6 +1940,7 @@
Factual knowledge encoded in Pre-trained Language Models (PLMs) enriches their representations and justifies their use as knowledge bases. Previous work has focused on probing PLMs for factual knowledge by measuring how often they can correctly predict an _object_ entity given a subject and a relation, and improving fact retrieval by optimizing the prompts used for querying PLMs. In this work, we consider a complementary aspect, namely the coherency of factual knowledge in PLMs, i.e., how often can PLMs predict the _subject_ entity given its initial prediction of the object entity. This goes beyond evaluating how much PLMs know, and focuses on the internal state of knowledge inside them. Our results indicate that PLMs have low coherency using manually written, optimized and paraphrased prompts, but including an evidence paragraph leads to substantial improvement. This shows that PLMs fail to model inverse relations and need further enhancements to be able to handle retrieving facts from their parameters in a coherent manner, and to be considered as knowledge bases.
2024.findings-eacl.155
youssef-etal-2024-queen
+
HierarchyNet: Learning to Summarize Source Code with Heterogeneous Representations
@@ -1847,6 +1953,7 @@
Code representation is important to machine learning models in code-related applications. Existing code summarization approaches primarily leverage Abstract Syntax Trees (ASTs) and sequential information from source code to generate code summaries while often overlooking the critical consideration of the interplay of dependencies among code elements and code hierarchy. However, effective summarization necessitates a holistic analysis of code snippets from three distinct aspects: lexical, syntactic, and semantic information. In this paper, we propose a novel code summarization approach utilizing Heterogeneous Code Representations (HCRs) and our specially designed HierarchyNet. HCRs adeptly capture essential code features at lexical, syntactic, and semantic levels within a hierarchical structure. HierarchyNet processes each layer of the HCR separately, employing a Heterogeneous Graph Transformer, a Tree-based CNN, and a Transformer Encoder. In addition, HierarchyNet demonstrates superior performance compared to fine-tuned pre-trained models, including CodeT5 and CodeBERT, as well as large language models that employ zero/few-shot settings, such as CodeLlama, StarCoder, and CodeGen. Implementation details can be found at https://github.com/FSoft-AI4Code/HierarchyNet.
2024.findings-eacl.156
nguyen-etal-2024-hierarchynet
+
Understanding the effects of language-specific class imbalance in multilingual fine-tuning
@@ -1856,6 +1963,7 @@
We study the effect of one type of imbalance often present in real-life multilingual classification datasets: an uneven distribution of labels across languages. We show evidence that fine-tuning a transformer-based Large Language Model (LLM) on a dataset with this imbalance leads to worse performance, a more pronounced separation of languages in the latent space, and the promotion of uninformative features. We modify the traditional class weighting approach to imbalance by calculating class weights separately for each language and show that this helps mitigate those detrimental effects. These results create awareness of the negative effects of language-specific class imbalance in multilingual fine-tuning and the way in which the model learns to rely on the separation of languages to perform the task.
2024.findings-eacl.157
jung-plas-2024-understanding
+
NL2Formula: Generating Spreadsheet Formulas from Natural Language Queries
diff --git a/data/xml/2024.finnlp.xml b/data/xml/2024.finnlp.xml
new file mode 100644
index 0000000000..ffdb98ac98
--- /dev/null
+++ b/data/xml/2024.finnlp.xml
@@ -0,0 +1,410 @@
+
+
+
+
+ Proceedings of the Joint Workshop of the 7th Financial Technology and Natural Language Processing, the 5th Knowledge Discovery from Unstructured Data in Financial Services, and the 4th Workshop on Economics and Natural Language Processing @ LREC-COLING 2024
+ Chung-ChiChen
+ XiaomoLiu
+ UdoHahn
+ ArminehNourbakhsh
+ ZhiqiangMa
+ ChareseSmiley
+ VeroniqueHoste
+ Sanjiv RanjanDas
+ ManlingLi
+ MohammadGhassemi
+ Hen-HsenHuang
+ HiroyaTakamura
+ Hsin-HsiChen
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.finnlp-1
+ finnlp
+ ws
+
+
+ 2024.finnlp-1.0
+ finnlp-2024-joint
+
+
+ Construction of a Japanese Financial Benchmark for Large Language Models
+ MasanoriHirano
+ 1–9
+ With the recent development of large language models (LLMs), models that focus on certain domains and languages have been discussed for their necessity. There is also a growing need for benchmarks to evaluate the performance of current LLMs in each domain. Therefore, in this study, we constructed a benchmark comprising multiple tasks specific to the Japanese and financial domains and performed benchmark measurements on some models. Consequently, we confirmed that GPT-4 is currently outstanding, and that the constructed benchmarks function effectively. According to our analysis, our benchmark can differentiate benchmark scores among models in all performance ranges by combining tasks with different difficulties.
+ 2024.finnlp-1.1
+ hirano-2024-construction-japanese
+
+
+ KRX Bench: Automating Financial Benchmark Creation via Large Language Models
+ GuijinSon
+ HyunjunJeon
+ ChamiHwang
+ HanearlJung
+ 10–20
+ In this work, we introduce KRX-Bench, an automated pipeline for creating financial benchmarks via GPT-4. To demonstrate the effectiveness of the pipeline, we create KRX-Bench-POC, a benchmark assessing the knowledge of LLMs in real-world companies. This dataset comprises 1,002 questions, each focusing on companies across the U.S., Japanese, and Korean stock markets. We make our pipeline and dataset publicly available and integrate the evaluation code into EleutherAI’s Language Model Evaluation Harness.
+ 2024.finnlp-1.2
+ son-etal-2024-krx-bench
+
+
+ BLU-SynTra: Distinguish Synergies and Trade-offs between Sustainable Development Goals Using Small Language Models
+ LorisBergeron
+ JeromeFrancois
+ RaduState
+ JeanHilger
+ 21–33
+ Since the United Nations defined the Sustainable Development Goals, studies have shown that these goals are interlinked in different ways. The concept of SDG interlinkages refers to the complex network of interactions existing within and between the SDGs themselves. These interactions are referred to as synergies and trade-offs. Synergies represent positive interactions where the progress of one SDG contributes positively to the progress of another. On the other hand, trade-offs are negative interactions where the progress of one SDG has a negative impact on another. However, evaluating such interlinkages is a complex task, not only because of the multidimensional nature of SDGs, but also because it is highly exposed to personal interpretation bias and technical limitations. Recent studies are mainly based on expert judgements, literature reviews, sentiment or data analysis. To remedy these limitations, we propose the use of Small Language Models in addition to advanced Retrieval Augmented Generation to distinguish synergies and trade-offs between SDGs. In order to validate our results, we have drawn on the study carried out by the European Commission’s Joint Research Centre which provides a database of interlinkages labelled according to the presence of synergies or trade-offs.
+ 2024.finnlp-1.3
+ bergeron-etal-2024-blu-syntra
+
+
+ Assessing the Impact of ESG-Related News on Stock Trading in the Indonesian Market: A Text Similarity Framework Approach
+ OkirizaWibisono
+ Ali AkbarSeptiandri
+ Reinhard DenisNajogie
+ 34–39
+ Environmental, Social, and Governance (ESG) perspectives have become integral to corporate decision-making and investment, with global regulatory mandates for ESG disclosure. The reliability of ESG ratings, crucial for assessing corporate sustainability practices, is compromised by inconsistencies and discrepancies across and within rating agencies, casting doubt on their effectiveness in reflecting true ESG performance and impact on firm valuations. While there have been studies using ESG-related news articles to measure their effect on stock trading, none have studied the Indonesian stock market. To address this gap, we developed a text similarity framework to identify ESG-related news articles based on Sustainability Accounting Standards Board (SASB) Standards without the need for manual annotations. Using news articles from one of the prominent business media outlets in Indonesia and an event study method, we found that 17.9% of the 18,431 environment-related news articles are followed by increased stock trading on the firms mentioned in the news, compared to 16.0% on random-dates datasets of the same size and firm composition. This approach is intended as a simpler alternative to building an ESG-specific news labeling model or using third-party data providers, although further analyses may be required to evaluate its robustness.
+ 2024.finnlp-1.4
+ wibisono-etal-2024-assessing-impact
+
+
+ Development and Evaluation of a German Language Model for the Financial Domain
+ NataKozaeva
+ SerhiiHamotskyi
+ ChristianHanig
+ 40–49
+ Recent advancements in self-supervised pre-training of Language Models (LMs) have significantly improved their performance across a wide range of Natural Language Processing (NLP) tasks. Yet, the adaptation of these models to specialized domains remains a critical endeavor, as it enables the models to grasp domain-specific nuances, terminology, and patterns more effectively, thereby enhancing their utility in specialized contexts. This paper presents an in-depth investigation into the training and fine-tuning of German language models specifically for the financial sector. We construct various datasets for training and fine-tuning to examine the impact of different data construction strategies on the models’ performance. Our study provides detailed insights into essential pre-processing steps, including text extraction from PDF documents and language identification, to evaluate their influence on the performance of the language models. Addressing the scarcity of resources in the German financial domain, we also introduce a German Text Classification benchmark dataset, aimed at fostering further research and development in this area. The performance of the trained models is evaluated on two domain-specific tasks, demonstrating that fine-tuning with domain-specific data improves model outcomes, even with limited amounts of domain-specific data.
+ 2024.finnlp-1.5
+ kozaeva-etal-2024-development-evaluation
+
+
+ Evaluating Multilingual Language Models for Cross-Lingual ESG Issue Identification
+ Wing YanLi
+ EmmanueleChersoni
+ Cindy Sing BikNgai
+ 50–58
+ The automation of information extraction from ESG reports has recently become a topic of increasing interest in the Natural Language Processing community. While such information is highly relevant for socially responsible investments, identifying the specific issues discussed in a corporate social responsibility report is one of the first steps in an information extraction pipeline. In this paper, we evaluate methods for tackling the Multilingual Environmental, Social and Governance (ESG) Issue Identification Task. Our experiments use existing datasets in English, French and Chinese with a unified label set. Leveraging multilingual language models, we compare two approaches that are commonly adopted for the given task: off-the-shelf and fine-tuning. We show that fine-tuning models end-to-end is more robust than off-the-shelf methods. Additionally, translating text into the same language has negligible performance benefits.
+ 2024.finnlp-1.6
+ li-etal-2024-evaluating-multilingual
+
+
+ Modal-adaptive Knowledge-enhanced Graph-based Financial Prediction from Monetary Policy Conference Calls with LLM
+ Kun Ouyang
+ Yi Liu
+ Shicheng Li
+ Ruihan Bao
+ Keiko Harimoto
+ Xu Sun
+ 59–69
+ Financial prediction from Monetary Policy Conference (MPC) calls is a new yet challenging task, which aims to predict the price movement and volatility of specific financial assets by analyzing multimodal information including text, video, and audio. Although the existing work has achieved great success using cross-modal transformer blocks, it overlooks the potential external financial knowledge, the varying contributions of different modalities to financial prediction, as well as the innate relations among different financial assets. To tackle these limitations, we propose a novel Modal-Adaptive kNowledge-enhAnced Graph-basEd financial pRediction scheme, named MANAGER. Specifically, MANAGER resorts to FinDKG to obtain the external related knowledge for the input text. Meanwhile, MANAGER adopts BEiT-3 and Hidden-unit BERT (HuBERT) to extract the video and audio features, respectively. Thereafter, MANAGER introduces a novel knowledge-enhanced cross-modal graph that fully characterizes the semantic relations among text, external knowledge, video and audio, to adaptively utilize the information in different modalities, with ChatGLM2 as the backbone. Extensive experiments on the publicly available Monopoly dataset verify the superiority of our model over cutting-edge methods.
+ 2024.finnlp-1.7
+ ouyang-etal-2024-modal-adaptive
+
+
+ NetZeroFacts: Two-Stage Emission Information Extraction from Company Reports
+ Marco Wrzalik
+ Florian Faust
+ Simon Sieber
+ Adrian Ulges
+ 70–84
+ We address the challenge of efficiently extracting structured emission information, specifically emission goals, from company reports. Leveraging the potential of Large Language Models (LLMs), we propose a two-stage pipeline that first filters and retrieves potentially relevant passages and then extracts structured information from them using a generative model. We contribute an annotated dataset covering over 14,000 text passages, from which we extracted 739 expert-annotated facts. On this dataset, we investigate the accuracy, efficiency and limitations of LLM-based emission information extraction, evaluate different retrieval techniques, and assess efficiency gains for human analysts by using the proposed pipeline. Our research demonstrates the promise of LLM technology in addressing the intricate task of sustainable emission data extraction from company reports.
+ 2024.finnlp-1.8
+ wrzalik-etal-2024-netzerofacts-two
+
+
+ FB-GAN: A Novel Neural Sentiment-Enhanced Model for Stock Price Prediction
+ Jainendra Kumar Jain
+ Ruchit Agrawal
+ 85–93
+ Predicting stock prices remains a significant challenge in financial markets. This study explores existing stock price prediction systems, identifies their strengths and weaknesses, and proposes a novel method for stock price prediction that leverages a state-of-the-art neural network framework, combining the BERT language model for sentiment analysis on news articles and the GAN model for stock price prediction. We introduce the FB-GAN model, an ensemble model that leverages stock price history and market sentiment scores for more accurate stock price prediction, and propose effective strategies to capture the market sentiment. We conduct experiments on stock price prediction for five major equities (Amazon, Apple, Microsoft, Nvidia, and Adobe), and compare the performance obtained by our proposed model against the existing state-of-the-art baseline model. The results demonstrate that our proposed model outperforms existing models across the five major equities. We demonstrate that the strategic incorporation of market sentiment, using both headlines as well as summaries of news articles, significantly enhances the accuracy and robustness of stock price prediction.
+ 2024.finnlp-1.9
+ jain-agrawal-2024-fb-gan
+
+
+ Unveiling Currency Market Dynamics: Leveraging Federal Reserve Communications for Strategic Investment Insights
+ Martina Menzio
+ Davide Paris
+ Elisabetta Fersini
+ 94–102
+ The purpose of this paper is to extract market signals for the major currencies (EUR, USD, GBP, JPY, CNY) by analyzing Federal Reserve System (FED) minutes and speeches and, consequently, to suggest that investors go long/short or remain neutral, based on the causal relationships between FED sentiment and currency exchange rates. To this purpose, we aim to verify the hypothesis that currency market dynamics follow a trend that is subject to the sentiment of FED minutes and speeches related to specific relevant currencies. The paper highlights two main findings: (1) the sentiment expressed in the FED minutes has a strong influence on the predictability of major currency trends, and (2) the sentiment over time Granger-causes the exchange rate of currencies not only immediately but also at increasing lags, with a monotonically decreasing impact.
+ 2024.finnlp-1.10
+ menzio-etal-2024-unveiling-currency
+
+
+ Analysis of Material Facts on Financial Assets: A Generative AI Approach
+ Gabriel Assis
+ Daniela Vianna
+ Gisele L. Pappa
+ Alexandre Plastino
+ Wagner Meira Jr
+ Altigran Soares da Silva
+ Aline Paes
+ 103–118
+ Material facts (MF) are crucial and obligatory disclosures that can significantly influence asset values. Following their release, financial analysts embark on the meticulous and highly specialized task of crafting analyses to shed light on their impact on company assets, a challenge elevated by the daily amount of MFs released. Generative AI, with its demonstrated power of crafting coherent text, emerges as a promising solution to this task. However, while these analyses must incorporate the MF, they must also transcend it, enhancing it with vital background information, valuable and grounded recommendations, prospects, potential risks, and their underlying reasoning. In this paper, we approach this task as an instance of controllable text generation, aiming to ensure adherence to the MF and other pivotal attributes as control elements. We first explore language models’ capacity to manage this task by embedding those elements into prompts and engaging popular chatbots. A bilingual proof of concept underscores both the potential and the challenges of applying generative AI techniques to this task.
+ 2024.finnlp-1.11
+ assis-etal-2024-analysis-material
+
+
+ Exploring Large Language Models in Financial Argument Relation Identification
+ Yasser Otiefy
+ Alaa Alhamzeh
+ 119–129
+ In the dynamic landscape of financial analytics, the argumentation within Earnings Conference Calls (ECCs) provides valuable insights for investors and market participants. This paper delves into automatic relation identification between argument components in this type of data, a poorly studied task in the literature. To tackle this challenge, we empirically examined and analysed a wide range of open-source models, as well as the Generative Pre-trained Transformer GPT-4. On the one hand, our experiments with open-source models spanned general-purpose models, debate-fine-tuned models, and financial-fine-tuned models. On the other hand, we assessed the performance of GPT-4 zero-shot learning on a financial argumentation dataset (FinArg). Our findings show that a smaller open-source model, fine-tuned on relevant data, can perform on par with a much larger general-purpose one, showing the value of enriching the local embeddings with the semantic context of the data. However, GPT-4 demonstrated superior performance, with an F1-score of 0.81, even with no given samples or shots. In this paper, we detail our data, models and experimental setup. We also provide further performance analysis from different aspects.
+ 2024.finnlp-1.12
+ otiefy-alhamzeh-2024-exploring-large
+
+
+ Keyword-based Annotation of Visually-Rich Document Content for Trend and Risk Analysis Using Large Language Models
+ Giuseppe Gallipoli
+ Simone Papicchio
+ Lorenzo Vaiani
+ Luca Cagliero
+ Arianna Miola
+ Daniele Borghi
+ 130–136
+ In the banking and finance sectors, members of the business units focused on Trend and Risk Analysis daily process internal and external visually-rich documents including text, images, and tables. Given a facet (i.e., topic) of interest, they are particularly interested in retrieving the top trending keywords related to it and then use them to annotate the most relevant document elements (e.g., text paragraphs, images or tables). In this paper, we explore the use of both open-source and proprietary Large Language Models to automatically generate lists of facet-relevant keywords, automatically produce free-text descriptions of both keywords and multimedia document content, and then annotate documents by leveraging textual similarity approaches. The preliminary results, achieved on English and Italian documents, show that OpenAI GPT-4 achieves superior performance in keyword description generation and multimedia content annotation, while the open-source Meta AI Llama2 model turns out to be highly competitive in generating additional keywords.
+ 2024.finnlp-1.13
+ gallipoli-etal-2024-keyword-based
+
+
+ ESG-FTSE: A Corpus of News Articles with ESG Relevance Labels and Use Cases
+ Mariya Pavlova
+ Bernard Casey
+ Miaosen Wang
+ 137–149
+ We present ESG-FTSE, the first corpus comprising news articles with Environmental, Social and Governance (ESG) relevance annotations. In recent years, investors and regulators have pushed ESG investing to the mainstream due to the urgency of climate change. This has led to the rise of ESG scores to evaluate an investment’s credentials as socially responsible. While demand for ESG scores is high, their quality varies wildly. Quantitative techniques can be applied to improve ESG scores and, thus, responsible investing. To contribute to resource building for ESG and financial text mining, we pioneer the ESG-FTSE corpus. We further present a first-of-its-kind ESG annotation schema. It has three levels: binary classification (relevant versus irrelevant news articles), ESG classification (ESG-related news articles), and target company. Both supervised and unsupervised learning experiments for ESG relevance detection were conducted to demonstrate that the corpus can be used in different settings to derive accurate ESG predictions.
+ 2024.finnlp-1.14
+ 2024.finnlp-1.14.OptionalSupplementaryMaterial.zip
+ pavlova-etal-2024-esg-ftse
+
+
+ BBRC: Brazilian Banking Regulation Corpora
+ Rafael Faria de Azevedo
+ Thiago Henrique Eduardo Muniz
+ Claudio Pimentel
+ Guilherme Jose de Assis Foureaux
+ Barbara Caldeira Macedo
+ Daniel de Lima Vasconcelos
+ 150–166
+ We present BBRC, a collection of 25 corpora on banking regulatory risk from different departments of Banco do Brasil (BB). These are individual corpora about investments, insurance, human resources, security, technology, treasury, loans, accounting, fraud, credit cards, payment methods, agribusiness, risks, etc. They were annotated in binary form by experts, indicating whether or not each regulatory document contains regulatory risk that may require changes to the products, processes, services, and channels of a bank department. The corpora, in Portuguese, contain documents from 26 Brazilian regulatory authorities in the financial sector. In total, there are 61,650 annotated documents, mostly between half a page and three pages long. The corpora belong to a Natural Language Processing (NLP) application that has been in production since 2020. In this work, we also performed binary classification benchmarks with some of the corpora. Experiments were carried out with different sampling techniques, and in one of them we sought to solve an intraclass imbalance problem present in each corpus of the collection. For the benchmarks, we used the following classifiers: Multinomial Naive Bayes, Random Forest, SVM, XGBoost, and BERTimbau (a version of BERT for Portuguese). The BBRC can be downloaded through a link in the article.
+ 2024.finnlp-1.15
+ faria-de-azevedo-etal-2024-bbrc-brazilian
+
+
+ Stock Price Prediction with Sentiment Analysis for Chinese Market
+ Yuchen Luan
+ Haiyang Zhang
+ Chenlei Zhang
+ Yida Mu
+ Wei Wang
+ 167–177
+ Accurate prediction of stock prices is considered a significant practical challenge and has been a longstanding topic of debate within the economic domain. In recent years, sentiment analysis on social media comments has been considered an important data source for stock prediction. However, most of these works focus on exploring stocks with high market values or from specific industries. The extent to which sentiments affect a broader range of stocks and their overall performance remains uncertain. In this paper, we study the influence of sentiment analysis on stock price prediction with respect to (1) different market value groups and (2) different Book-to-Market ratio groups in the Chinese stock market. To this end, we create a new dataset that consists of 24 stocks across different market value groups and Book-to-Market ratio categories, along with 12,000 associated comments that have been collected and manually annotated. We then utilized this dataset to train a variety of sentiment classifiers, which were subsequently integrated into sequential neural-based models for stock price prediction. Experimental findings indicate that while sentiment integration generally improves predictive performance for price prediction, it may not consistently lead to better results for individual stocks. Moreover, these outcomes are notably influenced by varying market values and Book-to-Market ratios, with stocks of higher market values and B/M ratios often exhibiting more accurate predictions. Among all the models tested, the Bi-LSTM model incorporating sentiment analysis achieves the best prediction performance.
+ 2024.finnlp-1.16
+ luan-etal-2024-stock-price
+
+
+ Topic Taxonomy Construction from ESG Reports
+ Saif Majdi AlNajjar
+ Xinyu Wang
+ Yulan He
+ 178–187
+ The surge in Environmental, Societal, and Governance (ESG) reports, essential for corporate transparency and modern investments, presents a challenge for investors due to their varying lengths and sheer volume. We present a novel methodology, called MultiTaxoGen, for creating topic taxonomies designed specifically for analysing ESG reports. Topic taxonomies serve to illustrate the topics covered in a corpus of ESG reports while also highlighting the hierarchical relationships between them. Unfortunately, current state-of-the-art approaches for constructing topic taxonomies are designed for more general datasets, resulting in ambiguous topics and the omission of many latent topics present in ESG-focused corpora. This makes them unsuitable for the specificity required by investors. Our method instead adapts topic modelling techniques by employing them recursively on each topic’s local neighbourhood, the subcorpus of documents assigned to that topic. This iterative approach allows us to identify child topics and offers a better understanding of topic hierarchies in a fine-grained paradigm. Our findings reveal that our method captures more latent topics in our ESG report corpus than the leading method and provides more coherent topics with comparable relational accuracy.
+ 2024.finnlp-1.17
+ alnajjar-etal-2024-topic-taxonomy
+
+
+ Duration Dynamics: Fin-Turbo’s Rapid Route to ESG Impact Insight
+ Weijie Yang
+ Xinyun Rong
+ 188–196
+ This study introduces “Duration Dynamics: Fin-Turbo’s Rapid Route to ESG Impact Insight”, an innovative approach employing advanced Natural Language Processing (NLP) techniques to assess the impact duration of ESG events on corporations. Leveraging a unique dataset comprising multilingual news articles, the research explores the utility of machine translation for language uniformity, text segmentation for contextual understanding, data augmentation for dataset balance, and an ensemble learning method integrating models like ESG-BERT, RoBERTa, DeBERTa, and Flan-T5 for nuanced analysis. Yielding excellent results, our research showcases the potential of using language models to improve ESG-oriented decision-making, contributing valuable insights to the FinNLP community.
+ 2024.finnlp-1.18
+ yang-rong-2024-duration-dynamics
+
+
+ Multilingual ESG News Impact Identification Using an Augmented Ensemble Approach
+ Harika Abburi
+ Ajay Kumar
+ Edward Bowen
+ Balaji Veeramani
+ 197–202
+ Determining the duration and length of a news event’s impact on a company’s performance remains elusive for financial analysts. The complexity arises from the fact that the effects of these news articles are influenced by various extraneous factors and can change over time. As a result, in this work, we investigate our ability to predict 1) the duration (length) of a news event’s impact, and 2) the level of impact on companies. The datasets used in this study are provided as part of the Multi-Lingual ESG Impact Duration Inference (ML-ESG-3) shared task. To handle the data scarcity, we explored data augmentation techniques to augment our training data. To address each of the research objectives stated above, we employ an ensemble approach combining a transformer model, a variant of Convolutional Neural Networks (CNNs) (specifically the KimCNN model), and contextual embeddings. The model’s performance is assessed across a multilingual dataset encompassing English, French, Japanese, and Korean news articles. For the first task of determining impact duration, our model ranked first, fifth, seventh, and eighth for Japanese, French, Korean, and English texts, respectively (with respective macro F1 scores of 0.256, 0.458, 0.552, 0.441). For the second task of assessing impact level, our model ranked sixth and eighth for French and English texts, respectively (with respective macro F1 scores of 0.488 and 0.550).
+ 2024.finnlp-1.19
+ abburi-etal-2024-multilingual-esg
+
+
+ Cheap Talk: Topic Analysis of CSR Themes on Corporate Twitter
+ Nile Phillips
+ Sathvika Anand
+ Michelle Lum
+ Manisha Goel
+ Michelle Zemel
+ Alexandra Schofield
+ 203–211
+ Numerous firms advertise action around corporate social responsibility (CSR) on social media. Using a Twitter corpus from S&P 500 companies and topic modeling, we investigate how companies talk about their social and sustainability efforts and whether CSR-related speech predicts Environmental, Social, and Governance (ESG) risk scores. As part of our work in progress, we present early findings suggesting a possible distinction in language between authentic discussion of positive practices and corporate posturing.
+ 2024.finnlp-1.20
+ phillips-etal-2024-cheap-talk
+
+
+ LLaMA-2-Econ: Enhancing Title Generation, Abstract Classification, and Academic Q&A in Economic Research
+ Onur Keles
+ Omer Turan Bayraklı
+ 212–218
+ Using Quantized Low Rank Adaptation and Parameter Efficient Fine Tuning, we fine-tuned Meta AI’s LLaMA-2-7B large language model as a research assistant in the field of economics for three different types of tasks: title generation, abstract classification, and question and answer. The model was fine-tuned on economics paper abstracts and synthetically created question-answer dialogues based on the abstracts. For title generation, the results of the experiment demonstrated that LLaMA-2-Econ (the fine-tuned model) surpassed the base model (7B and 13B) with few-shot learning, and comparable models of similar size like Mistral-7B and Bloom-7B, on the BLEU and ROUGE metrics. For abstract categorization, LLaMA-2-Econ outperformed different machine and deep learning algorithms in addition to state-of-the-art models like GPT 3.5 and GPT 4 with both single and representative few-shot learning. We tested the fine-tuned Q&A model by comparing its output with the base LLaMA-2-7B-chat with a Retrieval Augmented Generation (RAG) pipeline with semantic search and dense vector indexing, and found that LLaMA-2 performed on par with the base model with RAG.
+ 2024.finnlp-1.21
+ keles-bayrakli-2024-llama-2
+
+
+ Multi-Lingual ESG Impact Duration Inference
+ Chung-Chi Chen
+ Yu-Min Tseng
+ Juyeon Kang
+ Anais Lhuissier
+ Yohei Seki
+ Hanwool Lee
+ Min-Yuh Day
+ Teng-Tsai Tu
+ Hsin-Hsi Chen
+ 219–227
+ To accurately assess the dynamic impact of a company’s activities on its Environmental, Social, and Governance (ESG) scores, we have initiated a series of shared tasks, named ML-ESG. These tasks adhere to the MSCI guidelines for annotating news articles across various languages. This paper details the third iteration of our series, ML-ESG-3, with a focus on impact duration inference—a task that poses significant challenges in estimating the enduring influence of events, even for human analysts. In ML-ESG-3, we provide datasets in five languages (Chinese, English, French, Korean, and Japanese) and share insights from our experience in compiling such subjective datasets. Additionally, this paper reviews the methodologies proposed by ML-ESG-3 participants and offers a comparative analysis of the models’ performances. Concluding the paper, we introduce the concept for the forthcoming series of shared tasks, namely multi-lingual ESG promise verification, and discuss its potential contributions to the field.
+ 2024.finnlp-1.22
+ chen-etal-2024-multi-lingual
+
+
+ IMNTPU at ML-ESG-3: Transformer Language Models for Multi-Lingual ESG Impact Type and Duration Classification
+ Yu Han Kao
+ Vidhya Nataraj
+ Ting-Chi Wang
+ Yu-Jyun Zheng
+ Hsiao-Chuan Liu
+ Wen-Hsuan Liao
+ Chia-Tung Tsai
+ Min-Yuh Day
+ 228–233
+ Our team participated in the multi-lingual Environmental, Social, and Governance (ESG) classification task, focusing on datasets in three languages: English, French, and Japanese. This study leverages Pre-trained Language Models (PLMs), with a particular emphasis on the Bidirectional Encoder Representations from Transformers (BERT) framework, to analyze sentence and document structures across these varied linguistic datasets. The team’s experimentation with diverse PLM-based network designs facilitated a nuanced comparative analysis within this multi-lingual context. For each language-specific dataset, different BERT-based transformer models were trained and evaluated. Notably, in the experimental results, the RoBERTa-Base model emerged as the most effective in the official evaluation, particularly on the English dataset, achieving a micro-F1 score of 58.82 %, thereby demonstrating superior performance in classifying ESG impact levels. This research highlights the adaptability and effectiveness of PLMs in tackling the complexities of multi-lingual ESG classification tasks, underscoring the exceptional performance of the RoBERTa-Base model in processing English-language data.
+ 2024.finnlp-1.23
+ kao-etal-2024-imntpu-ml
+
+
+ DICE @ ML-ESG-3: ESG Impact Level and Duration Inference Using LLMs for Augmentation and Contrastive Learning
+ Konstantinos Bougiatiotis
+ Andreas Sideras
+ Elias Zavitsanos
+ Georgios Paliouras
+ 234–243
+ We present the submission of team DICE for ML-ESG-3, the 3rd Shared Task on Multilingual ESG impact duration inference, in the context of the joint FinNLP-KDF workshop series. The task provides news articles and seeks to determine the impact and duration that an event in a news article may have on a company. We experiment with various baselines and discuss the results of our best-performing submissions, based on contrastive pre-training and a stacked model based on the bag-of-words assumption and sentence embeddings. We also explored the label correlations among events stemming from the same news article and the correlations between impact level and impact length. Our analysis shows that even simple classifiers trained on this task can achieve performance comparable to that of more complex models, under certain conditions.
+ 2024.finnlp-1.24
+ bougiatiotis-etal-2024-dice-ml
+
+
+ Fine-tuning Language Models for Predicting the Impact of Events Associated to Financial News Articles
+ Neelabha Banerjee
+ Anubhav Sarkar
+ Swagata Chakraborty
+ Sohom Ghosh
+ Sudip Kumar Naskar
+ 244–247
+ Investors and other stakeholders, like consumers and employees, increasingly consider ESG factors when making decisions about investments or engaging with companies. Taking into account the importance of ESG today, FinNLP-KDF introduced the ML-ESG-3 shared task, which seeks to determine the duration of the impact of financial news articles in four languages - English, French, Korean, and Japanese. This paper describes our team LIPI’s approach towards solving the above-mentioned task. Our final systems consist of translation, paraphrasing and fine-tuning language models like BERT, Fin-BERT and RoBERTa for classification. We ranked first in the impact duration prediction subtask for the French language.
+ 2024.finnlp-1.25
+ banerjee-etal-2024-fine-tuning
+
+
+ CriticalMinds: Enhancing ML Models for ESG Impact Analysis Categorisation Using Linguistic Resources and Aspect-Based Sentiment Analysis
+ Iana Atanassova
+ Marine Potier
+ Maya Mathie
+ Marc Bertin
+ Panggih Kusuma Ningrum
+ 248–253
+ This paper presents our method and findings for the ML-ESG-3 shared task for categorising Environmental, Social, and Governance (ESG) impact level and duration. We introduce a comprehensive machine learning framework incorporating linguistic and semantic features to predict ESG impact levels and durations in English and French. Our methodology uses features that are derived from FastText embeddings, TF-IDF vectors, manually crafted linguistic resources, the ESG taxonomy, and aspect-based sentiment analysis (ABSA). We detail our approach, feature engineering process, model selection via grid search, and results. The best performance for this task was achieved by the Random Forest and XGBoost classifiers, with micro-F1 scores of 47.06 % and 65.44 % for English Impact level and Impact length, and 39.04 % and 54.79 % for French Impact level and Impact length respectively.
+ 2024.finnlp-1.26
+ atanassova-etal-2024-criticalminds-enhancing
+
+
+ Jetsons at FinNLP 2024: Towards Understanding the ESG Impact of a News Article Using Transformer-based Models
+ Parag Pravin Dakle
+ Alolika Gon
+ Sihan Zha
+ Liang Wang
+ Sai Krishna Rallabandi
+ Preethi Raghavan
+ 254–260
+ In this paper, we describe the different approaches explored by the Jetsons team for the Multi-Lingual ESG Impact Duration Inference (ML-ESG-3) shared task. The shared task focuses on predicting the duration and type of the ESG impact of a news article. The shared task dataset consists of 2,059 news titles and articles in English, French, Korean, and Japanese languages. For the impact duration classification task, we fine-tuned XLM-RoBERTa with a custom fine-tuning strategy and using self-training and DeBERTa-v3 using only English translations. These models individually ranked first on the leaderboard for Korean and Japanese and in an ensemble for the English language, respectively. For the impact type classification task, our XLM-RoBERTa model fine-tuned using a custom fine-tuning strategy ranked first for the English language.
+ 2024.finnlp-1.27
+ dakle-etal-2024-jetsons-finnlp
+
+
+ ESG Classification by Implicit Rule Learning via GPT-4
+ Yun Hyojeong
+ Kim Chanyoung
+ Moonjeong Hahm
+ Kyuri Kim
+ Guijin Son
+ 261–268
+ In this work, we adopt multiple prompting, chain-of-thought reasoning, and in-context learning strategies to guide GPT-4 in solving ESG classification tasks. We rank second in the Korean subset for Shared Task ML-ESG-3 in Impact Type prediction. Furthermore, we adopt open models to explain their calibration and robustness to different prompting strategies. We find that longer general pre-training correlates with enhanced performance in financial downstream tasks.
+ 2024.finnlp-1.28
+ hyojeong-etal-2024-esg-classification
+
+
+ Leveraging Semi-Supervised Learning on a Financial-Specialized Pre-trained Language Model for Multilingual ESG Impact Duration and Type Classification
+ Jungdae Kim
+ Eunkwang Jeon
+ Jeon Sang Hyun
+ 269–273
+ This paper presents the results of our participation in the Multilingual ESG Impact Duration Inference (ML-ESG-3) shared task organized by FinNLP-KDF@LREC-COLING-2024. The objective of this challenge is to leverage natural language processing (NLP) techniques to identify the impact duration or impact type of events that may affect a company based on news articles written in various languages. Our approach employs semi-supervised learning methods on a finance-specialized pre-trained language model. Our methodology demonstrates strong performance, achieving 1st place in the Korean - Impact Type subtask and 2nd place in the Korean - Impact Duration subtask. These results showcase the efficacy of our approach in detecting ESG-related issues from news articles. Our research shows the potential to improve existing ESG ratings by quickly reflecting the latest events of companies.
+ 2024.finnlp-1.29
+ kim-etal-2024-leveraging-semi
+
+
+ Adapting LLM to Multi-lingual ESG Impact and Length Prediction Using In-context Learning and Fine-Tuning with Rationale
+ Pawan Kumar Rajpoot
+ Ashvini Jindal
+ Ankur Parikh
+ 274–278
+ The prediction of Environmental, Social, and Governance (ESG) impact and duration (length) of impact from company events, as reported in news articles, hold immense significance for investors, policymakers, and various stakeholders. In this paper, we describe solutions from our team “Upaya” to ESG impact and length prediction tasks on one such dataset ML-ESG-3. ML-ESG-3 dataset was released along with shared task as a part of the Fifth Workshop on Knowledge Discovery from Unstructured Data in Financial Services, co-located with LREC-COLING 2024. We employed two different paradigms to adapt Large Language Models (LLMs) to predict both the ESG impact and length of events. In the first approach, we leverage GPT-4 within the In-context learning (ICL) framework. A learning-free dense retriever identifies top K-relevant In-context learning examples from the training data for a given test example. The second approach involves instruction-tuning Mistral (7B) LLM to predict impact and duration, supplemented with rationale generated using GPT-4. Our models secured second place in French tasks and achieved reasonable results (fifth and ninth rank) in English tasks. These results demonstrate the potential of different LLM-based paradigms for delivering valuable insights within the ESG investing landscape.
+ 2024.finnlp-1.30
+ rajpoot-etal-2024-adapting-llm
+
+
+ ESG-GPT: GPT4-Based Few-Shot Prompt Learning for Multi-lingual ESG News Text Classification
+ Ke Tian
+ Hua Chen
+ 279–282
+ Environmental, Social, and Governance (ESG) factors for company assessment have gained great attention from finance investors to identify companies’ risks and growth opportunities. ESG text data regarding a company, such as sustainability reports, media news text, and social media text, are important data sources for ESG analysis, like ESG factor classification. Recently, FinNLP has proposed several ESG-related tasks. One of the tasks is Multi-Lingual ESG Issue Identification 3 (ML-ESG-3), which is to determine the duration or level of the impact of an event in a news article regarding a company. In this paper, we discuss our team KaKa’s solution to this ML-ESG-3 task. We propose GPT4-based few-shot prompt learning to predict the impact level or duration of the impact of multi-lingual ESG news for a company. The experimental results demonstrate that GPT4-based few-shot prompt learning achieves good performance in the leaderboard quantitative evaluations of ML-ESG-3 tasks across different languages.
+ 2024.finnlp-1.31
+ tian-chen-2024-esg-gpt
+
+
+ Shared Task for Cross-lingual Classification of Corporate Social Responsibility (CSR) Themes and Topics
+ Yola Nayekoo
+ Sophia Katrenko
+ Veronique Hoste
+ Aaron Maladry
+ Els Lefever
+ 283–291
+ This paper provides an overview of the Shared Task for Cross-lingual Classification of CSR Themes and Topics. We framed the task as two separate sub-tasks: one cross-lingual multi-class CSR theme recognition task for English, French and simplified Chinese and one multi-label fine-grained classification task of CSR topics for Environment (ENV) and Labor and Human Rights (LAB) themes in English. The participants were provided with URLs and annotations for both tasks. Several teams downloaded the data, of which two teams submitted a system for both sub-tasks. In this overview paper, we discuss the set-up of the task and our main findings.
+ 2024.finnlp-1.32
+ nayekoo-etal-2024-shared-task
+
+
+ Advancing CSR Theme and Topic Classification: LLMs and Training Enhancement Insights
+ JensVan Nooten
+ AndriyKosar
+ 292–305
+ In this paper, we present our results of the classification of Corporate Social Responsibility (CSR) Themes and Topics shared task, which encompasses cross-lingual multi-class classification and monolingual multi-label classification. We examine the performance of multiple machine learning (ML) models, ranging from classical models to pre-trained large language models (LLMs), and assess the effectiveness of Data Augmentation (DA), Data Translation (DT), and Contrastive Learning (CL). We find that state-of-the-art generative LLMs in a zero-shot setup still fall behind on more complex classification tasks compared to fine-tuning local models with enhanced datasets and additional training objectives. Our work provides a wide array of comparisons and highlights the relevance of utilizing smaller language models for more complex classification tasks.
+ 2024.finnlp-1.33
+ van-nooten-kosar-2024-advancing-csr
+
+
+ Improving Cross-Lingual CSR Classification Using Pretrained Transformers with Variable Selection Networks and Data Augmentation
+ ShubhamSharma
+ HimanshuJanbandhu
+ AnkushChopra
+ 306–318
+ This paper describes our submission to the Cross-Lingual Classification of Corporate Social Responsibility (CSR) Themes and Topics shared task, aiming to identify themes and fine-grained topics present in news articles. Classifying news articles poses several challenges, including limited training data, noisy articles, and longer context length. In this paper, we explore the potential of using pretrained transformer models to classify news articles into CSR themes and fine-grained topics. We propose two different approaches for these tasks. For multi-class classification of CSR themes, we suggest using a pretrained multi-lingual encoder-based model like microsoft/mDeBERTa-v3-base, along with a variable selection network to classify the article into CSR themes. To identify all fine-grained topics in each article, we propose using a pretrained encoder-based model like Longformer, which offers a higher context length. We employ chunking-based inference to avoid information loss in inference and experimented with using different parts and manifestation of original article for training and inference.
+ 2024.finnlp-1.34
+ sharma-etal-2024-improving-cross
+
+
+
diff --git a/data/xml/2024.games.xml b/data/xml/2024.games.xml
new file mode 100644
index 0000000000..8ab55e3922
--- /dev/null
+++ b/data/xml/2024.games.xml
@@ -0,0 +1,146 @@
+
+
+
+
+ Proceedings of the 10th Workshop on Games and Natural Language Processing @ LREC-COLING 2024
+ ChrisMadge
+ JonChamberlain
+ KarenFort
+ UdoKruschwitz
+ StephanieLukin
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.games-1
+ games
+ ws
+
+
+ 2024.games-1.0
+ games-2024-games
+
+
+ “Actors Challenge”: Collecting Data to Study Prosodic Patterns and Their Mappings to Meanings Across Languages
+ Sia V.Sepanta
+ 1–5
+ In this paper we describe “Actors Challenge”: a web-based interactive game designed to collect massively multi-speaker, multi-lingual oral data on the connection between prosody and various aspects of meaning. Game participants take on the two roles of auditioners and casting directors. Auditioners are asked to record certain target phrases modulated according to the emotional or attitudinal profiles that correspond to contexts or stage cues given to them. They then switch roles and become Casting Directors. Now they have to listen to other participants’ recordings, guess the corresponding context/stage cue that the auditioner tried to convey, and evaluate how good the performance was. By having the players alternate between these two roles we obtain both data creation and data validation from the same set of participants. We expect that the final dataset of labeled recordings will be valuable for a range of applications: training multilingual Speech Emotion Recognition classifiers; discovering correlations and variations in prosodic patterns among unrelated languages; examining correlations between prosodic patterns and emotion recognizability; probing the possibility that some prosodic patterns are universal.
+ 2024.games-1.1
+ sepanta-2024-actors
+
+
+ Empowering Adaptive Digital Game-Based Language Learning for Under-Resourced Languages Through Text Analysis
+ ElaineUí Dhonnchadha
+ SallyBruen
+ LiangXu
+ MonicaWard
+ 6–13
+ This study explores Cipher, an adaptive language learning game tailored for the under-resourced Irish language, aimed mainly at primary school students. By integrating text analysis techniques, Cipher dynamically adjusts its difficulty based on the player’s language proficiency, offering a customised learning experience. The game’s narrative involves decoding spells to access Irish myths and stories, combining language learning with cultural elements. Development involved collaboration with educators to align the game content with curriculum standards and incorporate culturally relevant materials. This paper outlines the game’s development process, emphasising the use of text analysis for difficulty adjustment and the importance of engaging, educational gameplay. Preliminary results indicate that adaptive games like Cipher can enhance language learning by providing immersive, personalised experiences that maintain player motivation and engagement.
+ 2024.games-1.2
+ ui-dhonnchadha-etal-2024-empowering
+
+
+ Hostomytho: A GWAP for Synthetic Clinical Texts Evaluation and Annotation
+ NicolasHiebel
+ BertrandRemy
+ BrunoGuillaume
+ OlivierFerret
+ AurélieNévéol
+ KarenFort
+ 14–20
+ This paper presents the creation of Hostomytho, a game with a purpose intended for evaluating the quality of synthetic biomedical texts through multiple mini-games. Hostomytho was developed entirely using open source technologies both for internet browsers and mobile platforms (iOS & Android). The code and the annotations created for synthetic clinical cases in French will be made freely available.
+ 2024.games-1.3
+ hiebel-etal-2024-hostomytho
+
+
+ Using In-context Learning to Automate AI Image Generation for a Gamified Text Labelling Task
+ FatimaAlthani
+ ChrisMadge
+ MassimoPoesio
+ 21–31
+ This paper explores a novel automated method to produce AI-generated images for a text-labelling gamified task. By leveraging the in-context learning capabilities of GPT-4, we automate the optimisation of text-to-image prompts to align with the text being labelled in the part-of-speech tagging task. As an initial evaluation, we compare the optimised prompts to the original sentences based on imageability and concreteness scores. Our results revealed that optimised prompts had significantly higher imageability and concreteness scores. Moreover, to evaluate text-to-image outputs, we generate images using Stable Diffusion XL based on the two prompt types, optimised prompts and the original sentences. Using the automated LIAON-Aesthetic predictor model, we assigned aesthetic scores for the generated images. This resulted in the outputs using optimised prompts scoring significantly higher in predicted aesthetics than those using original sentences as prompts. Our preliminary findings suggest that this methodology provides significantly more aesthetic text-to-image outputs than using the original sentence as a prompt. While the initial results are promising, the text labelling task and AI-generated images presented in this paper have yet to undergo human evaluation.
+ 2024.games-1.4
+ althani-etal-2024-using
+
+
+ Aspect-based Sentiment Evaluation of Chess Moves (ASSESS): an NLP-based Method for Evaluating Chess Strategies from Textbooks
+ HaifaAlrdahi
+ RizaBatista-Navarro
+ 32–42
+ The chess domain is well-suited for creating an artificial intelligence (AI) system that mimics real-world challenges, including decision-making. Throughout the years, minimal attention has been paid to investigating insights derived from unstructured chess data sources. In this study, we examine the complicated relationships between multiple referenced moves in a chess-teaching textbook, and propose a novel method designed to encapsulate chess knowledge derived from move-action phrases. This study investigates the feasibility of using a modified sentiment analysis method as a means for evaluating chess moves based on text. Our proposed Aspect-Based Sentiment Analysis (ABSA) method represents an advancement in evaluating the sentiment associated with referenced chess moves. By extracting insights from move-action phrases, our approach aims to provide a more fine-grained and contextually aware ‘chess move’-based sentiment classification. Through empirical experiments and analysis, we evaluate the performance of our fine-tuned ABSA model, presenting results that confirm the efficiency of our approach in advancing aspect-based sentiment classification within the chess domain. This research contributes to the area of game-playing by machines and shows the practical applicability of leveraging NLP techniques to understand the context of strategic games. Keywords: Natural Language Processing, Chess, Aspect-based Sentiment Analysis (ABSA), Chess Move Evaluation.
+ 2024.games-1.5
+ alrdahi-batista-navarro-2024-aspect
+
+
+ Generating Converging Narratives for Games with Large Language Models
+ DouglasSummers-Stay
+ Clare R.Voss
+ 43–60
+ We explore methods of combining the probability distributions generated by two LLM prompts in order to generate a continuation that is appropriate for both prompts at once. This is a new capability that extends the possibilities for branching and rejoining narratives in games.
+ 2024.games-1.6
+ summers-stay-voss-2024-generating
+
+
+ Leveraging Large Language Models for Spell-Generation in Dungeons & Dragons
+ ElioMusacchio
+ LuciaSiciliani
+ PierpaoloBasile
+ GiovanniSemeraro
+ 61–69
+ Dungeons & Dragons (D&D) is a classic tabletop game with a 50-year history. Its intricate and customizable gameplay allows players to create endless worlds and stories. Due to the highly narrative component of this game, D&D and many other interactive games represent a challenging setting for the Natural Language Generation (NLG) capabilities of LLMs. This paper explores using LLMs to generate new spells, which are one of the most captivating aspects of D&D gameplay. Due to the scarcity of resources available for such a specific task, we build a dataset of 3,259 instances by combining official and fan-made D&D spells. We considered several LLMs in generating spells, which underwent a quantitative and qualitative evaluation. Metrics including Bleu and BertScore were computed for quantitative assessments. Subsequently, we also conducted an in-vivo evaluation with a survey involving D&D players, which could assess the quality of the generated spells as well as their adherence to the rules. Furthermore, the paper emphasizes the open-sourcing of all models, datasets, and findings, aiming to catalyze further research on this topic.
+ 2024.games-1.7
+ musacchio-etal-2024-leveraging
+
+
+ Branching Narratives: Character Decision Points Detection
+ AlexeyTikhonov
+ 70–75
+ This paper presents the Character Decision Points Detection (CHADPOD) task, a task of identification of points within narratives where characters make decisions that may significantly influence the story’s direction. We propose a novel dataset based on Choose Your Own Adventure (a registered trademark of Chooseco LLC) games graphs to be used as a benchmark for such a task. We provide a comparative analysis of different models’ performance on this task, including a couple of LLMs and several MLMs as baselines, achieving up to 89% accuracy. This underscores the complexity of narrative analysis, showing the challenges associated with understanding character-driven story dynamics. Additionally, we show how such a model can be applied to the existing text to produce linear segments divided by potential branching points, demonstrating the practical application of our findings in narrative analysis.
+ 2024.games-1.8
+ tikhonov-2024-branching
+
+
+ Utilizing GPT-4 to Solve TextWorld Commonsense Games Efficiently
+ BinggangZhuo
+ MasakiMurata
+ 76–84
+ Most artificial intelligence agents in interactive fiction games are implemented using reinforcement learning. Considering the recent rapid development of large language models, we propose an approach that utilizes a large language model to tackle interactive fiction game tasks. The chosen test dataset is TextWorld Commonsense, an interactive fiction game environment designed for artificial intelligence agents. In these games, the AI agent’s task is to organize rooms and place items in appropriate locations. To achieve a high score in the game, common sense knowledge about “which items belong to which locations” is important. Our approach is based on GPT-4 and a carefully designed prompt. Experimental results demonstrate that our approach outperforms prior research. Specifically, GPT-4 with feedback-augmented prompt successfully completed all tasks in both simple and medium level game environments without fine-tuning. In hard level game environments, our approach achieved a normalized score of 0.70, surpassing the best baseline score of 0.57.
+ 2024.games-1.9
+ zhuo-murata-2024-utilizing
+
+
+ Linguistic Acceptability and Usability Enhancement: A Case Study of GWAP Evaluation and Redesign
+ Wateen AbdullahAliady
+ MassimoPoesio
+ 85–96
+ Collecting high-quality annotations for Natural Language Processing (NLP) tasks poses challenges. Gamified annotation systems, like Games-with-a-Purpose (GWAP), have become popular tools for data annotation. For GWAPs to be effective, they must be user-friendly and produce high-quality annotations to ensure the collected data’s usefulness. This paper investigates the effectiveness of a gamified approach through two specific studies on an existing GWAP designed for collecting NLP coreference judgments. The first study involved preliminary usability testing using the concurrent think-aloud method to gather open-ended feedback. This feedback was crucial in pinpointing design issues. Following this, we conducted semi-structured interviews with our participants, and the insights collected from these interviews were instrumental in crafting player personas, which informed design improvements aimed at enhancing user experience. The outcomes of our research have been generalized to benefit other GWAP implementations. The second study evaluated the linguistic acceptability and reliability of the data collected through our GWAP. Our findings indicate that our GWAP produced reliable corpora with 91.49% accuracy and 0.787 Cohen’s kappa.
+ 2024.games-1.10
+ aliady-poesio-2024-linguistic
+
+
+ Riddle Me This: Evaluating Large Language Models in Solving Word-Based Games
+ RaffaeleManna
+ Maria Piadi Buono
+ JohannaMonti
+ 97–106
+ In this contribution, we examine the proficiency of Large Language Models (LLMs) in solving the linguistic game “La Ghigliottina,” the final game of the popular Italian TV quiz show “L’Eredità”. This game is particularly challenging as it requires LLMs to engage in semantic inference reasoning for identifying the solutions of the game. Our experiment draws inspiration from Ghigliottin-AI, a task of EVALITA 2020, an evaluation campaign focusing on Natural Language Processing (NLP) and speech tools designed for the Italian language. To benchmark our experiment, we use the results of the most successful artificial player in this task, namely Il Mago della Ghigliottina. The paper describes the experimental setting and the results which show that LLMs perform poorly.
+ 2024.games-1.11
+ manna-etal-2024-riddle
+
+
+ LLMs of Catan: Exploring Pragmatic Capabilities of Generative Chatbots Through Prediction and Classification of Dialogue Acts in Boardgames’ Multi-party Dialogues
+ AndreaMartinenghi
+ GregorDonabauer
+ SimonaAmenta
+ SathyaBursic
+ MathyasGiudici
+ UdoKruschwitz
+ FrancaGarzotto
+ DimitriOgnibene
+ 107–118
+ Human language interactions involve complex processes beyond pure information exchange, for example, actions aimed at influencing beliefs and behaviors within a communicative context. In this paper, we propose to investigate the dialogue understanding capabilities of large language models (LLMs), particularly in multi-party settings, where challenges like speaker identification and turn-taking are common. Through experiments on the game-based STAC dataset, we explore zero and few-shot learning approaches for dialogue act classification in a multi-party game setting. Our intuition is that LLMs may excel in tasks framed through examples rather than formal descriptions, influenced by a range of pragmatic features like information presentation order in prompts and others. We also explore the models’ predictive abilities regarding future dialogue acts and study integrating information on dialogue act sequences to improve predictions. Our findings suggest that ChatGPT can keep up with baseline models trained from scratch for classification of certain dialogue act types but also reveal biases and limitations associated with the approach. These insights can be valuable for the development of multi-party chatbots, and we try to point out directions for future research towards nuanced understanding and adaptation in diverse conversational contexts.
+ 2024.games-1.12
+ martinenghi-etal-2024-llms
+
+
+
diff --git a/data/xml/2024.htres.xml b/data/xml/2024.htres.xml
new file mode 100644
index 0000000000..7e1b352c99
--- /dev/null
+++ b/data/xml/2024.htres.xml
@@ -0,0 +1,112 @@
+
+
+
+
+ Proceedings of the First Workshop on Holocaust Testimonies as Language Resources (HTRes) @ LREC-COLING 2024
+ IsuriAnuradha
+ MartinWynne
+ FrancescaFrontini
+ AlistairPlum
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.htres-1
+ htres
+ ws
+
+
+ 2024.htres-1.0
+ htres-2024-holocaust
+
+
+ The Impact of Digital Editing on the Study of Holocaust Survivors’ Testimonies in the context of Voci dall’Inferno Project
+ Angelo MarioDel Grosso
+ MarinaRiccucci
+ ElviraMercatanti
+ 1–9
+ In Nazi concentration camps, approximately 20 million people perished. This included young and old, men and women, Jews, dissidents, and homosexuals. Only 10% of those deported survived. This paper introduces the “Voci dall’Inferno” project, which aims to achieve two key objectives: a) Create a comprehensive digital archive: by encoding a corpus of non-literary testimonies including both written and oral sources. b) Analyze the use of Dante’s language: by identifying the presence of Dante’s lexicon and allusions. Currently, the project holds 47 testimonies, with 29 transcribed in full text and 18 encoded using the XML-TEI format. This project is propelled by a multidisciplinary and educational context with experts in humanities and computer science. The project’s findings will be disseminated through a user-friendly web application built on an XML foundation. Though currently in its prototyping phase, the application boasts several features, including a search engine for testimonies, terms, or phrases within the corpus. Additionally, a browsing interface allows users to read and listen to the original testimonies, while a visualization tool enables deeper exploration of the corpus’s content. Adhering to the Text Encoding Initiative (TEI) guidelines, the project ensures a structured digital archive, aligned with the FAIR principles for data accessibility and reusability.
+ 2024.htres-1.1
+ del-grosso-etal-2024-impact
+
+
+ TEI Specifications for a Sustainable Management of Digitized Holocaust Testimonies
+ SarahBénière
+ FlorianeChiffoleau
+ LaurentRomary
+ 10–17
+ Data modeling and standardization are central issues in the field of Digital Humanities, and all the more so when dealing with Holocaust testimonies, where stable preservation and long-term accessibility are key. The EHRI Online Editions are composed of documents of diverse nature (testimonies, letters, diplomatic reports, etc.), held by EHRI’s partnering institutions, and selected, gathered thematically and encoded according to the TEI Guidelines by the editors within the EHRI Consortium. Standardization is essential in order to make sure that the editions are consistent with one another. The issue of consistency also encourages a broader reflection on the usage of standards when processing data, and on the standardization of digital scholarly editions of textual documents in general. In this paper, we present the normalization work we carried out on the EHRI Online Editions. It includes a customization of the TEI adapted to Holocaust-related documents, and a focus on the implementation of controlled vocabulary. We recommend the use of these encoding specifications as a tool for researchers and/or non-TEI experts to ensure their encoding is valid and consistent across editions, but also as a mechanism for integrating the edition work smoothly within a wider workflow leading from image digitization to publication.
+ 2024.htres-1.2
+ beniere-etal-2024-tei
+
+
+ Repurposing Holocaust-Related Digital Scholarly Editions to Develop Multilingual Domain-Specific Named Entity Recognition Tools
+ MariaDermentzi
+ HugoScheithauer
+ 18–28
+ The European Holocaust Research Infrastructure (EHRI) aims to support Holocaust research by making information about dispersed Holocaust material accessible and interconnected through its services. Creating a tool capable of detecting named entities in texts such as Holocaust testimonies or archival descriptions would make it easier to link more material with relevant identifiers in domain-specific controlled vocabularies, semantically enriching it, and making it more discoverable. With this paper, we release EHRI-NER, a multilingual dataset (Czech, German, English, French, Hungarian, Dutch, Polish, Slovak, Yiddish) for Named Entity Recognition (NER) in Holocaust-related texts. EHRI-NER is built by aggregating all the annotated documents in the EHRI Online Editions and converting them to a format suitable for training NER models. We leverage this dataset to fine-tune the multilingual Transformer-based language model XLM-RoBERTa (XLM-R) to determine whether a single model can be trained to recognize entities across different document types and languages. The results of our experiments show that despite our relatively small dataset, in a multilingual experiment setup, the overall F1 score achieved by XLM-R fine-tuned on multilingual annotations is 81.5%. We argue that this score is sufficiently high to consider the next steps towards deploying this model.
+ 2024.htres-1.3
+ dermentzi-scheithauer-2024-repurposing
+
+
+ Dates and places as points of attachment for memorial contents in the ISW corpus: 1938 as a turning point
+ CarolinaFlinz
+ SimonaLeonardi
+ 29–36
+ The aim of the paper is the identification and subsequent analysis of crisis years in the narrative biographical interviews with German-speaking Jews from the corpus ISW (Emigrantendeutsch in Israel: Wiener in Jerusalem / Migrant German in Israel: Viennese in Jerusalem); the possible “chronological landmarks” within a year will also be tackled, investigating how a certain year – 1938 – represents a turning point in the life story of the narrators, as it clusters most traumatic events linked to the Shoah. The transcripts were analysed using the tool Sketch Engine. An alternation of corpus-driven and corpus-based steps characterizes this study, which uses a quantitative-qualitative approach (see Lemnitzer and Zinsmeister, 2015) and also integrates approaches from narrative analysis. The research questions that guide our investigation are as follows: Are there any special dates that recur as chronological landmarks of crisis situations (Leonardi 2023a)? Which are they? Do they recur in connection with special places? Which ones?
+ 2024.htres-1.4
+ flinz-leonardi-2024-dates
+
+
+ Creating a Typology of Places to Annotate Holocaust Testimonies Through Machine Learning
+ ChristineLiu
+ William J.B.Mattingly
+ 37
+ The Holocaust was not only experienced in iconic places like Auschwitz or the Warsaw ghetto. Ordinary places, such as city streets, forests, hills, and homes, were transformed by occupation and systematic violence. While most of these places are unnamed and locationally ambiguous, their omnipresence throughout post-war testimonies from witnesses and survivors of the Holocaust emphasize their undeniable importance. This paper shares a methodology for developing a typology of places in order to annotate both named and unnamed places within interview transcripts from the United States Holocaust Memorial Museum (USHMM) through a machine learning model. The approach underscores the benefits of hybrid analysis through both automated extraction and manual review to create distinct categories of places. This paper also reviews how testimony transcripts were converted into structured data for annotation and previews ongoing work to design a search engine for users to dynamically query this place-based approach to studying the Holocaust.
+ 2024.htres-1.5
+ liu-mattingly-2024-creating
+
+
+ Speech Technology Services for Oral History Research
+ ChristophDraxler
+ Henkvan den Heuvel
+ Arjanvan Hessen
+ PavelIrcing
+ JanLehečka
+ 38–43
+ Oral history is about oral sources of witnesses and commentators on historical events. Speech technology is an important instrument to process such recordings in order to obtain transcriptions and further enhancements to structure the oral account. In this contribution we address the transcription portal and the webservices associated with speech processing at BAS, speech solutions developed at LINDAT, how to do it yourself with Whisper, remaining challenges, and future developments.
+ 2024.htres-1.6
+ draxler-etal-2024-speech
+
+
+ Identifying Narrative Patterns and Outliers in Holocaust Testimonies Using Topic Modeling
+ MaximIfergan
+ OmriAbend
+ RenanaKeydar
+ AmitPinchevski
+ 44–52
+ The vast collection of Holocaust survivor testimonies presents invaluable historical insights but poses challenges for manual analysis. This paper leverages advanced Natural Language Processing (NLP) techniques to explore the USC Shoah Foundation Holocaust testimony corpus. By treating testimonies as structured question-and-answer sections, we apply topic modeling to identify key themes. We experiment with BERTopic, which leverages recent advances in language modeling technology. We align testimony sections into fixed parts, revealing the evolution of topics across the corpus of testimonies. This highlights both a common narrative schema and divergences between subgroups based on age and gender. We introduce a novel method to identify testimonies within groups that exhibit atypical topic distributions resembling those of other groups. This study offers unique insights into the complex narratives of Holocaust survivors, demonstrating the power of NLP to illuminate historical discourse and identify potential deviations in survivor experiences.
+ 2024.htres-1.7
+ ifergan-etal-2024-identifying
+
+
+ Tracing the deportation to define Holocaust geometries. The exploratory case of Milan
+ Giovanni PietroVitali
+ LauraBrazzo
+ 53–62
+ This paper presents a pilot project conducted in collaboration with the Fondazione CDEC to shed light on the historical dynamics of the arrests and deportations of Jews from Italy to foreign concentration camps between 1943 and 1945. Led by a multidisciplinary team, including a Digital Humanities expert, an archivist, a GIS developer, and an education manager, the project aimed to rework archival information into data visualisation models utilising a subset of data from the CDEC LOD dataset of the victims of the Holocaust in Italy to construct detailed visual representations of deportation routes. Drawing inspiration from previous projects like the Atlas of Nazi-Fascist Massacres and research on Holocaust testimonies, this project sought to create interactive maps, network and graphs illustrating the paths of forced transfers endured by arrested Jews, particularly focusing on those born or arrested in Milan. Despite challenges such as incomplete or imprecise data, the team managed to reconstruct deportation routes and classify transport convoys, enhancing the understanding of this dark period in history. The visualisations, along with detailed repositories and links provided on GitHub, serve as valuable research tools for both scholarly and educational purposes, offering users varying levels of granularity to explore historical events and timelines. Through meticulous data analysis and visualisation techniques, this project contributes to ongoing efforts to preserve and understand the tragic events of the Holocaust, emphasizing the importance of archival work and interdisciplinary collaboration in historical research.
+ 2024.htres-1.8
+ vitali-brazzo-2024-tracing
+
+
+ Zero-shot Trajectory Mapping in Holocaust Testimonies
+ EitanWagner
+ RenanaKeydar
+ OmriAbend
+ 63–70
+ This work presents the task of Zero-shot Trajectory Mapping, which focuses on the spatial dimension of narratives. The task consists of two parts: (1) creating a “map” with all the locations mentioned in a set of texts, and (2) extracting a trajectory from a single testimony and positioning it within the map. Following recent advances in context length capabilities of large language models, we propose a pipeline for this task in a completely unsupervised manner, without the requirement of any type of labels. We demonstrate the pipeline on a set of ≈ 75 testimonies and present the resulting map and samples of the trajectory. We conclude that current long-range models succeed in generating meaningful maps and trajectories. Other than the visualization and indexing, we propose future directions for adaptation of the task as a step for dividing testimony sets into clusters and for alignment between parallel parts of different testimonies.
+ 2024.htres-1.9
+ wagner-etal-2024-zero
+
+
+
diff --git a/data/xml/2024.humeval.xml b/data/xml/2024.humeval.xml
new file mode 100644
index 0000000000..32dfdc5ee9
--- /dev/null
+++ b/data/xml/2024.humeval.xml
@@ -0,0 +1,298 @@
+
+
+
+
+ Proceedings of the Fourth Workshop on Human Evaluation of NLP Systems (HumEval) @ LREC-COLING 2024
+ SimoneBalloccu
+ AnyaBelz
+ RudaliHuidrom
+ EhudReiter
+ JoaoSedoc
+ CraigThomson
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.humeval-1
+ humeval
+ ws
+
+
+ 2024.humeval-1.0
+ humeval-2024-human
+
+
+ Quality and Quantity of Machine Translation References for Automatic Metrics
+ VilémZouhar
+ OndřejBojar
+ 1–11
+ Automatic machine translation metrics typically rely on human translations to determine the quality of system translations. Common wisdom in the field dictates that the human references should be of very high quality. However, there are no cost-benefit analyses that could be used to guide practitioners who plan to collect references for machine translation evaluation. We find that higher-quality references lead to better metric correlations with humans at the segment-level. Having up to 7 references per segment and taking their average (or maximum) helps all metrics. Interestingly, the references from vendors of different qualities can be mixed together and improve metric success. Higher quality references, however, cost more to create and we frame this as an optimization problem: given a specific budget, what references should be collected to maximize metric success. These findings can be used by evaluators of shared tasks when references need to be created under a certain budget.
+ 2024.humeval-1.1
+ zouhar-bojar-2024-quality
+
+
+ Exploratory Study on the Impact of English Bias of Generative Large Language Models in Dutch and French
+ AylaRigouts Terryn
+ Miryamde Lhoneux
+ 12–27
+ The most widely used LLMs like GPT4 and Llama 2 are trained on large amounts of data, mostly in English but are still able to deal with non-English languages. This English bias leads to lower performance in other languages, especially low-resource ones. This paper studies the linguistic quality of LLMs in two non-English high-resource languages: Dutch and French, with a focus on the influence of English. We first construct a comparable corpus of text generated by humans versus LLMs (GPT-4, Zephyr, and GEITje) in the news domain. We proceed to annotate linguistic issues in the LLM-generated texts, obtaining high inter-annotator agreement, and analyse these annotated issues. We find a substantial influence of English for all models under all conditions: on average, 16% of all annotations of linguistic errors or peculiarities had a clear link to English. Fine-tuning an LLM to a target language (GEITje is fine-tuned on Dutch) reduces the number of linguistic issues and probably also the influence of English. We further find that using a more elaborate prompt leads to linguistically better results than a concise prompt. Finally, increasing the temperature for one of the models leads to lower linguistic quality but does not alter the influence of English.
+ 2024.humeval-1.2
+ rigouts-terryn-de-lhoneux-2024-exploratory
+
+
+ Adding Argumentation into Human Evaluation of Long Document Abstractive Summarization: A Case Study on Legal Opinions
+ MohamedElaraby
+ HuihuiXu
+ MorganGray
+ KevinAshley
+ DianeLitman
+ 28–35
+ Human evaluation remains the gold standard for assessing abstractive summarization. However, current practices often prioritize constructing evaluation guidelines for fluency, coherence, and factual accuracy, overlooking other critical dimensions. In this paper, we investigate argument coverage in abstractive summarization by focusing on long legal opinions, where summaries must effectively encapsulate the document’s argumentative nature. We introduce a set of human-evaluation guidelines to evaluate generated summaries based on argumentative coverage. These guidelines enable us to assess three distinct summarization models, studying the influence of including argument roles in summarization. Furthermore, we utilize these evaluation scores to benchmark automatic summarization metrics against argument coverage, providing insights into the effectiveness of automated evaluation methods.
+ 2024.humeval-1.3
+ elaraby-etal-2024-adding
+
+
+ A Gold Standard with Silver Linings: Scaling Up Annotation for Distinguishing Bosnian, Croatian, Montenegrin and Serbian
+ AleksandraMiletić
+ FilipMiletić
+ 36–46
+ Bosnian, Croatian, Montenegrin and Serbian are the official standard linguistic varieties in Bosnia and Herzegovina, Croatia, Montenegro, and Serbia, respectively. When these four countries were part of the former Yugoslavia, the varieties were considered to share a single linguistic standard. After the individual countries were established, the national standards emerged. Today, a central question about these varieties remains the following: How different are they from each other? How hard is it to distinguish them? While this has been addressed in NLP as part of the task on Distinguishing Between Similar Languages (DSL), little is known about human performance, making it difficult to contextualize system results. We tackle this question by reannotating the existing BCMS dataset for DSL with annotators from all target regions. We release a new gold standard, replacing the original single-annotator, single-label annotation by a multi-annotator, multi-label one, thus improving annotation reliability and explicitly coding the existence of ambiguous instances. We reassess a previously proposed DSL system on the new gold standard and establish the human upper bound on the task. Finally, we identify sources of annotation difficulties and provide linguistic insights into the BCMS dialect continuum, with multiple indicators highlighting an intermediate position of Bosnian and Montenegrin.
+ 2024.humeval-1.4
+ miletic-miletic-2024-gold
+
+
+ Insights of a Usability Study for KBQA Interactive Semantic Parsing: Generation Yields Benefits over Templates but External Validity Remains Challenging
+ AshleyLewis
+ LingboMo
+ Marie-Catherinede Marneffe
+ HuanSun
+ MichaelWhite
+ 47–62
+ We present our findings from a usability study of an interactive semantic parsing system for knowledge based question answering (KBQA). The system is designed to help users access information within a knowledge base without having to know its query language. The system translates the user’s question into the query language, retrieves an answer, then presents an English explanation of the process so that the user can make corrections if necessary. To our knowledge, our work is the most thorough usability study conducted for such a system and the only one that uses crowdworkers as participants to verify that the system is usable for average users. Our crowdworkers participate in KBQA dialogues using 4 versions of a system based on the framework by Mo et al. (2022) and answer surveys about their experiences. Some key takeaways from this work are: 1) we provide evidence for the benefits of interactivity in semantic parsing with human users and using generated questions in lieu of templated representations, 2) we identify limitations of simulations and provide contrasting evidence from actual system use, and 3) we provide an examination of crowdsourcing methodology, in particular the trade-offs of using crowdworkers vs. a specially trained group of evaluators.
+ 2024.humeval-1.5
+ lewis-etal-2024-insights
+
+
+ Extrinsic evaluation of question generation methods with user journey logs
+ ElieAntoine
+ EléonoreBesnehard
+ FredericBechet
+ GeraldineDamnati
+ EricKergosien
+ ArnaudLaborderie
+ 63–70
+ There is often a significant disparity between the performance of Natural Language Processing (NLP) tools as evaluated on benchmark datasets using metrics like ROUGE or BLEU, and the actual user experience encountered when employing these tools in real-world scenarios. This highlights the critical necessity for user-oriented studies aimed at evaluating user experience concerning the effectiveness of developed methodologies. A primary challenge in such “ecological” user studies is their assessment of specific configurations of NLP tools, making replication under identical conditions impractical. Consequently, their utility is limited for the automated evaluation and comparison of different configurations of the same tool. The objective of this study is to conduct an “ecological” evaluation of a question generation method within the context of an external task involving document linking. To do this, we conducted an “ecological” evaluation of a document linking tool in the context of the exploration of a Social Science archive, and from this evaluation, we aim to derive a form of a “reference corpus” that can be used offline for the automated comparison of models and quantitative tool assessment. This corpus is available at the following link: https://gitlab.lis-lab.fr/archival-public/autogestion-qa-linking
+ 2024.humeval-1.6
+ antoine-etal-2024-extrinsic
+
+
+ Towards Holistic Human Evaluation of Automatic Text Simplification
+ LuisaCarrer
+ AndreasSäuberli
+ MartinKappus
+ SarahEbling
+ 71–80
+ Text simplification refers to the process of rewording within a single language, moving from a standard form into an easy-to-understand one. Easy Language and Plain Language are two examples of simplified varieties aimed at improving readability and understanding for a wide-ranging audience. Human evaluation of automatic text simplification is usually done by employing experts or crowdworkers to rate the generated texts. However, this approach does not include the target readers of simplified texts and does not reflect actual comprehensibility. In this paper, we explore different ways of measuring the quality of automatically simplified texts. We conducted a multi-faceted evaluation study involving end users, post-editors, and Easy Language experts and applied a variety of qualitative and quantitative methods. We found differences in the perception and actual comprehension of the texts by different user groups. In addition, qualitative surveys and behavioral observations proved to be essential in interpreting the results.
+ 2024.humeval-1.7
+ carrer-etal-2024-towards
+
+
+ Decoding the Metrics Maze: Navigating the Landscape of Conversational Question Answering System Evaluation in Procedural Tasks
+ AlexanderFrummet
+ DavidElsweiler
+ 81–90
+ Conversational systems are widely used for various tasks, from answering general questions to domain-specific procedural tasks, such as cooking. While the effectiveness of metrics for evaluating general question answering (QA) tasks has been extensively studied, the evaluation of procedural QA remains a challenge as we do not know what answer types users prefer in such tasks. Existing studies on metrics evaluation often focus on general QA tasks and typically limit assessments to one answer type, such as short, SQuAD-like responses or longer passages. This research aims to achieve two objectives. Firstly, it seeks to identify the desired traits of conversational QA systems in procedural tasks, particularly in the context of cooking (RQ1). Second, it assesses how commonly used conversational QA metrics align with these traits and perform across various categories of correct and incorrect answers (RQ2). Our findings reveal that users generally favour concise conversational responses, except in time-sensitive scenarios where brief, clear answers hold more value (e.g. when heating in oil). While metrics effectively identify inaccuracies in short responses, several commonly employed metrics tend to assign higher scores to incorrect conversational answers when compared to correct ones. We provide a selection of metrics that reliably detect correct and incorrect information in short and conversational answers.
+ 2024.humeval-1.8
+ frummet-elsweiler-2024-decoding
+
+
+ The 2024 ReproNLP Shared Task on Reproducibility of Evaluations in NLP: Overview and Results
+ AnyaBelz
+ CraigThomson
+ 91–105
+ This paper presents an overview of, and the results from, the 2024 Shared Task on Reproducibility of Evaluations in NLP (ReproNLP’24), following on from three previous shared tasks on reproducibility of evaluations in NLP, ReproNLP’23, ReproGen’22 and ReproGen’21. This shared task series forms part of an ongoing research programme designed to develop theory and practice of reproducibility assessment in NLP and machine learning, against a backdrop of increasing recognition of the importance of reproducibility across the two fields. We describe the ReproNLP’24 shared task, summarise results from the reproduction studies submitted, and provide additional comparative analysis of their results.
+ 2024.humeval-1.9
+ belz-thomson-2024-2024
+
+
+ Once Upon a Replication: It is Humans’ Turn to Evaluate AI’s Understanding of Children’s Stories for QA Generation
+ Andra-MariaFlorescu
+ MariusMicluta-Campeanu
+ Liviu P.Dinu
+ 106–113
+ The following paper presents the outcomes of a collaborative experiment on human evaluation from the ReproNLP 2024 shared task, track B, part of the ReproHum project. For this paper, we evaluated a QAG (question-answer generation) system centered on English children’s storybooks that was presented in previous research, by using human evaluators for the study. The system generated relevant QA (Question-Answer) pairs based on a dataset with storybooks for early education (kindergarten up to middle school) called FairytaleQA. In the framework of the ReproHum project, we first outline the previous paper and the reproduction strategy that has been decided upon. The complete setup of the first human evaluation is then described, along with the modifications required to replicate it. We also add other relevant related works on this subject. In conclusion, we juxtapose the replication outcomes with those documented in the cited publication. Additionally, we explore the general features of this endeavor as well as its shortcomings.
+ 2024.humeval-1.10
+ 2024.humeval-1.10.OptionalSupplementaryMaterial.zip
+ florescu-etal-2024-upon
+
+
+ Exploring Reproducibility of Human-Labelled Data for Code-Mixed Sentiment Analysis
+ SachinSasidharan Nair
+ TanviDinkar
+ GavinAbercrombie
+ 114–124
+ Growing awareness of a ‘Reproducibility Crisis’ in natural language processing (NLP) has focused on human evaluations of generative systems. While labelling for supervised classification tasks makes up a large part of human input to systems, the reproduction of such efforts has thus far not been explored. In this paper, we re-implement a human data collection study for sentiment analysis of code-mixed Malayalam movie reviews, as well as automated classification experiments. We find that missing and under-specified information makes reproduction challenging, and we observe potentially consequential differences between the original labels and those we collect. Classification results indicate that the reliability of the labels is important for stable performance.
+ 2024.humeval-1.11
+ sasidharan-nair-etal-2024-exploring
+
+
+ Reproducing the Metric-Based Evaluation of a Set of Controllable Text Generation Techniques
+ MichelaLorandi
+ AnyaBelz
+ 125–131
+ Rerunning a metric-based evaluation should be more straightforward and results should be closer than in a human-based evaluation, especially where code and model checkpoints are made available by the original authors. As this brief report of our efforts to rerun a metric-based evaluation of a set of multi-aspect controllable text generation (CTG) techniques shows, however, such reruns of evaluations do not always produce results that are the same as the original results, and can reveal errors in the original work.
+ 2024.humeval-1.12
+ lorandi-belz-2024-reproducing
+
+
+ ReproHum: #0033-03: How Reproducible Are Fluency Ratings of Generated Text? A Reproduction of August et al. 2022
+ Emielvan Miltenburg
+ AnouckBraggaar
+ NadineBraun
+ MartijnGoudbeek
+ EmielKrahmer
+ Chrisvan der Lee
+ SteffenPauws
+ FrédéricTomas
+ 132–144
+ In earlier work, August et al. (2022) evaluated three different Natural Language Generation systems on their ability to generate fluent, relevant, and factual scientific definitions. As part of the ReproHum project (Belz et al., 2023), we carried out a partial reproduction study of their human evaluation procedure, focusing on human fluency ratings. Following the standardised ReproHum procedure, our reproduction study follows the original study as closely as possible, with two raters providing 300 ratings each. In addition to this, we carried out a second study where we collected ratings from eight additional raters and analysed the variability of the ratings. We successfully reproduced the inferential statistics from the original study (i.e. the same hypotheses were supported), albeit with a lower inter-annotator agreement. The remainder of our paper shows significant variation between different raters, raising questions about what it really means to reproduce human evaluation studies.
+ 2024.humeval-1.13
+ van-miltenburg-etal-2024-reprohum
+
+
+ ReproHum #0927-03: DExpert Evaluation? Reproducing Human Judgements of the Fluency of Generated Text
+ TanviDinkar
+ GavinAbercrombie
+ VerenaRieser
+ 145–152
+ ReproHum is a large multi-institution project designed to examine the reproducibility of human evaluations of natural language processing. As part of the second phase of the project, we attempt to reproduce an evaluation of the fluency of continuations generated by a pre-trained language model compared to a range of baselines. Working within the constraints of the project, with limited information about the original study, and without access to their participant pool, or the responses of individual participants, we find that we are not able to reproduce the original results. Our participants display a greater tendency to prefer one of the system responses, avoiding a judgement of ‘equal fluency’ more than in the original study. We also conduct further evaluations: we elicit ratings from (1) a broader range of participants; (2) from the same participants at different times; and (3) with an altered definition of fluency. Results of these experiments suggest that the original evaluation collected too few ratings, and that the task formulation may be quite ambiguous. Overall, although we were able to conduct a re-evaluation study, we conclude that the original evaluation was not comprehensive enough to make truly meaningful comparisons.
+ 2024.humeval-1.14
+ dinkar-etal-2024-reprohum
+
+
+ ReproHum #0927-3: Reproducing The Human Evaluation Of The DExperts Controlled Text Generation Method
+ JavierGonzález Corbelle
+ AinhoaVivel Couso
+ Jose MariaAlonso-Moral
+ AlbertoBugarín-Diz
+ 153–162
+ This paper presents a reproduction study aimed at reproducing and validating a human NLP evaluation performed for the DExperts text generation method. The original study introduces DExperts, a controlled text generation method, evaluated using non-toxic prompts from the RealToxicityPrompts dataset. Our reproduction study aims to reproduce the human evaluation of the continuations generated by DExperts in comparison with four baseline methods, in terms of toxicity, topicality, and fluency. We first describe the agreed approach for reproduction within the ReproHum project and detail the configuration of the original evaluation, including necessary adaptations for reproduction. Then, we make a comparison of our reproduction results with those reported in the reproduced paper. Interestingly, we observe how the human evaluators in our experiment appreciate higher quality in the texts generated by DExperts in terms of less toxicity and better fluency. All in all, new scores are higher, also for the baseline methods. This study contributes to ongoing efforts in ensuring the reproducibility and reliability of findings in NLP evaluation and emphasizes the critical role of robust methodologies in advancing the field.
+ 2024.humeval-1.15
+ 2024.humeval-1.15.OptionalSupplementaryMaterial.zip
+ gonzalez-corbelle-etal-2024-reprohum
+
+
+ ReproHum #1018-09: Reproducing Human Evaluations of Redundancy Errors in Data-To-Text Systems
+ FilipKlubička
+ John D.Kelleher
+ 163–198
+ This paper describes a reproduction of a human evaluation study evaluating redundancies generated in automatically generated text from a data-to-text system. While the scope of the original study is broader, a human evaluation—a manual error analysis—is included as part of the system evaluation. We attempt a reproduction of this human evaluation, however while the authors annotate multiple properties of the generated text, we focus exclusively on a single quality criterion, that of redundancy. In focusing our study on a single minimal reproducible experimental unit, with the experiment being fairly straightforward and all data made available by the authors, we encountered no challenges with our reproduction and were able to reproduce the trend found in the original experiment. However, while still confirming the general trend, we found that both our annotators identified twice as many errors in the dataset than the original authors.
+ 2024.humeval-1.16
+ klubicka-kelleher-2024-reprohum
+
+
+ ReproHum#0043: Human Evaluation Reproducing Language Model as an Annotator: Exploring Dialogue Summarization on AMI Dataset
+ VivianFresen
+ Mei-ShinWu-Urbanek
+ SteffenEger
+ 199–209
+ This study, conducted as part of the ReproHum project, aimed to replicate and evaluate the experiment presented in “Language Model as an Annotator: Exploring DialoGPT for Dialogue Summarization” by Feng et al. (2021). By employing DialoGPT, BART, and PGN models, the study assessed dialogue summarization’s informativeness. Based on the ReproHum project’s baselines, we conducted a human evaluation for the AMI dataset, aiming to compare the results of the original study with our own experiments. Our objective is to contribute to the research on human evaluation and the reproducibility of the original study’s findings in the field of Natural Language Processing (NLP). Through this endeavor, we seek to enhance understanding and establish reliable benchmarks in human evaluation methodologies within the NLP domain.
+ 2024.humeval-1.17
+ fresen-etal-2024-reprohum
+
+
+ ReproHum #0712-01: Human Evaluation Reproduction Report for “Hierarchical Sketch Induction for Paraphrase Generation”
+ MohammadArvan
+ NatalieParde
+ 210–220
+ Human evaluations are indispensable in the development of NLP systems because they provide direct insights into how effectively these systems meet real-world needs and expectations. Ensuring the reproducibility of these evaluations is vital for maintaining credibility in natural language processing research. This paper presents our reproduction of the human evaluation experiments conducted by Hosking et al. (2022) for their paraphrase generation approach. Through careful replication we found that our results closely align with those in the original study, indicating a high degree of reproducibility.
+ 2024.humeval-1.18
+ arvan-parde-2024-reprohum
+
+
+ ReproHum #0712-01: Reproducing Human Evaluation of Meaning Preservation in Paraphrase Generation
+ Lewis N.Watson
+ DimitraGkatzia
+ 221–228
+ Reproducibility is a cornerstone of scientific research, ensuring the reliability and generalisability of findings. The ReproNLP Shared Task on Reproducibility of Evaluations in NLP aims to assess the reproducibility of human evaluation studies. This paper presents a reproduction study of the human evaluation experiment presented in “Hierarchical Sketch Induction for Paraphrase Generation” by Hosking et al. (2022). The original study employed a human evaluation on Amazon Mechanical Turk, assessing the quality of paraphrases generated by their proposed model using three criteria: meaning preservation, fluency, and dissimilarity. In our reproduction study, we focus on the meaning preservation criterion and utilise the Prolific platform for participant recruitment, following the ReproNLP challenge’s common approach to reproduction. We discuss the methodology, results, and implications of our reproduction study, comparing them to the original findings. Our findings contribute to the understanding of reproducibility in NLP research and highlight the potential impact of platform changes and evaluation criteria on the reproducibility of human evaluation studies.
+ 2024.humeval-1.19
+ 2024.humeval-1.19.OptionalSupplementaryMaterial.zip
+ watson-gkatzia-2024-reprohum
+
+
+ ReproHum #0043-4: Evaluating Summarization Models: investigating the impact of education and language proficiency on reproducibility
+ MateuszLango
+ PatriciaSchmidtova
+ SimoneBalloccu
+ OndrejDusek
+ 229–237
+ In this paper, we describe several reproductions of a human evaluation experiment measuring the quality of automatic dialogue summarization (Feng et al., 2021). We investigate the impact of the annotators’ highest level of education, field of study, and native language on the evaluation of the informativeness of the summary. We find that the evaluation is relatively consistent regardless of these factors, but the biggest impact seems to be a prior specific background in natural language processing (as opposed to, e.g. a background in computer science). We also find that the experiment setup (asking for single vs. multiple criteria) may have an impact on the results.
+ 2024.humeval-1.20
+ 2024.humeval-1.20.OptionalSupplementaryMaterial.zip
+ lango-etal-2024-reprohum
+
+
+ ReproHum #0033-3: Comparable Relative Results with Lower Absolute Values in a Reproduction Study
+ YiruLi
+ HuiyuanLai
+ AntonioToral
+ MalvinaNissim
+ 238–249
+ In the context of the ReproHum project aimed at assessing the reliability of human evaluation, we replicated the human evaluation conducted in “Generating Scientific Definitions with Controllable Complexity” by August et al. (2022). Specifically, humans were asked to assess the fluency of automatically generated scientific definitions by three different models, with output complexity varying according to target audience. Evaluation conditions were kept as close as possible to the original study, except of necessary and minor adjustments. Our results, despite yielding lower absolute performance, show that relative performance across the three tested systems remains comparable to what was observed in the original paper. On the basis of lower inter-annotator agreement and feedback received from annotators in our experiment, we also observe that the ambiguity of the concept being evaluated may play a substantial role in human assessment.
+ 2024.humeval-1.21
+ 2024.humeval-1.21.OptionalSupplementaryMaterial.zip
+ li-etal-2024-reprohum
+
+
+ ReproHum #0124-03: Reproducing Human Evaluations of end-to-end approaches for Referring Expression Generation
+ SaadMahamood
+ 250–254
+ In this paper we describe our attempt to reproduce a single human evaluation quality criterion of the human evaluation that was conducted in the paper “NeuralREG: An end-to-end approach to referring expression generation”. In particular, this paper describes the approach and challenges involved in reproducing the human evaluation as done by the original authors of the paper, the results obtained, and what insights we have gained from attempting this particular reproduction. These are insights that we hope will enable refinements to both how human evaluations are documented by author(s) and enable better reproductions of NLP experiments in the future.
+ 2024.humeval-1.22
+ mahamood-2024-reprohum
+
+
+ ReproHum #0087-01: Human Evaluation Reproduction Report for Generating Fact Checking Explanations
+ TylerLoakman
+ ChenghuaLin
+ 255–260
+ This paper describes a partial reproduction of the work titled “Generating Fact Checking Explanations” by Atanasova et al. (2020) as part of the ReproHum element within the ReproNLP shared task, aimed at reproducing findings in NLP research related to human evaluation. The task investigates whether NLP research is becoming more or less reproducible over time. Following instructions from the task organizers and the original authors, we gathered relative rankings for three fact-checking explanations (including a gold standard and outputs from two models) for 40 inputs based on the criterion of Coverage. Our reproduction and reanalysis of the original study’s raw results support the initial findings, showing similar patterns between the original work and our reproduction. Though we observed slight variations from the original results, our findings align with the main conclusions drawn by the original authors regarding the effectiveness of their proposed models.
+ 2024.humeval-1.23
+ 2024.humeval-1.23.OptionalSupplementaryMaterial.zip
+ loakman-lin-2024-reprohum
+
+
+ ReproHum #0892-01: The painful route to consistent results: A reproduction study of human evaluation in NLG
+ IreneMondella
+ HuiyuanLai
+ MalvinaNissim
+ 261–268
+ In spite of the core role human judgement plays in evaluating the performance of NLP systems, the way human assessments are elicited in NLP experiments, and to some extent the nature of human judgement itself, pose challenges to the reliability and validity of human evaluation. In the context of the larger ReproHum project, aimed at running large scale multi-lab reproductions of human judgement, we replicated the understandability assessment by humans on several generated outputs of simplified text described in the paper “Neural Text Simplification of Clinical Letters with a Domain Specific Phrase Table” by Shardlow and Nawaz, which appeared in the Proceedings of ACL 2019. Although we had to implement a series of modifications compared to the original study, which were necessary to run our human evaluation on exactly the same data, we managed to collect assessments and compare results with the original study. We obtained results consistent with those of the reference study, confirming their findings. The paper is complete with as much information as possible to foster and facilitate future reproduction.
+ 2024.humeval-1.24
+ mondella-etal-2024-reprohum
+
+
+ ReproHum #0087-01: A Reproduction Study of the Human Evaluation of the Coverage of Fact Checking Explanations
+ MingqiGao
+ JieRuan
+ XiaojunWan
+ 269–273
+ We present a reproduction study of the human evaluation of the coverage of fact checking explanations conducted by Atanasova et al. (2020), as a team in Track B of ReproNLP 2024. The setup of our reproduction study is almost the same as the original study, with some necessary modifications to the evaluation guideline and annotation interface. Our reproduction achieves a higher IAA of 0.20 compared to the original study’s 0.12, but discovers a mismatch between the IAA calculated by us with the raw annotation in the original study and the IAA reported in the original paper. Additionally, our reproduction results on the ranks of three types of explanations are drastically different from the original experiment, rendering that one important conclusion in the original paper cannot be confirmed at all. The case study illustrates that the annotators in the reproduction study may understand the quality criterion differently from the annotators in the original study.
+ 2024.humeval-1.25
+ 2024.humeval-1.25.OptionalSupplementaryMaterial.zip
+ gao-etal-2024-reprohum
+
+
+ ReproHum #0866-04: Another Evaluation of Readers’ Reactions to News Headlines
+ ZolaMahlaza
+ Toky HajatianaRaboanary
+ KyleSeakgwa
+ C. MariaKeet
+ 274–280
+ The reproduction of Natural Language Processing (NLP) studies is important in establishing their reliability. Nonetheless, many papers in NLP have never been reproduced. This paper presents a reproduction of Gabriel et al. (2022)’s work to establish the extent to which their findings, pertaining to the utility of large language models (T5 and GPT2) to automatically generate writer’s intents when given headlines to curb misinformation, can be confirmed. Our results show no evidence to support two of their four findings and they partially support the rest of the original findings. Specifically, while we confirmed that all the models are judged to be capable of influencing readers’ trust or distrust, there was a difference in T5’s capability to reduce trust. Our results show that its generations are more likely to have greater influence in reducing trust while Gabriel et al. (2022) found more cases where they had no impact at all. In addition, most of the model generations are considered socially acceptable only if we relax the criteria for determining a majority to mean more than chance rather than the apparent > 70% of the original study. Overall, while they found that “machine-generated MRF implications alongside news headlines to readers can increase their trust in real news while decreasing their trust in misinformation”, we found that they are more likely to decrease trust in both cases vs. having no impact at all.
+ 2024.humeval-1.26
+ 2024.humeval-1.26.OptionalSupplementaryMaterial.zip
+ mahlaza-etal-2024-reprohum
+
+
+
diff --git a/data/xml/2024.isa.xml b/data/xml/2024.isa.xml
new file mode 100644
index 0000000000..2f7ee4201d
--- /dev/null
+++ b/data/xml/2024.isa.xml
@@ -0,0 +1,203 @@
+
+
+
+
+ Proceedings of the 20th Joint ACL - ISO Workshop on Interoperable Semantic Annotation @ LREC-COLING 2024
+ HarryBunt
+ NancyIde
+ KiyongLee
+ VolhaPetukhova
+ JamesPustejovsky
+ LaurentRomary
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.isa-1
+ isa
+ ws
+
+
+ 2024.isa-1.0
+ isa-2024-joint
+
+
+ The MEET Corpus: Collocated, Distant and Hybrid Three-party Meetings with a Ranking Task
+ Ghazaleh Esfandiari-Baiat
+ Jens Edlund
+ 1–7
+ We introduce the MEET corpus. The corpus was collected with the aim of systematically studying the effects of collocated (physical), remote (digital) and hybrid work meetings on collaborative decision-making. It consists of 10 sessions, where each session contains three recordings: a collocated, a remote and a hybrid meeting between three participants. The participants are working on a different survival ranking task during each meeting. The duration of each meeting ranges from 10 to 18 minutes, resulting in 380 minutes of conversation altogether. We also present the annotation scheme designed specifically to target our research questions. The recordings are currently being transcribed and annotated in accordance with this scheme.
+ 2024.isa-1.1
+ esfandiari-baiat-edlund-2024-meet
+
+
+ MSNER: A Multilingual Speech Dataset for Named Entity Recognition
+ Quentin Meeus
+ Marie-Francine Moens
+ Hugo Van hamme
+ 8–16
+ While extensively explored in text-based tasks, Named Entity Recognition (NER) remains largely neglected in spoken language understanding. Existing resources are limited to a single, English-only dataset. This paper addresses this gap by introducing MSNER, a freely available, multilingual speech corpus annotated with named entities. It provides annotations to the VoxPopuli dataset in four languages (Dutch, French, German, and Spanish). We also release an efficient annotation tool that leverages automatic pre-annotations for faster manual refinement. This results in 590 and 15 hours of silver-annotated speech for training and validation, alongside a 17-hour, manually-annotated evaluation set. We further provide an analysis comparing silver and gold annotations. Finally, we present baseline NER models to stimulate further research on this newly available dataset.
+ 2024.isa-1.2
+ meeus-etal-2024-msner
+
+
+ Attitudes in Diplomatic Speeches: Introducing the CoDipA UNSC 1.0
+ Mariia Anisimova
+ Šárka Zikánová
+ 17–26
+ This paper presents CoDipA UNSC 1.0, a Corpus of Diplomatic Attitudes of the United Nations Security Council annotated with the attitude-part of the Appraisal theory. The speeches were manually selected according to topic-related and temporal criteria. The texts were then annotated according to the predefined annotation scenario. The distinguishing features of the diplomatic texts require a modified approach to attitude evaluation, which was implemented and presented in the current work. The corpus analysis has proven diplomatic speeches to be consistently evaluative, offered an overview of the most prominent means of expressing subjectivity in the corpus, and provided the results of the inter-annotator agreement evaluation.
+ 2024.isa-1.3
+ anisimova-zikanova-2024-attitudes
+
+
+ Automatic Alignment of Discourse Relations of Different Discourse Annotation Frameworks
+ Yingxue Fu
+ 27–38
+ Existing discourse corpora are annotated based on different frameworks, which show significant dissimilarities in definitions of arguments and relations and structural constraints. Despite surface differences, these frameworks share basic understandings of discourse relations. The relationship between these frameworks has been an open research question, especially the correlation between relation inventories utilized in different frameworks. Better understanding of this question is helpful for integrating discourse theories and enabling interoperability of discourse corpora annotated under different frameworks. However, studies that explore correlations between discourse relation inventories are hindered by different criteria of discourse segmentation, and expert knowledge and manual examination are typically needed. Some semi-automatic methods have been proposed, but they rely on corpora annotated in multiple frameworks in parallel. In this paper, we introduce a fully automatic approach to address the challenges. Specifically, we extend the label-anchored contrastive learning method introduced by Zhang et al. (2022b) to learn label embeddings during discourse relation classification. These embeddings are then utilized to map discourse relations from different frameworks. We show experimental results on RST-DT (Carlson et al., 2001) and PDTB 3.0 (Prasad et al., 2018).
+ 2024.isa-1.4
+ fu-2024-automatic
+
+
+ A New Annotation Scheme for the Semantics of Taste
+ Teresa Paccosi
+ Sara Tonelli
+ 39–46
+ This paper introduces a new annotation scheme for the semantics of gustatory language in English, which builds upon a previous framework for olfactory language based on frame semantics. The purpose of this annotation framework is to be used for annotating comparable resources for the study of sensory language and to create training datasets for supervised systems aimed at extracting sensory information. Furthermore, our approach incorporates words from specific historical periods, thereby enhancing the framework’s utility for studying language from a diachronic perspective.
+ 2024.isa-1.5
+ paccosi-tonelli-2024-new
+
+
+ What to Annotate: Retrieving Lexical Markers of Conspiracy Discourse from an Italian-English Corpus of Telegram Data
+ Costanza Marini
+ Elisabetta Jezek
+ 47–52
+ In this age of social media, Conspiracy Theories (CTs) have become an issue that can no longer be ignored. After providing an overview of CT literature and corpus studies, we describe the creation of a 40,000-token English-Italian bilingual corpus of conspiracy-oriented Telegram comments – the Complotto corpus – and the linguistic analysis we performed using the Sketch Engine online platform (Kilgarriff et al., 2010) on our annotated data to identify statistically relevant linguistic markers of CT discourse. Thanks to the platform’s keywords and key terms extraction functions, we were able to assess the statistical significance of the following lexical and semantic phenomena, both cross-linguistically and cross-CT, namely: (1) evidentiality and epistemic modality markers; (2) debunking vocabulary referring to another version of the truth lying behind the official one; (3) the conceptual metaphor INSTITUTIONS ARE ABUSERS. All these features qualify as markers of CT discourse and have the potential to be effectively used for future semantic annotation tasks to develop automatic systems for CT identification.
+ 2024.isa-1.6
+ marini-jezek-2024-annotate
+
+
+ Lightweight Connective Detection Using Gradient Boosting
+ Mustafa Erolcan Er
+ Murathan Kurfalı
+ Deniz Zeyrek
+ 53–59
+ In this work, we introduce a lightweight discourse connective detection system. Employing gradient boosting trained on straightforward, low-complexity features, this proposed approach sidesteps the computational demands of the current approaches that rely on deep neural networks. Considering its simplicity, our approach achieves competitive results while offering significant gains in terms of time even on CPU. Furthermore, the stable performance across two unrelated languages suggests the robustness of our system in the multilingual scenario. The model is designed to support the annotation of discourse relations, particularly in scenarios with limited resources, while minimizing performance loss.
+ 2024.isa-1.7
+ er-etal-2024-lightweight
+
+
+ Shallow Discourse Parsing on Twitter Conversations
+ Berfin Aktas
+ Burak Özmen
+ 60–65
+ We present our PDTB-style annotations on conversational Twitter data, which was initially annotated by Scheffler et al. (2019). We introduced 1,043 new annotations to the dataset, nearly doubling the number of previously annotated discourse relations. Subsequently, we applied a neural Shallow Discourse Parsing (SDP) model to the resulting corpus, improving its performance through retraining with in-domain data. The most substantial improvement was observed in the sense identification task (+19%). Our experiments with diverse training data combinations underline the potential benefits of exploring various data combinations in domain adaptation efforts for SDP. To the best of our knowledge, this is the first application of Shallow Discourse Parsing on Twitter data.
+ 2024.isa-1.8
+ aktas-ozmen-2024-shallow
+
+
+ Search tool for An Event-Type Ontology
+ Nataliia Petliak
+ Cristina Fernandéz Alcaina
+ Eva Fučíková
+ Jan Hajič
+ Zdeňka Urešová
+ 66–70
+ This short demo description paper presents a new tool designed for searching an event-type ontology with rich information, demonstrated on the SynSemClass ontology resource. The tool complements the web browser previously created by the authors of the SynSemClass ontology. Due to the complexity of the resource, the search tool offers possibilities both for linguistically-oriented researchers and for teams working with the resource from a technical point of view, such as building role labeling tools, automatic annotation tools, etc.
+ 2024.isa-1.9
+ petliak-etal-2024-search
+
+
+ Tiny But Mighty: A Crowdsourced Benchmark Dataset for Triple Extraction from Unstructured Text
+ Muhammad Salman
+ Armin Haller
+ Sergio J. Rodriguez Mendez
+ Usman Naseem
+ 71–81
+ In the context of Natural Language Processing (NLP) and Semantic Web applications, constructing Knowledge Graphs (KGs) from unstructured text plays a vital role. Several techniques have been developed for KG construction from text, but the lack of standardized datasets hinders the evaluation of triple extraction methods. The evaluation of existing KG construction approaches is based on structured data or manual investigations. To overcome this limitation, this work introduces a novel dataset specifically designed to evaluate KG construction techniques from unstructured text. Our dataset consists of a diverse collection of compound and complex sentences meticulously annotated by human annotators with potential triples (subject, verb, object). The annotations underwent further scrutiny by expert ontologists to ensure accuracy and consistency. For evaluation purposes, the proposed F-measure criterion offers a robust approach to quantify the relatedness and assess the alignment between extracted triples and the ground-truth triples, providing a valuable tool for evaluating the performance of triple extraction systems. By providing a diverse collection of high-quality triples, our proposed benchmark dataset offers a comprehensive training and evaluation set for refining the performance of state-of-the-art language models on a triple extraction task. Furthermore, this dataset encompasses various KG-related tasks, such as named entity recognition, relation extraction, and entity linking.
+ 2024.isa-1.10
+ salman-etal-2024-tiny
+
+
+ Less is Enough: Less-Resourced Multilingual AMR Parsing
+ Bram Vanroy
+ Tim Van de Cruys
+ 82–92
+ This paper investigates the efficacy of multilingual models for the task of text-to-AMR parsing, focusing on English, Spanish, and Dutch. We train and evaluate models under various configurations, including monolingual and multilingual settings, both in full and reduced data scenarios. Our empirical results reveal that while monolingual models exhibit superior performance, multilingual models are competitive across all languages, offering a more resource-efficient alternative for training and deployment. Crucially, our findings demonstrate that AMR parsing benefits from transfer learning across languages even when having access to significantly smaller datasets. As a tangible contribution, we provide text-to-AMR parsing models for the aforementioned languages as well as multilingual variants, and make available the large corpora of translated data for Dutch, Spanish (and Irish) that we used for training them in order to foster AMR research in non-English languages. Additionally, we open-source the training code and offer an interactive interface for parsing AMR graphs from text.
+ 2024.isa-1.11
+ vanroy-van-de-cruys-2024-less
+
+
+ MoCCA: A Model of Comparative Concepts for Aligning Constructicons
+ Arthur Lorenzi
+ Peter Ljunglöf
+ Ben Lyngfelt
+ Tiago Timponi Torrent
+ William Croft
+ Alexander Ziem
+ Nina Böbel
+ Linnéa Bäckström
+ Peter Uhrig
+ Ely E. Matos
+ 93–98
+ This paper presents MoCCA, a Model of Comparative Concepts for Aligning Constructicons under development by a consortium of research groups building Constructicons of different languages including Brazilian Portuguese, English, German and Swedish. The Constructicons will be aligned by using comparative concepts (CCs) providing language-neutral definitions of linguistic properties. The CCs are drawn from typological research on grammatical categories and constructions, and from FrameNet frames, organized in a conceptual network. Language-specific constructions are linked to the CCs in accordance with general principles. MoCCA is organized into files of two types: a largely static CC Database file and multiple Linking files containing relations between constructions in a Constructicon and the CCs. Tools are planned to facilitate visualization of the CC network and linking of constructions to the CCs. All files and guidelines will be versioned, and a mechanism is set up to report cases where a language-specific construction cannot be easily linked to existing CCs.
+ 2024.isa-1.12
+ lorenzi-etal-2024-mocca
+
+
+ ISO 24617-8 Applied: Insights from Multilingual Discourse Relations Annotation in English, Polish, and Portuguese
+ Aleksandra Tomaszewska
+ Purificação Silvano
+ António Leal
+ Evelin Amorim
+ 99–110
+ The main objective of this study is to contribute to multilingual discourse research by employing ISO-24617 Part 8 (Semantic Relations in Discourse, Core Annotation Schema – DR-core) for annotating discourse relations. Centering around a parallel discourse relations corpus that includes English, Polish, and European Portuguese, we initiate one of the few ISO-based comparative analyses through a multilingual corpus that aligns discourse relations across these languages. In this paper, we discuss the project’s contributions, including the annotated corpus, research findings, and statistics related to the use of discourse relations. The paper further discusses the challenges encountered in complying with the ISO standard, such as defining the scope of arguments and annotating specific relation types like Expansion. Our findings highlight the necessity for clearer definitions of certain discourse relations and more precise guidelines for argument spans, especially concerning the inclusion of connectives. Additionally, the study underscores the importance of ongoing collaborative efforts to broaden the inclusion of languages and more comprehensive datasets, with the objective of widening the reach of ISO-guided multilingual discourse research.
+ 2024.isa-1.13
+ tomaszewska-etal-2024-iso
+
+
+ Combining semantic annotation schemes through interlinking
+ Harry Bunt
+ 111–121
+ This paper explores the possibilities of using combinations of different semantic annotation schemes. This is particularly interesting for annotation schemes developed under the umbrella of the ISO Semantic Annotation Framework (ISO 24617), since these schemes were intended to be complementary, providing ways of indicating different semantic information about the same entities. However, there are certain overlaps between the schemes of SemAF parts, due to overlaps of their semantic domains, which are a potential source of inconsistencies. The paper shows how issues relating to inconsistencies can be addressed at the levels of concrete representation, abstract syntax, and semantic interpretation.
+ 2024.isa-1.14
+ bunt-2024-combining
+
+
+ Fusing ISO 24617-2 Dialogue Acts and Application-Specific Semantic Content Annotations
+ Andrei Malchanau
+ Volha Petukhova
+ Harry Bunt
+ 122–132
+ Accurately annotated data determines whether a modern high-performing AI/ML model will present a suitable solution to a complex task/application challenge, or whether time and resources are wasted. The more adequately the structure of the incoming data is specified, the more efficiently the data is translated to be used by the application. This paper presents an approach to an application-specific dialogue semantics design which integrates the dialogue act annotation standard ISO 24617-2 and various domain-specific semantic annotations. The proposed multi-scheme design offers a plausible and rather powerful strategy to integrate, validate, extend and reuse existing annotations, and to automatically generate code for dialogue system modules. Advantages and possible trade-offs are discussed.
+ 2024.isa-1.15
+ malchanau-etal-2024-fusing
+
+
+ Annotation-Based Semantics for Dialogues in the Vox World
+ Kiyong Lee
+ 133–143
+ This paper aims at enriching Annotation-Based Semantics (ABS) with the notion of small visual worlds, called the Vox worlds, to interpret dialogues in natural language. It attempts to implement classical set-theoretic models with these Vox worlds that serve as interpretation models. These worlds describe dialogue situations while providing background for the visualization of those situations in which these described dialogues take place interactively among dialogue participants, often triggering actions and emotions. The enriched ABS is based on VoxML, a modeling language for visual object conceptual structures (vocs or vox) that constitute the structural basis of visual worlds.
+ 2024.isa-1.16
+ lee-2024-annotation
+
+
+ Annotating Evaluative Language: Challenges and Solutions in Applying Appraisal Theory
+ Jiamei Zeng
+ Min Dong
+ Alex Chengyu Fang
+ 144–151
+ This article describes a corpus-based experiment to identify the challenges and solutions in the annotation of evaluative language according to the scheme defined in Appraisal Theory (Martin and White, 2005). Originating from systemic functional linguistics, Appraisal Theory provides a robust framework for the analysis of linguistic expressions of evaluation, stance, and interpersonal relationships. Despite its theoretical richness, the practical application of Appraisal Theory in text annotation presents significant challenges, chiefly due to the intricacies of identifying and classifying evaluative expressions within its sub-system of Attitude, which comprises Affect, Judgement, and Appreciation. This study examines these challenges through the annotation of a corpus of editorials related to the Russian-Ukraine conflict and aims to offer practical solutions to enhance the transparency and consistency of the annotation. By refining the annotation process and addressing the subjective nature in the identification and classification of evaluative language, this work represents some timely effort in the annotation of pragmatic knowledge in language resources.
+ 2024.isa-1.17
+ zeng-etal-2024-annotating
+
+
+ Attractive Multimodal Instructions, Describing Easy and Engaging Recipe Blogs
+ Ielka van der Sluis
+ Jarred Kiewiet de Jonge
+ 152–164
+ This paper presents a corpus study that extends and generalises an existing annotation model which integrates functional content descriptions delivered via text, pictures and interactive components. The model is used to describe a new corpus with 20 online vegan recipe blogs in terms of their Attractiveness for at least two types of readers: vegan readers and readers interested in a vegan lifestyle. Arguably, these readers value a blog that shows that the target dish is Easy to Make which can be inferred from the number of ingredients, procedural steps and visualised actions, according to an Easy to Read cooking instruction that displays a coherent use of verbal and visual modalities presenting processes and results of the cooking actions involved. Moreover, added value may be attributed to invitations to Engage with the blog content and functionality through which information about the recipe, the author, diet and nutrition can be accessed. Thus, the corpus study merges generalisable annotations of verbal, visual and interaction phenomena to capture the Attractiveness of online vegan recipe blogs to inform reader and user studies and ultimately offer guidelines for authoring effective online multimodal instructions.
+ 2024.isa-1.18
+ van-der-sluis-kiewiet-de-jonge-2024-attractive
+
+
+
diff --git a/data/xml/2024.latechclfl.xml b/data/xml/2024.latechclfl.xml
index edb3aeb475..699d91d442 100644
--- a/data/xml/2024.latechclfl.xml
+++ b/data/xml/2024.latechclfl.xml
@@ -28,6 +28,7 @@
In this paper, we evaluate two different natural language processing (NLP) approaches to solve a paradigmatic task for computational literary studies (CLS): the recognition of knowledge transfer in literary texts. We focus on the question of how adequately large language models capture the transfer of knowledge about family relations in German drama texts when this transfer is treated as a classification or textual entailment task using in-context learning (ICL). We find that a 13 billion parameter LLAMA 2 model performs best on the former, while GPT-4 performs best on the latter task. However, all models achieve relatively low scores compared to standard NLP benchmark results, struggle with inconsistencies under small changes in prompts, and are often unable to make simple inferences beyond the textual surface, which is why an unreflected, generic use of ICL in CLS still seems inadvisable.
2024.latechclfl-1.1
pagel-etal-2024-evaluating
+
Coreference in Long Documents using Hierarchical Entity Merging
@@ -38,6 +39,7 @@
Current top-performing coreference resolution approaches are limited with regard to the maximum length of texts they can accept. We explore a recursive merging technique of entities that allows us to apply coreference models to texts of arbitrary length, as found in many narrative genres. In experiments on established datasets, we quantify the drop in resolution quality caused by this approach. Finally, we use an under-explored resource in the form of a fully coreference-annotated novel to illustrate our model’s performance for long documents in practice. Here, we achieve state-of-the-art performance, outperforming previous systems capable of handling long documents.
2024.latechclfl-1.2
gupta-etal-2024-coreference
+
Metaphorical Framing of Refugees, Asylum Seekers and Immigrants in UK's Left and Right-Wing Media
@@ -46,6 +48,7 @@
The metaphorical framing of refugees, asylum seekers, and immigrants (RASIM) has been widely explored in academia, but mainly through close analysis. The present research outlines a large-scale computational investigation of RASIM metaphors in UK's media discourse. We experiment with a method that facilitates automatic identification of RASIM metaphors in 21 years of RASIM-related news reports from eight popular UK newspapers. From the metaphors extracted, four overarching frames are identified. Further analysis reveals correlations between political bias and metaphor usage: overall, right-biased newspapers use RASIM metaphors more frequently than their left-biased counterparts. Within the metaphorical frames, water, disaster, and non-human metaphors are more prevalent in right-biased media. Additionally, diachronic analysis illustrates that the distinctions between left and right media have evolved over time. Water metaphors, for example, have become increasingly more representative of the political right in the past two decades.
2024.latechclfl-1.3
wang-2024-metaphorical
+
Computational Analysis of Dehumanization of Ukrainians on Russian Social Media
@@ -55,6 +58,7 @@
Dehumanization is a pernicious process of denying some or all attributes of humanness to the target group. It is frequently cited as a common hallmark of incitement to commit genocide. The international security landscape has seen a dramatic shift following the 2022 Russian invasion of Ukraine. This, coupled with recent developments in the conceptualization of dehumanization, necessitates the creation of new techniques for analyzing and detecting this extreme violence-related phenomenon on a large scale. Our project pioneers the development of a detection system for instances of dehumanization. To achieve this, we collected the entire posting history of the most popular bloggers on Russian Telegram and tested classical machine learning, deep learning, and zero-shot learning approaches to explore and detect the dehumanizing rhetoric. We found that the transformer-based method for entity extraction SpERT shows a promising result of F1 = 0.85 for binary classification. The proposed methods can be built into the systems of anticipatory governance, contribute to the collection of evidence of genocidal intent in the Russian invasion of Ukraine, and pave the way for large-scale studies of dehumanizing language. This paper contains references to language that some readers may find offensive.
2024.latechclfl-1.4
burovova-romanyshyn-2024-computational
+
Compilation of a Synthetic Judeo-French Corpus
@@ -65,6 +69,7 @@
This is a short paper describing the process of derivation of synthetic Judeo-French text. Judeo-French is one of a number of rare languages used in speaking and writing by Jewish communities as confined to a particular temporal and geographical frame (in this case, 11th- to 14th-century France). The number of resources in the language is very limited and its involvement in the contemporary domain of Natural Language Processing (NLP) is practically non-existent. This work outlines the compilation of a synthetic Judeo-French corpus. For the purpose, a pipeline of transformations is applied to Old French text belonging to the same general time period, leading to the derivation of text that is as reliable as possible in terms of phonological, morphological and lexical characteristics as witnessed in Judeo-French. Ultimately, the goal is for this synthetic corpus to be used in standard NLP tasks, such as Neural Machine Translation (NMT), as an instance of data augmentation.
2024.latechclfl-1.5
nikolova-stoupak-etal-2024-compilation
+
Detecting Structured Language Alternations in Historical Documents by Combining Language Identification with Fourier Analysis
@@ -76,6 +81,7 @@
2024.latechclfl-1.6
2024.latechclfl-1.6.SupplementaryMaterial.zip
sirin-etal-2024-detecting
+
EmotionArcs: Emotion Arcs for 9,000 Literary Texts
@@ -100,6 +106,7 @@
2024.latechclfl-1.8
2024.latechclfl-1.8.SupplementaryMaterial.zip
alves-etal-2024-multi
+
EventNet-ITA: Italian Frame Parsing for Events
@@ -118,6 +125,7 @@
The Moravians are a Christian group that has emerged from a 15th century movement. In this paper, we investigate how memoirs written by the devotees of this group can be analyzed with methods from computational linguistics, in particular sentiment analysis. To this end, we experiment with two different fine-tuning strategies and find that the best performance for ternary sentiment analysis (81% accuracy) is achieved by fine-tuning a German BERT model, outperforming in particular models trained on much larger German sentiment datasets. We further investigate the model(s) using SHAP scores and find that the best performing model struggles with multiple negations and mixed statements. Finally, we show two application scenarios motivated by research questions from religious studies.
2024.latechclfl-1.10
brookshire-reiter-2024-modeling
+
Applying Information-theoretic Notions to Measure Effects of the Plain English Movement on English Law Reports and Scientific Articles
@@ -127,6 +135,7 @@
We investigate the impact of the Plain English Movement (PEM) on the complexity of legal language in UK law reports from the 1950s-2010s, contrasting it with the evolution of scientific language. The PEM, emerging in the late 20th century, advocated for clear and understandable legal language. We define complexity through the concept of surprisal - an information-theoretic measure correlating with cognitive processing difficulty. Our research contrasts surprisal with traditional readability measures, which often overlook content. We hypothesize that, if the PEM has influenced legal language, there would be a reduction in complexity over time and a shift from a nominal to a more verbal style. We analyze text complexity and lexico-grammatical changes in line with PEM recommendations. Results indicate minimal impact of the PEM on both legal and scientific domains. This finding suggests future research should consider processing effort when advocating for linguistic norms to enhance accessibility.
2024.latechclfl-1.11
bagdasarov-degaetano-ortlieb-2024-applying
+
Uncovering the Handwritten Text in the Margins: End-to-end Handwritten Text Detection and Recognition
@@ -138,6 +147,7 @@
The pressing need for digitization of historical documents has led to a strong interest in designing computerised image processing methods for automatic handwritten text recognition. However, not much attention has been paid to studying the handwritten text written in the margins, i.e. marginalia, which also forms an important source of information. Nevertheless, training an accurate and robust recognition system for marginalia calls for data-efficient approaches due to the unavailability of sufficient amounts of annotated multi-writer texts. Therefore, this work presents an end-to-end framework for automatic detection and recognition of handwritten marginalia, and leverages data augmentation and transfer learning to overcome training data scarcity. The detection phase involves investigation of R-CNN and Faster R-CNN networks. The recognition phase includes an attention-based sequence-to-sequence model, with ResNet feature extraction, bidirectional LSTM-based sequence modeling, and attention-based prediction of marginalia. The effectiveness of the proposed framework has been empirically evaluated on the data from early book collections found in the Uppsala University Library in Sweden. Source code and pre-trained models are available on GitHub.
2024.latechclfl-1.12
cheng-etal-2024-uncovering
+
Historical Portrayal of Greek Tourism through Topic Modeling on International Newspapers
@@ -148,6 +158,7 @@
In this paper, we bridge computational linguistics with historical methods to explore the potential of topic modeling in historical newspapers. Our case study focuses on British and American newspapers published in the second half of the 20th century that debate issues of Greek tourism, but our method can be transposed to any diachronic data. We demonstrate that Non-negative Matrix Factorization (NMF) can generate interpretable topics within the historical period under examination, providing a tangible example of how computational text analysis can assist historical research. The contribution of our work is two-fold: first, the extracted topics are evaluated both by a computational linguist and by a historian, highlighting the crucial role of domain experts when interpreting topic modeling outputs. Second, the extracted topics are contextualized within the historical and political environment in which they appear, providing interesting insights about the historical representations of Greek tourism over the years, and about the development and the hallmarks of American and British tourism in Greece across different historical periods (from 1945 to 1989). The comparative analysis between the American and the British press reveals interesting insights, including similar responses to specific events as well as notable differences between British and American tourism to Greece during the historical periods under examination. Overall, the results of our analysis can provide valuable information for academics and researchers in the field of (Digital) Humanities and Social Sciences, as well as for stakeholders in the tourism industry.
2024.latechclfl-1.13
karamouzi-etal-2024-historical
+
Post-Correction of Historical Text Transcripts with Large Language Models: An Exploratory Study
@@ -161,6 +172,7 @@
2024.latechclfl-1.14
2024.latechclfl-1.14.SupplementaryMaterial.zip
boros-etal-2024-post
+
Distinguishing Fictional Voices: a Study of Authorship Verification Models for Quotation Attribution
@@ -173,6 +185,7 @@
2024.latechclfl-1.15
2024.latechclfl-1.15.SupplementaryMaterial.zip
michel-etal-2024-distinguishing
+
Perplexing Canon: A study on GPT-based perplexity of canonical and non-canonical literary works
@@ -185,6 +198,7 @@
2024.latechclfl-1.16
2024.latechclfl-1.16.SupplementaryMaterial.zip
wu-etal-2024-perplexing
+
People and Places of the Past - Named Entity Recognition in Swedish Labour Movement Documents from Historical Sources
@@ -195,6 +209,7 @@
2024.latechclfl-1.17
2024.latechclfl-1.17.SupplementaryMaterial.zip
tudor-pettersson-2024-people
+
Part-of-Speech Tagging of 16th-Century Latin with GPT
@@ -205,6 +220,7 @@
2024.latechclfl-1.18
2024.latechclfl-1.18.SupplementaryMaterial.zip
stussi-strobel-2024-part
+
Two Approaches to Diachronic Normalization of Polish Texts
@@ -218,6 +234,7 @@
2024.latechclfl-1.19
2024.latechclfl-1.19.SupplementaryMaterial.tex
dudzic-etal-2024-two
+
Enriching the Metadata of Community-Generated Digital Content through Entity Linking: An Evaluative Comparison of State-of-the-Art Models
@@ -230,6 +247,7 @@
Digital archive collections that have been contributed by communities, known as community-generated digital content (CGDC), are important sources of historical and cultural knowledge. However, CGDC items are not easily searchable due to semantic information being obscured within their textual metadata. In this paper, we investigate the extent to which state-of-the-art, general-domain entity linking (EL) models (i.e., BLINK, EPGEL and mGENRE) can map named entities mentioned in CGDC textual metadata to Wikidata entities. We evaluate and compare their performance on an annotated dataset of CGDC textual metadata and provide some error analysis, with a view to informing future studies aimed at enriching CGDC metadata using entity linking methods.
2024.latechclfl-1.20
benkhedda-etal-2024-enriching
+
Recognising Occupational Titles in German Parliamentary Debates
@@ -239,6 +257,7 @@
2024.latechclfl-1.21
2024.latechclfl-1.21.SupplementaryMaterial.zip
binnewitt-2024-recognising
+
Dynamic embedded topic models and change-point detection for exploring literary-historical hypotheses
@@ -249,6 +268,7 @@
2024.latechclfl-1.22
2024.latechclfl-1.22.SupplementaryMaterial.zip
sirin-lippincott-2024-dynamic
+
Post-OCR Correction of Digitized Swedish Newspapers with ByT5
@@ -259,6 +279,7 @@
2024.latechclfl-1.23
2024.latechclfl-1.23.SupplementaryMaterial.zip
lofgren-dannells-2024-post
+
The Kronieken Corpus: an Annotated Collection of Dutch/Flemish Chronicles from 1500-1850
@@ -273,6 +294,7 @@
2024.latechclfl-1.24
2024.latechclfl-1.24.SupplementaryMaterial.zip
dekker-etal-2024-kronieken
+
Direct Speech Identification in Swedish Literature and an Exploration of Training Data Type, Typographical Markers, and Evaluation Granularity
@@ -281,6 +303,7 @@
Identifying direct speech in literary fiction is challenging for cases that do not mark speech segments with quotation marks. Such efforts have previously been based either on smaller manually annotated gold data or larger automatically annotated silver data, extracted from works with quotation marks. However, no direct comparison has so far been made between the performance of these two types of training data. In this work, we address this gap. We further explore the effect of different types of typographical speech marking and of using evaluation metrics of different granularity. We perform experiments on Swedish literary texts and find that using gold and silver data has different strengths, with gold data having stronger results on token-level metrics, whereas silver data overall has stronger results on span-level metrics. If the training data contains some data that matches the typographical speech marking of the target, that is generally sufficient for achieving good results, but it does not seem to hurt if the training data also contains other types of marking.
2024.latechclfl-1.25
stymne-2024-direct
+
Pairing Orthographically Variant Literary Words to Standard Equivalents Using Neural Edit Distance Models
@@ -291,6 +314,7 @@
2024.latechclfl-1.26
2024.latechclfl-1.26.SupplementaryMaterial.zip
messner-lippincott-2024-pairing
+
[Lions: 1] and [Tigers: 2] and [Bears: 3], Oh My! Literary Coreference Annotation with LLMs
@@ -301,6 +325,7 @@
2024.latechclfl-1.27
2024.latechclfl-1.27.SupplementaryMaterial.zip
hicke-mimno-2024-lions
+
Stage Direction Classification in French Theater: Transfer Learning Experiments
@@ -311,6 +336,7 @@
2024.latechclfl-1.28
2024.latechclfl-1.28.SupplementaryMaterial.zip
schneider-ruiz-fabo-2024-stage
+
diff --git a/data/xml/2024.law.xml b/data/xml/2024.law.xml
index 6b63f59e04..662e8a7f11 100644
--- a/data/xml/2024.law.xml
+++ b/data/xml/2024.law.xml
@@ -27,6 +27,7 @@
Visually Rich Form Understanding (VRFU) poses a complex research problem due to the documents’ highly structured nature and yet highly variable style and content. Current annotation schemes decompose form understanding and omit key hierarchical structure, making development and evaluation of end-to-end models difficult. In this paper, we propose a novel F1 metric to evaluate form parsers and describe a new content-agnostic, tree-based annotation scheme for VRFU: TreeForm. We provide methods to convert previous annotation schemes into TreeForm structures and evaluate TreeForm predictions using a modified version of the normalized tree-edit distance. We present initial baselines for our end-to-end performance metric and the TreeForm edit distance, averaged over the FUNSD and XFUND datasets, of 61.5 and 26.4, respectively. We hope that TreeForm encourages deeper research in annotating, modeling, and evaluating the complexities of form-like documents.
2024.law-1.1
zmigrod-etal-2024-treeform
+
Annotation Scheme for English Argument Structure Constructions Treebank
@@ -36,6 +37,7 @@
We introduce a detailed annotation scheme for argument structure constructions (ASCs) along with a manually annotated ASC treebank. This treebank encompasses 10,204 sentences from both first (5,936) and second language English datasets (1,948 for written; 2,320 for spoken). We detail the annotation process and evaluate inter-annotator agreement overall and for each ASC category.
2024.law-1.2
sung-kyle-2024-annotation
+
A Mapping on Current Classifying Categories of Emotions Used in Multimodal Models for Emotion Recognition
@@ -60,6 +62,7 @@
In the realm of Machine Learning and Deep Learning, there is a need for high-quality annotated data to train and evaluate supervised models. An extensive number of annotation tools have been developed to facilitate the data labelling process. However, finding the right tool is a demanding task involving thorough searching and testing. Hence, to effectively navigate the multitude of tools, it becomes essential to ensure their findability, accessibility, interoperability, and reusability (FAIR). This survey addresses the FAIRness of existing annotation software by evaluating 50 different tools against the FAIR principles for research software (FAIR4RS). The study indicates that while being accessible and interoperable, annotation tools are difficult to find and reuse. In addition, there is a need to establish community standards for annotation software development, documentation, and distribution.
2024.law-1.4
borisova-etal-2024-surveying
+
Automatic Annotation Elaboration as Feedback to Sign Language Learners
@@ -69,6 +72,7 @@
Beyond enabling linguistic analyses, linguistic annotations may serve as training material for developing automatic language assessment models as well as for providing textual feedback to language learners. Yet these linguistic annotations in their original form are often not easily comprehensible for learners. In this paper, we explore the utilization of GPT-4, as an example of a large language model (LLM), to process linguistic annotations into clear and understandable feedback on their productions for language learners, specifically sign language learners.
2024.law-1.5
battisti-ebling-2024-automatic
+
Towards Better Inclusivity: A Diverse Tweet Corpus of English Varieties
@@ -90,6 +94,7 @@
Access to jurisprudence is of paramount importance for both law professionals (judges, lawyers, law students) and for the larger public. In Romania, the Superior Council of Magistracy holds a large database of jurisprudence from different courts in the country, which is updated daily. However, granting public access requires its anonymization. This paper presents the efforts behind building a corpus for the anonymization process. We present the annotation scheme, the manual annotation methods, and the platform used.
2024.law-1.7
pais-etal-2024-building
+
Class Balancing for Efficient Active Learning in Imbalanced Datasets
@@ -103,6 +108,7 @@
Recent developments in active learning algorithms for NLP tasks show promising results in terms of reducing labelling complexity. In this paper we extend this effort to imbalanced datasets; we bridge between the active learning approach of obtaining diverse and informative examples, and the heuristic of class balancing used in imbalanced datasets. We develop a novel tune-free weighting technique that can be applied to various existing active learning algorithms, adding a component of class balancing. We compare several active learning algorithms to their modified version on multiple public datasets and show that when the classes are imbalanced, with manual annotation effort remaining equal, the modified version significantly outperforms the original both in terms of the test metric and the number of obtained minority examples. Moreover, when the imbalance is mild or non-existent (classes are completely balanced), our technique does not harm the base algorithms.
2024.law-1.8
fairstein-etal-2024-class
+
When is a Metaphor Actually Novel? Annotating Metaphor Novelty in the Context of Automatic Metaphor Detection
@@ -112,6 +118,7 @@
We present an in-depth analysis of metaphor novelty, a relatively overlooked phenomenon in NLP. Novel metaphors have been analyzed via scores derived from crowdsourcing in NLP, while in theoretical work they are often defined by comparison to senses in dictionary entries. We reannotate metaphorically used words in the large VU Amsterdam Metaphor Corpus based on whether their metaphoric meaning is present in the dictionary. Based on this, we find that perceived metaphor novelty often clashes with the dictionary-based definition. We use the new labels to evaluate the performance of state-of-the-art language models for automatic metaphor detection and notice that novel metaphors according to our dictionary-based definition are easier to identify than novel metaphors according to crowd-sourced novelty scores. In a subsequent analysis, we study the correlation between high novelty scores and word frequencies in the pretraining and finetuning corpora, as well as potential problems with rare words for pre-trained language models. In line with previous work, we find a negative correlation between word frequency in the training data and novelty scores, and we link these aspects to problems with the tokenization of BERT and RoBERTa.
2024.law-1.9
reimann-scheffler-2024-metaphor
+
Enhancing Text Classification through LLM-Driven Active Learning and Human Annotation
@@ -121,6 +128,7 @@
In the context of text classification, the financial burden of annotation exercises for creating training data is a critical issue. Active learning techniques, particularly those rooted in uncertainty sampling, offer a cost-effective solution by pinpointing the most instructive samples for manual annotation. Similarly, Large Language Models (LLMs) such as GPT-3.5 provide an alternative for automated annotation but come with concerns regarding their reliability. This study introduces a novel methodology that integrates human annotators and LLMs within an Active Learning framework. We conducted evaluations on three public datasets: IMDB for sentiment analysis, a Fake News dataset for authenticity discernment, and a Movie Genres dataset for multi-label classification. The proposed framework integrates human annotation with the output of LLMs, depending on the model uncertainty levels. This strategy achieves an optimal balance between cost efficiency and classification performance. The empirical results show a substantial decrease in the costs associated with data annotation while either maintaining or improving model accuracy.
2024.law-1.10
rouzegar-makrehchi-2024-enhancing
+
Using ChatGPT for Annotation of Attitude within the Appraisal Theory: Lessons Learned
@@ -147,6 +155,7 @@
We often assume that annotation tasks, such as annotating for the presence of conspiracy theories, can be performed with hard labels, without definitions or guidelines. Our annotation experiments, comparing students and experts, show that there is little agreement on basic annotations even among experts. For this reason, we conclude that we need to accept disagreement as an integral part of such annotations.
2024.law-1.12
hemm-etal-2024-serious
+
A GPT among Annotators: LLM-based Entity-Level Sentiment Annotation
@@ -179,6 +188,7 @@
Pre-trained large language models, such as ChatGPT, achieve outstanding performance in various reasoning tasks without supervised training and were found to have outperformed crowdsourcing workers. Nonetheless, ChatGPT’s performance in the task of implicit discourse relation classification, prompted by a standard multiple-choice question, is still far from satisfactory and considerably inferior to state-of-the-art supervised approaches. This work investigates several proven prompting techniques to improve ChatGPT’s recognition of discourse relations. In particular, we experimented with breaking down the classification task that involves numerous abstract labels into smaller subtasks. Nonetheless, experiment results show that the inference accuracy hardly changes even with sophisticated prompt engineering, suggesting that implicit discourse relation classification is not yet resolvable under zero-shot or few-shot settings.
2024.law-1.15
yung-etal-2024-prompting
+
PropBank goes Public: Incorporation into Wikidata
@@ -206,6 +216,7 @@
We present the construction of a German chat corpus in an experimental setting. Our primary objective is to advance the methodology of discourse continuation for dialogue. The corpus features a fine-grained, multi-layer annotation of referential expressions and coreferential chains. Additionally, we have developed a comprehensive annotation scheme for coherence relations to describe discourse structure.
2024.law-1.17
jasinskaja-etal-2024-reference
+
Dependency Annotation of Ottoman Turkish with Multilingual BERT
@@ -228,6 +239,7 @@
Instruction tuning has become an integral part of training pipelines for Large Language Models (LLMs) and has been shown to yield strong performance gains. In an orthogonal line of research, Annotation Error Detection (AED) has emerged as a tool for detecting quality problems in gold standard labels. So far, however, the application of AED methods has been limited to classification tasks. It is an open question how well AED methods generalize to language generation settings, which are becoming more widespread via LLMs. In this paper, we present a first and novel benchmark for AED on instruction-tuning data: Donkii. It comprises three instruction-tuning datasets enriched with error annotations by experts and semi-automatic methods. We also provide a novel taxonomy of error types for instruction-tuning data. We find that all three datasets contain clear errors, which sometimes propagate directly into instruction-tuned LLMs. We propose four AED baselines for the generative setting and evaluate them extensively on the newly introduced dataset. Our results show that the choice of the right AED method and model size is indeed crucial, and we derive practical recommendations for how to use AED methods to clean instruction-tuning data.
2024.law-1.19
weber-etal-2024-donkii
+
EEVEE: An Easy Annotation Tool for Natural Language Processing
@@ -239,6 +251,7 @@
Annotation tools are the starting point for creating Natural Language Processing (NLP) datasets. There is a wide variety of tools available; setting up these tools is, however, a hindrance. We propose Eevee, an annotation tool focused on simplicity, efficiency, and ease of use. It can run directly in the browser (no setup required) and uses tab-separated files (as opposed to character offsets or task-specific formats) for annotation. It allows for annotation of multiple tasks on a single dataset and supports four task-types: sequence labeling, span labeling, text classification and seq2seq.
2024.law-1.20
sorensen-etal-2024-eevee
+
diff --git a/data/xml/2024.ldl.xml b/data/xml/2024.ldl.xml
new file mode 100644
index 0000000000..213a429b16
--- /dev/null
+++ b/data/xml/2024.ldl.xml
@@ -0,0 +1,206 @@
+
+
+
+
+ Proceedings of the 9th Workshop on Linked Data in Linguistics @ LREC-COLING 2024
+ ChristianChiarcos
+ KaterinaGkirtzou
+ MaximIonov
+ FahadKhan
+ John P.McCrae
+ Elena MontielPonsoda
+ Patricia MartínChozas
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.ldl-1
+ ldl
+ ws
+
+
+ 2024.ldl-1.0
+ ldl-2024-linked
+
+
+ LLODIA: A Linguistic Linked Open Data Model for Diachronic Analysis
+ FlorentinaArmaselu
+ ChayaLiebeskind
+ PaolaMarongiu
+ BarbaraMcGillivray
+ GiedreValunaite Oleskeviciene
+ Elena-SimonaApostol
+ Ciprian-OctavianTruica
+ DanielaGifu
+ 1–10
+ This article proposes a linguistic linked open data model for diachronic analysis (LLODIA) that combines data derived from diachronic analysis of multilingual corpora with dictionary-based evidence. A humanities use case was devised as a proof of concept that includes examples in five languages (French, Hebrew, Latin, Lithuanian and Romanian) related to various meanings of the term “revolution” considered at different time intervals. The examples were compiled through diachronic word embedding and dictionary alignment.
+ 2024.ldl-1.1
+ armaselu-etal-2024-llodia
+
+
+ Cross-Lingual Ontology Matching using Structural and Semantic Similarity
+ ShubhankerBanerjee
+ Bharathi RajaChakravarthi
+ John PhilipMcCrae
+ 11–21
+ The development of ontologies in various languages is attracting attention as the amount of multilingual data available on the web increases. Cross-lingual ontology matching facilitates interoperability amongst ontologies in different languages. Although supervised machine learning-based methods have shown good performance on ontology matching, their application to the cross-lingual setting is limited by the availability of training data. Current state-of-the-art unsupervised methods for cross-lingual ontology matching focus on lexical similarity between entities. These approaches follow a two-stage pipeline where the entities are translated into a common language using a translation service in the first step, followed by computation of lexical similarity between the translations to match the entities in the second step. In this paper we introduce a novel ontology matching method based on the fusion of structural similarity and cross-lingual semantic similarity. We carry out experiments using three language pairs and report substantial improvements over the performance of the lexical methods, thus showing the effectiveness of our proposed approach. To the best of our knowledge this is the first work which tackles the problem of unsupervised ontology matching in the cross-lingual setting by leveraging both structural and semantic embeddings.
+ 2024.ldl-1.2
+ banerjee-etal-2024-cross
+
+
+ Querying the Lexicon der indogermanischen Verben in the LiLa Knowledge Base: Two Use Cases
+ Valeria IreneBoano
+ MarcoPassarotti
+ RiccardoGinevra
+ 22–31
+ This paper presents two use cases of the etymological data provided by the Lexicon der indogermanischen Verben (LIV) after their publication as Linked Open Data and their linking to the LiLa Knowledge Base (KB) of interoperable linguistic resources for Latin. The first part of the paper briefly describes the LiLa KB and its structure. Then, the LIV and the information it contains are introduced, followed by a short description of the ontologies and the extensions used for modelling the LIV’s data and interlinking them to the LiLa ecosystem. The last section details the two use cases. The first case concerns the inflection types of the Latin verbs that reflect Proto-Indo-European stems, while the second one focusses on the Latin derivatives of the inherited stems. The results of the investigations are put in relation to current research topics in Historical Linguistics, demonstrating their relevance to the discipline.
+ 2024.ldl-1.3
+ boano-etal-2024-querying
+
+
+ Defining an Ontology for Museum Critical Cataloguing Terminology Guidelines
+ ErinCanning
+ 32–36
+ This paper presents the proposed ontology for the project Computational Approaches for Addressing Problematic Terminology (CAAPT). This schema seeks to represent the contents and structure of language guideline documents produced by cultural heritage institutions seeking to engage with critical cataloguing or reparative description work, known as terminology guidance documents. It takes the Victoria & Albert Museum’s Terminology Guidance Document as a source for the initial modelling work. Ultimately, CAAPT seeks to expand the knowledge graph beyond the V&A Museum context to incorporate additional terminology guidance documents and linked open data vocabularies. The ontology seeks to bring together scholarly communities in areas relevant to this project, most notably those in cultural heritage and linguistics linked open data, by leveraging existing linked data resources in these areas: as such, OntoLex, CIDOC CRM, and SKOS are used as a foundation for this work, along with a proposed schema from a related project, CULCO. As the CAAPT project is in early stages, this paper presents the preliminary results of work undertaken thus far in order to seek feedback from the linguistics linked open data community.
+ 2024.ldl-1.4
+ canning-2024-defining
+
+
+ The MOLOR Lemma Bank: a New LLOD Resource for Old Irish
+ TheodorusFransen
+ CormacAnderson
+ SachaBeniamine
+ MarcoPassarotti
+ 37–43
+ This paper describes the first steps in creating a Lemma Bank for Old Irish (600–900 CE) within the Linked Data paradigm, taking inspiration from a similar resource for Latin built as part of the LiLa project (2018–2023). The focus is on the extraction and RDF conversion of nouns from Goidelex, a novel and highly structured morphological resource for Old Irish. The aim is to strike a good balance between retaining a representative level of morphological granularity and at the same time keeping the amount of lemma variants within workable limits, to facilitate straightforward resource interlinking for Old Irish, planned as future work.
+ 2024.ldl-1.5
+ fransen-etal-2024-molor
+
+
+ CHAMUÇA: Towards a Linked Data Language Resource of Portuguese Borrowings in Asian Languages
+ FahadKhan
+ AnaSalgado
+ IsuriAnuradha
+ RuteCosta
+ ChamilaLiyanage
+ John P.McCrae
+ Atul Kr.Ojha
+ PriyaRani
+ FrancescaFrontini
+ 44–48
+ This paper presents the development of CHAMUÇA, a novel lexical resource designed to document the influence of the Portuguese language on various Asian languages, with an initial focus on the languages of South Asia. Through the utilization of linked open data and the OntoLex vocabulary, CHAMUÇA offers structured insights into the linguistic characteristics and cultural ramifications of Portuguese borrowings across multiple languages. The article outlines CHAMUÇA’s potential contributions to the linguistic linked data community, emphasising its role in addressing the scarcity of resources for lesser-resourced languages and serving as a test case for organising etymological data in a queryable format. CHAMUÇA emerges as an initiative towards the comprehensive cataloguing and analysis of Portuguese borrowings, offering valuable insights into language contact dynamics, historical evolution, and cultural exchange in Asia, one that is based on linked data technology.
+ 2024.ldl-1.6
+ khan-etal-2024-chamuca
+
+
+ LODinG: Linked Open Data in the Humanities
+ JacekKudera
+ ClaudiaBamberg
+ ThomasBurch
+ FolkeGernert
+ MariaHinzmann
+ SusanneKabatnik
+ ClaudineMoulin
+ BenjaminRaue
+ AchimRettinger
+ JörgRöpke
+ RalfSchenkel
+ KristinShi-Kupfer
+ DorisSchirra
+ ChristofSchöch
+ JoëlleWeis
+ 49–54
+ We are presenting LODinG – Linked Open Data in the Humanities (abbreviated from Linked Open Data in den Geisteswissenschaften), a recently launched research initiative exploring the intersection of Linked Open Data (LOD) and a range of areas of work within the Humanities. We focus on effective methods of collecting, modeling, linking, releasing and analyzing machine-readable information relevant to (digital) humanities research in the form of LOD. LODinG combines the sources and methods of digital humanities, general and computational linguistics, digital lexicography, German and Romance philology, translatology, cultural and literary studies, media studies, information science and law to explore and expand the potential of the LOD paradigm for such a diverse and multidisciplinary field. The project’s primary objectives are to improve the methods of extracting, modeling and analyzing multilingual data in the LOD paradigm; to demonstrate the application of the linguistic LOD to various methods and domains within and beyond the humanities; and to develop a modular, cross-domain data model for the humanities.
+ 2024.ldl-1.7
+ kudera-etal-2024-loding
+
+
+ DigItAnt: a platform for creating, linking and exploiting LOD lexica with heterogeneous resources
+ MicheleMallia
+ MichelaBandini
+ AndreaBellandi
+ FrancescaMurano
+ SilviaPiccini
+ LucaRigobianco
+ AlessandroTommasi
+ CesareZavattari
+ MariarosariaZinzi
+ ValeriaQuochi
+ 55–65
+ Over the past few years, the deployment of Linked Open Data (LOD) technologies has witnessed significant advancements across a myriad of sectors, linguistics included. This progression is characterized by an exponential increase in the conversion of resources to adhere to contemporary encoding standards. Such transformations are driven by the objectives outlined in “ecological” methodologies, notably the FAIR data principles, which advocate for the reuse and interoperability of resources. This paper presents the DigItAnt architecture, developed in the context of a national project funded by the Italian Ministry of Research and in the service of a recently started Italian endeavor to realize a federation of infrastructures for the humanities. It details its services, utilities and data types, and shows how it manages to produce, exploit and interlink LLOD and non-LLOD datasets in ways that are meaningful to its intended target disciplinary context, i.e. historical linguistics over epigraphy data. The paper also introduces how DigItAnt services and functionalities will contribute to the empowerment of the H2IOSC Italian infrastructures cluster project, which is devoted to the construction of a nationwide research infrastructure federation for the humanities, and it will possibly contribute to its pilot project towards an authoritative LLOD platform.
+ 2024.ldl-1.8
+ mallia-etal-2024-digitant
+
+
+ Teanga Data Model for Linked Corpora
+ John P.McCrae
+ PriyaRani
+ AdrianDoyle
+ BernardoStearns
+ 66–74
+ Corpus data is the main source of data for natural language processing applications; however, no standard or model for corpus data has become predominant in the field. Linguistic linked data aims to provide methods by which data can be made findable, accessible, interoperable and reusable (FAIR). However, current attempts to create a linked data format for corpora have been unsuccessful due to the verbose and specialised formats that they use. In this work, we present the Teanga data model, which uses a layered annotation model to capture all NLP-relevant annotations. We present the YAML serialization of the model, which is concise and uses a widely-deployed format, and we describe how this can be interpreted as RDF. Finally, we demonstrate three examples of the use of the Teanga data model for syntactic annotation, literary analysis and multilingual corpora.
+ 2024.ldl-1.9
+ mccrae-etal-2024-teanga
+
+
+ The Services of the LiLa Knowledge Base of Interoperable Linguistic Resources for Latin
+ MarcoPassarotti
+ FrancescoMambrini
+ GiovanniMoretti
+ 75–83
+ This paper describes three online services designed to ease the tasks of querying and populating the linguistic resources for Latin made interoperable through their publication as Linked Open Data in the LiLa Knowledge Base. As for querying the KB, we present an interface to search the collection of lemmas that represents the core of the Knowledge Base, and an interactive, graphical platform to run queries on the resources currently interlinked. As for populating the KB with new textual resources, we describe a tool that performs automatic tokenization, lemmatization and Part-of-Speech tagging of a raw text in Latin and links its tokens to LiLa.
+ 2024.ldl-1.10
+ passarotti-etal-2024-services
+
+
+ An Annotated Dataset for Transformer-based Scholarly Information Extraction and Linguistic Linked Data Generation
+ VayianosPertsas
+ MarialenaKasapaki
+ PanosConstantopoulos
+ 84–93
+ We present a manually curated and annotated, multidisciplinary dataset of 15,262 sentences from research articles (abstract and main text) that can be used for transformer-based extraction from scholarly publications of three types of entities: 1) research methods, named entities of variable length, 2) research goals, entities that appear as textual spans of variable length with mostly fixed lexico-syntactic structure, and 3) research activities, entities that appear as textual spans of variable length with complex lexico-syntactic structure. We explore the capabilities of our dataset by using it for training/fine-tuning various ML and transformer-based models. We compare our fine-tuned models as well as LLM responses (ChatGPT 3.5) based on 10-shot learning, by measuring F1 scores in token-based, entity-based strict and entity-based partial evaluations across interdisciplinary and discipline-specific datasets in order to capture any possible differences in discipline-oriented writing styles. Results show that fine-tuning of transformer-based models significantly outperforms few-shot learning with LLMs such as ChatGPT, highlighting the significance of annotation datasets in such tasks. Our dataset can also be used as a source for linguistic linked data by itself. We demonstrate this by presenting indicative queries in SPARQL, executed over such an RDF knowledge graph.
+ 2024.ldl-1.11
+ pertsas-etal-2024-annotated
+
+
+ Linguistic LOD for Interoperable Morphological Description
+ MichaelRosner
+ MaximIonov
+ 94–102
+ Interoperability is a characteristic of a product or system that seamlessly works with another product or system and implies a certain level of independence from the context of use. Turning to language resources, interoperability is frequently cited as one important rationale underlying the use of LLOD representations and is generally regarded as highly desirable. In this paper we further elaborate this theme, distinguishing three different kinds of interoperability and providing practical implementations with examples from morphology.
+ 2024.ldl-1.12
+ rosner-ionov-2024-linguistic
+
+
+ Modeling linking between text and lexicon with OntoLex-Lemon: a case study of computational terminology for the Babylonian Talmud
+ FlaviaSciolette
+ 103–107
+ This paper illustrates the first steps in the creation of a computational terminology for the Babylonian Talmud. After introducing the motivation and the state of the art, the paper explains the choice of using OntoLex-Lemon and the new FrAC module for encoding the attestations and quantitative data from the terminology extraction. After that, the Talmudic terminology base is introduced and an example entry with the above-mentioned data is shown. The scheme is motivated not only by the rich representation the model allows, but also by the future management of the link between text and lexical entries.
+ 2024.ldl-1.13
+ sciolette-2024-modeling
+
+
+ OntoLex Publication Made Easy: A Dataset of Verbal Aspectual Pairs for Bosnian, Croatian and Serbian
+ RankaStanković
+ MaximIonov
+ MedinaBajtarević
+ LorenaNinčević
+ 108–114
+ This paper introduces a novel language resource for retrieving and researching verbal aspectual pairs in BCS (Bosnian, Croatian, and Serbian) created using Linguistic Linked Open Data (LLOD) principles. As there is no resource to help learners of Bosnian, Croatian, and Serbian as foreign languages to recognize the aspect of a verb or its pairs, we have created a new resource that will provide users with information about the aspect, as well as the link to a verb’s aspectual counterparts. This resource also contains external links to monolingual dictionaries, Wordnet, and BabelNet. As this is a work in progress, our resource only includes verbs and their perfective pairs formed with prefixes “pro”, “od”, “ot”, “iz”, “is” and “na”. The goal of this project is to have a complete dataset of all the aspectual pairs in these three languages. We believe it will be useful for research in the field of aspectology, as well as machine translation and other NLP tasks. Using this resource as an example, we also propose a sustainable approach to publishing small to moderate LLOD resources on the Web, both in a user-friendly way and according to the Linked Data principles.
+ 2024.ldl-1.14
+ stankovic-etal-2024-ontolex
+
+
+ Towards Semantic Interoperability: Parallel Corpora as Linked Data Incorporating Named Entity Linking
+ RankaStanković
+ MilicaIkonić Nešić
+ OljaPerisic
+ MihailoŠkorić
+ OliveraKitanović
+ 115–125
+ The paper presents the results of research related to the preparation of parallel corpora, focusing on their transformation into RDF graphs using the NLP Interchange Format (NIF) for linguistic annotation. We give an overview of the parallel corpus used in this case study, as well as the process of POS tagging, lemmatization, named entity recognition (NER), and named entity linking (NEL), which is implemented using Wikidata. In the first phase of NEL, the main characters and places mentioned in the novels are stored in Wikidata; in the second phase, they are linked with the occurrences of previously annotated entities in the text. Next, we describe the data conversion to RDF and the incorporation of NIF annotations. The produced NIF files were evaluated by exploring the triplestore using SPARQL queries. Finally, the bridging of Linked Data and Digital Humanities research is discussed, as well as some drawbacks related to the verbosity of the transformation. The concept of semantic interoperability, in the context of linked data and parallel corpora, ensures that data exchanged between systems carries shared and well-defined meanings, enabling effective communication and understanding.
+ 2024.ldl-1.15
+ stankovic-etal-2024-towards
+
+
+
diff --git a/data/xml/2024.legal.xml b/data/xml/2024.legal.xml
new file mode 100644
index 0000000000..abc142a4b8
--- /dev/null
+++ b/data/xml/2024.legal.xml
@@ -0,0 +1,128 @@
+
+
+
+
+ Proceedings of the Workshop on Legal and Ethical Issues in Human Language Technologies @ LREC-COLING 2024
+ IngoSiegert
+ KhalidChoukri
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.legal-1
+ legal
+ ws
+
+
+ 2024.legal-1.0
+ legal-2024-legal
+
+
+ Compliance by Design Methodologies in the Legal Governance Schemes of European Data Spaces
+ KossayTalmoudi
+ KhalidChoukri
+ IsabelleGavanon
+ 1–5
+ Creating novel ways of sharing data to boost the digital economy has been one of the growing priorities of the European Union. In order to realise a set of data-sharing modalities, the European Union funds several projects that aim to put in place Common Data Spaces. These infrastructures are set to be a catalyser for the data economy. However, many hurdles face their implementation. Legal compliance is still one of the major ambiguities of European Common Data Spaces and many initiatives intend to proactively integrate legal compliance schemes in the architecture of sectoral Data Spaces. The various initiatives must navigate a complex web of cross-cutting legal frameworks, including contract law, data protection, intellectual property, protection of trade secrets, competition law, European sovereignty, and cybersecurity obligations. As the conceptualisation of Data Spaces evolves and shows signs of differentiation from one sector to another, it is important to showcase the legal repercussions of the options of centralisation and decentralisation that can be observed in different Data Spaces. This paper will thus delve into their legal requirements and attempt to sketch out a stepping stone for understanding legal governance in data spaces.
+ 2024.legal-1.1
+ talmoudi-etal-2024-compliance
+
+
+ A Legal Framework for Natural Language Model Training in Portugal
+ RubenAlmeida
+ EvelinAmorim
+ 6–12
+ Recent advances in deep learning have promoted the advent of many computational systems capable of performing intelligent actions that, until then, were restricted to the human intellect. In the particular case of human languages, these advances allowed the introduction of applications like ChatGPT that are capable of generating coherent text without being explicitly programmed to do so. Instead, these models use large volumes of textual data to learn meaningful representations of human languages. Associated with these advances, concerns about copyright and data privacy infringements caused by these applications have emerged. Despite these concerns, the pace at which new natural language processing applications continued to be developed largely outpaced the introduction of new regulations. Today, communication barriers between legal experts and computer scientists motivate many unintentional legal infringements during the development of such applications. In this paper, a multidisciplinary team intends to bridge this communication gap and promote more compliant Portuguese NLP research by presenting a series of everyday NLP use cases, while highlighting the Portuguese legislation that may apply during their development.
+ 2024.legal-1.2
+ almeida-amorim-2024-legal
+
+
+ Intellectual property rights at the training, development and generation stages of Large Language Models
+ ChristinKirchhübel
+ GeorginaBrown
+ 13–18
+ Large Language Models (LLMs) prompt new questions around Intellectual Property (IP): what is the IP status of the datasets used to train LLMs, the resulting LLMs themselves, and their outputs? The training needs of LLMs may be at odds with current copyright law, and there are active conversations around the ownership of their outputs. A report published by the House of Lords Committee following its inquiry into LLMs and generative AI criticises, among other things, the lack of government guidance, and stresses the need for clarity (through legislation, where appropriate) in this sphere. This paper considers the little guidance and caselaw there is involving AI more broadly to allow us to anticipate legal cases and arguments involving LLMs. Given the pre-emptive nature of this paper, it is not possible to provide comprehensive answers to these questions, but we hope to equip language technology communities with a more informed understanding of the current position with respect to UK copyright and patent law.
+ 2024.legal-1.3
+ kirchhubel-brown-2024-intellectual
+
+
+ Ethical Issues in Language Resources and Language Technology – New Challenges, New Perspectives
+ PawelKamocki
+ AndreasWitt
+ 19–23
+ This article elaborates on the authors’ contribution to the previous edition of the LREC conference, in which they proposed a tentative taxonomy of ethical issues that affect Language Resources (LRs) and Language Technology (LT) at the various stages of their lifecycle (conception, creation, use and evaluation). The proposed taxonomy was built around the following ethical principles: Privacy, Property, Equality, Transparency and Freedom. In this article, the authors would like to: 1) examine whether and how this taxonomy stood the test of time, in light of the recent developments in the legal framework and popularisation of Large Language Models (LLMs); 2) provide some details and a tentative checklist on how the taxonomy can be applied in practice; and 3) develop the taxonomy by adding new principles (Accountability; Risk Anticipation and Limitation; Reliability and Limited Confidence), to address the technological developments in LLMs and the upcoming Artificial Intelligence Act.
+ 2024.legal-1.4
+ kamocki-witt-2024-ethical
+
+
+ Legal and Ethical Considerations that Hinder the Use of LLMs in a Finnish Institution of Higher Education
+ MikaHämäläinen
+ 24–27
+ Large language models (LLMs) make it possible to solve many business problems easier than ever before. However, embracing LLMs in an organization may be slowed down due to ethical and legal considerations. In this paper, we will describe some of these issues we have faced at our university while developing university-level NLP tools to empower teaching and study planning. The identified issues touch upon topics such as GDPR, copyright, user account management and fear towards the new technology.
+ 2024.legal-1.5
+ hamalainen-2024-legal
+
+
+ Implications of Regulations on Large Generative AI Models in the Super-Election Year and the Impact on Disinformation
+ VeraSchmitt
+ JakobTesch
+ EvaLopez
+ TimPolzehl
+ AljoschaBurchardt
+ KonstanzeNeumann
+ SalarMohtaj
+ SebastianMöller
+ 28–38
+ With the rise of Large Generative AI Models (LGAIMs), disinformation online has become more concerning than ever before. Within the super-election year 2024, mis- and disinformation can severely influence public opinion. To combat the increasing amount of disinformation online, humans need to be supported by AI-based tools to increase the effectiveness of detecting false content. This paper examines the critical intersection of the AI Act with the deployment of LGAIMs for disinformation detection and the implications from the research, deployer, and user perspectives. The utilization of LGAIMs for disinformation detection falls under the high-risk category defined in the AI Act, leading to several obligations that need to be followed after the enforcement of the AI Act. Among others, the obligations include risk management, transparency, and human oversight, which pose the challenge of finding adequate technical interpretations. Furthermore, the paper articulates the necessity for clear guidelines and standards that enable the effective, ethical, and legally compliant use of AI. The paper contributes to the discourse on balancing technological advancement with ethical and legal imperatives, advocating for a collaborative approach to utilizing LGAIMs in safeguarding information integrity and fostering trust in digital ecosystems.
+ 2024.legal-1.6
+ schmitt-etal-2024-implications
+
+
+ Selling Personal Information: Data Brokers and the Limits of US Regulation
+ DeniseDiPersio
+ 39–46
+ A principal pillar of the US Blueprint for an AI Bill of Rights is data privacy, specifically, that individuals should be protected from abusive practices by data collectors and data aggregators, and that users should have control over how their personal information is collected and used. An area that spotlights the need for such protections is found in the common practices of data brokers who scrape, purchase, process and reassemble personal information in bulk and sell it for a variety of downstream uses. Such activities almost always occur in the absence of users’ knowledge or meaningful consent, yet they are legal under US law. This paper examines how data brokers operate, provides some examples of recent US regulatory actions taken against them, summarizes federal efforts to redress data broker practices and concludes that as long as there continues to be no comprehensive federal data protection and privacy scheme, efforts to control such behavior will have only a limited effect. This paper also addresses the limits of informed consent on the use of personal information in language resources and suggests a solution in a holistic approach to data protection and privacy across the data/development life cycle.
+ 2024.legal-1.7
+ dipersio-2024-selling
+
+
+ What Can I Do with this Data Point? Towards Modeling Legal and Ethical Aspects of Linguistic Data Collection and (Re-)use
+ AnnettJorschick
+ Paul T.Schrader
+ HendrikBuschmeier
+ 47–51
+ Linguistic data often inherits characteristics that limit open science practices such as data publication, sharing, and reuse. Part of the problem is researchers’ uncertainty about the legal requirements, which need to be considered at the beginning of study planning, when consent forms for participants, ethics applications, and data management plans need to be written. This paper presents a newly funded project that will develop a research data management infrastructure that will provide automated support to researchers in the planning, collection, storage, use, reuse, and sharing of data, taking into account ethical and legal aspects to encourage open science practices.
+ 2024.legal-1.8
+ jorschick-etal-2024-data
+
+
+ Data-Envelopes for Cultural Heritage: Going beyond Datasheets
+ MrinaliniLuthra
+ MariaEskevich
+ 52–65
+ Cultural heritage data is a rich source of information about historical and cultural development. When used with due understanding of its intrinsic complexity, it can both support research in the social sciences and humanities and serve as input for machine learning and artificial intelligence algorithms. In all cases, ethical and contextual considerations can be encouraged when the relevant information is provided to potential users in a clear and well-structured form before they begin to interact with the data. The proposed data-envelopes, based on existing documentation frameworks, address the particular needs and challenges of the cultural heritage field while combining machine-readability and user-friendliness. We develop and test the usability of data-envelopes on data from the Huygens Institute for History and Culture of the Netherlands. This paper presents the following contributions: i) we highlight the complexity of CH data, featuring the unique ethical and contextual considerations it entails; ii) we evaluate and compare existing dataset documentation frameworks, examining their suitability for CH datasets; iii) we introduce the “data-envelope”, a machine-readable adaptation of existing dataset documentation frameworks, to tackle the specificities of CH datasets. Its modular form is designed to serve not only the needs of machine learning (ML), but also, and especially, broader user groups ranging from humanities scholars and governmental monitoring authorities to citizen scientists and the general public. Importantly, the data-envelope framework emphasises the legal and ethical dimensions of dataset documentation, facilitating compliance with evolving data protection regulations and enhancing the accountability of data stewardship in the cultural heritage sector. We discuss, and invite readers to join, further conversation on ethical considerations and on how different audiences should be informed about the importance of dataset documentation management and context.
+ 2024.legal-1.9
+ eskevich-luthra-2024-data
+
+
+ Emotional Toll and Coping Strategies: Navigating the Effects of Annotating Hate Speech Data
+ Maryam M.AlEmadi
+ WajdiZaghouani
+ 66–72
+ Freedom of speech on online social media platforms often comes with the cost of hate speech production. Hate speech can be very harmful to the peace and development of societies, as it brings about conflict and encourages crime. To regulate hate speech content, moderators and annotators are employed. In our research, we look at the effects of prolonged exposure to hate speech on the mental and physical health of these annotators, as well as of researchers whose work revolves around the topic of hate speech. Through a methodology of analyzing the literature, we found that prolonged exposure to hate speech does mentally and physically impact annotators and researchers in this field. We also propose solutions to reduce these negative impacts, such as providing mental health services, fair labor practices, psychological assessments and interventions, as well as developing AI to assist in the process of hate speech detection.
+ 2024.legal-1.10
+ alemadi-zaghouani-2024-emotional
+
+
+ User Perspective on Anonymity in Voice Assistants – A comparison between Germany and Finland
+ IngoSiegert
+ SilasRech
+ TomBäckström
+ MatthiasHaase
+ 73–78
+ This study investigates the growing importance of voice assistants, particularly focusing on their usage patterns and associated user characteristics, trust perceptions, and concerns about data security. While previous research has identified correlations between the use of voice assistants and trust in these technologies, as well as data security concerns, little evidence exists regarding the relationship between individual user traits and perceived trust and security concerns. The study design involves surveying various user attributes, including technical proficiency, personality traits, and experience with digital technologies, alongside attitudes toward and usage of voice assistants. A comparison between Germany and Finland is conducted to explore potential cultural differences. The findings aim to inform strategies for enhancing voice assistant acceptance, including the implementation of anonymization methods.
+ 2024.legal-1.11
+ siegert-etal-2024-user
+
+
+
diff --git a/data/xml/2024.lrec.xml b/data/xml/2024.lrec.xml
index f2455dbb33..10bead03e9 100644
--- a/data/xml/2024.lrec.xml
+++ b/data/xml/2024.lrec.xml
@@ -10,7 +10,7 @@
SakrianiSakti
NianwenXue
ELRA and ICCL
- Torino, Italy
+ Torino, Italia
May
2024
2024.lrec-main
@@ -110,6 +110,7 @@
A Canonical Form for Flexible Multiword Expressions
JanOdijk
+ MartinKroon
91–101
This paper proposes a canonical form for Multiword Expressions (MWEs), in particular for the Dutch language. The canonical form can be enriched with all kinds of annotations that can be used to describe the properties of the MWE and its components. It also introduces the DUCAME (DUtch CAnonical Multiword Expressions) lexical resource with more than 11k MWEs in canonical form. DUCAME is used in MWE-Finder to automatically generate queries for searching for flexible MWEs in large text corpora.
2024.lrec-main.8
@@ -194,7 +195,7 @@
A Comparative Analysis of Word-Level Metric Differential Privacy: Benchmarking the Privacy-Utility Trade-off
- Stephen JosephMeisenbacher
+ StephenMeisenbacher
NihildevNandakumar
AlexandraKlymenko
FlorianMatthes
@@ -1023,8 +1024,8 @@
Analyzing Large Language Models’ Capability in Location Prediction
ZhaominXiao
- EduardoBlanco
YanHuang
+ EduardoBlanco
951–958
In this paper, we investigate and evaluate large language models’ capability in location prediction. We present experimental results with four models—FLAN-T5, FLAN-UL2, FLAN-Alpaca, and ChatGPT—in various instruction finetuning and exemplar settings. We analyze whether taking into account the context—tweets published before and after the tweet mentioning a location—is beneficial. Additionally, we conduct an ablation study to explore whether instruction modification is beneficial. Lastly, our qualitative analysis sheds light on the errors made by the best-performing model.
2024.lrec-main.85
@@ -2536,8 +2537,10 @@
SandraKübler
2397–2409
Neural parsing is very dependent on the underlying language model. However, very little is known about how choices in the language model affect parsing performance, especially in multi-task learning. We investigate questions on how the choice of subwords affects parsing, how subword sharing is responsible for gains or negative transfer in a multi-task setting where each task is parsing of a specific domain of the same language. More specifically, we investigate these issues across four languages: English, German, Italian, and Turkish. We find a general preference for averaged or last subwords across languages and domains. However, specific POS tags may require different subwords, and the distributional overlap between subwords across domains is perhaps a more influential factor in determining positive or negative transfer than discrepancies in the data sizes.
- 2024.lrec-main.215
+ 2024.lrec-main.215
dakota-kubler-2024-bits-pieces
+
+ Table 5 error corrected (9.5 -> 79.50).
BiVert: Bidirectional Vocabulary Evaluation Using Relations for Machine Translation
@@ -4106,7 +4109,7 @@
ContrastWSD: Enhancing Metaphor Detection with Word Sense Disambiguation Following the Metaphor Identification Procedure
- Mohamad MZElzohbi
+ MohamadElzohbi
RichardZhao
3907–3915
This paper presents ContrastWSD, a RoBERTa-based metaphor detection model that integrates the Metaphor Identification Procedure (MIP) and Word Sense Disambiguation (WSD) to extract and contrast the contextual meaning with the basic meaning of a word to determine whether it is used metaphorically in a sentence. By utilizing the word senses derived from a WSD model, our model enhances the metaphor detection process and outperforms other methods that rely solely on contextual embeddings or integrate only the basic definitions and other external knowledge. We evaluate our approach on various benchmark datasets and compare it with strong baselines, indicating the effectiveness in advancing metaphor detection.
@@ -4149,8 +4152,10 @@
NaoakiOkazaki
3955–3961
In Grammatical Error Correction (GEC), it is crucial to ensure the user’s comprehension of a reason for correction. Existing studies present tokens, examples, and hints for corrections, but do not directly explain the reasons in natural language. Although methods that use Large Language Models (LLMs) to provide direct explanations in natural language have been proposed for various tasks, no such method exists for GEC. Generating explanations for GEC corrections involves aligning input and output tokens, identifying correction points, and presenting corresponding explanations consistently. However, it is not straightforward to specify a complex format to generate explanations, because explicit control of generation is difficult with prompts. This study introduces a method called controlled generation with Prompt Insertion (PI) so that LLMs can explain the reasons for corrections in natural language. In PI, LLMs first correct the input text, and then we automatically extract the correction points based on the rules. The extracted correction points are sequentially inserted into the LLM’s explanation output as prompts, guiding the LLMs to generate explanations for the correction points. We also create an Explainable GEC (XGEC) dataset of correction reasons by annotating NUCLE, CoNLL2013, and CoNLL2014. Although generations from GPT-3.5 and ChatGPT using original prompts miss some correction points, the generation control using PI can explicitly guide to describe explanations for all correction points, contributing to improved performance in generating correction reasons.
- 2024.lrec-main.350
+ 2024.lrec-main.350
kaneko-okazaki-2024-controlled-generation
+
+ The original paper is anonymized.
ControversialQA: Exploring Controversy in Question Answering
@@ -4766,8 +4771,10 @@
YangLiu
4530–4541
Recent work has made a preliminary attempt to use large language models (LLMs) to solve the stance detection task, showing promising results. However, considering that stance detection usually requires detailed background knowledge, the vanilla reasoning method may neglect the domain knowledge to make a professional and accurate analysis. Thus, there is still room for improvement of LLMs reasoning, especially in leveraging the generation capability of LLMs to simulate specific experts (i.e., multi-agents) to detect the stance. In this paper, different from existing multi-agent works that require detailed descriptions and use fixed experts, we propose a Dynamic Experienced Expert Modeling (DEEM) method which can leverage the generated experienced experts and let LLMs reason in a semi-parametric way, making the experts more generalizable and reliable. Experimental results demonstrate that DEEM consistently achieves the best results on three standard benchmarks, outperforms methods with self-consistency reasoning, and reduces the bias of LLMs.
- 2024.lrec-main.405
+ 2024.lrec-main.405
wang-etal-2024-deem-dynamic
+
+ Typo correction.
Deep Learning Based Named Entity Recognition Models for Recipes
@@ -6377,8 +6384,10 @@
GuoqiongLiao
6091–6109
+ Large language models (LLMs) with prompting have achieved encouraging results on many natural language processing (NLP) tasks based on task-tailored promptings. Text-to-SQL is a critical task that generates SQL queries from natural language questions. However, prompting LLMs has not shown superior performance on the Text-to-SQL task due to the absence of tailored promptings. In this work, we propose three promptings specifically designed for Text-to-SQL: SL-prompt, CC-prompt, and SL+CC prompt. SL-prompt is designed to guide LLMs to identify relevant tables; CC-prompt directs LLMs to generate SQL clause by clause; and SL+CC prompt is proposed to combine the strengths of the above promptings. The three prompting strategies make three solutions for Text-to-SQL. Then, another prompting strategy, the RS-prompt, is proposed to direct LLMs to select the best answer from the results of the solutions. We conducted extensive experiments, and experimental results show that our method achieved an execution accuracy of 86.2% and a test-suite accuracy of 76.9%, which is 1.1% and 2.7% higher than the current state-of-the-art Text-to-SQL methods, respectively. The results confirmed that the proposed promptings enhanced the capabilities of LLMs on Text-to-SQL. Experimental results also show that the granularity of schema linking and the order of clause generation have a great impact on performance, which has received little consideration in previous research.
- 2024.lrec-main.539
+ 2024.lrec-main.539
tan-etal-2024-enhancing-text
+
+ Deanonymize and add Acknowledgements.
Enhancing Translation Ability of Large Language Models by Leveraging Task-Related Layers
@@ -7987,6 +7996,7 @@
GENTRAC: A Tool for Tracing Trauma in Genocide and Mass Atrocity Court Transcripts
MiriamSchirmer
ChristianBrechenmacher
+ EndritJashari
JuergenPfeffer
7666–7671
This paper introduces GENTRAC, an open-access web-based tool built to interactively detect and analyze potentially traumatic content in witness statements of genocide and mass atrocity trials. Harnessing recent developments in natural language processing (NLP) to detect trauma, GENTRAC processes and formats court transcripts for NLP analysis through a sophisticated parsing algorithm and detects the likelihood of traumatic content for each speaker segment. The tool visualizes the density of such content throughout a trial day and provides statistics on the overall amount of traumatic content and speaker distribution. Capable of processing transcripts from four prominent international criminal courts, including the International Criminal Court (ICC), GENTRAC’s reach is vast, tailored to handle millions of pages of documents from past and future trials. Detecting potentially re-traumatizing examination methods can enhance the development of trauma-informed legal procedures. GENTRAC also serves as a reliable resource for legal, human rights, and other professionals, aiding their comprehension of mass atrocities’ emotional toll on survivors.
@@ -10028,8 +10038,10 @@
PekkaMarttinen
9787–9798
Adverse drug events (ADEs) are an important aspect of drug safety. Various texts such as biomedical literature, drug reviews, and user posts on social media and medical forums contain a wealth of information about ADEs. Recent studies have applied word embedding and deep learning-based natural language processing to automate ADE detection from text. However, they did not explore incorporating explicit medical knowledge about drugs and adverse reactions or the corresponding feature learning. This paper adopts the heterogeneous text graph, which describes relationships between documents, words, and concepts, augments it with medical knowledge from the Unified Medical Language System, and proposes a concept-aware attention mechanism that learns features differently for the different types of nodes in the graph. We further utilize contextualized embeddings from pretrained language models and convolutional graph neural networks for effective feature representation and relational learning. Experiments on four public datasets show that our model performs competitively to the recent advances, and the concept-aware attention consistently outperforms other attention mechanisms.
- 2024.lrec-main.855
+ 2024.lrec-main.855
gao-etal-2024-knowledge-augmented
+
+ Minor update.
Knowledge-aware Attention Network for Medication Effectiveness Prediction
@@ -10817,8 +10829,10 @@
EmilyDrummond
10530–10538
We introduce LinguaMeta, a unified resource for language metadata for thousands of languages, including language codes, names, number of speakers, writing systems, countries, official status, coordinates, and language varieties. The resources are drawn from various existing repositories and supplemented with our own research. Each data point is tagged for its origin, allowing us to easily trace back to and improve existing resources with more up-to-date and complete metadata. The resource is intended for use by researchers and organizations who aim to extend technology to thousands of languages.
- 2024.lrec-main.921
+ 2024.lrec-main.921
ritchie-etal-2024-linguameta-unified
+
+ Fix URL to point directly to LinguaMeta directory and update section on language varieties.
Linguistic Knowledge Can Enhance Encoder-Decoder Models (If You Let It)
@@ -13040,7 +13054,7 @@
PDAMeta: Meta-Learning Framework with Progressive Data Augmentation for Few-Shot Text Classification
XuruiLi
KaisongSong
- TianqianjingLin
+ TianqianjinLin
YangyangKang
FubangZhao
ChanglongSun
@@ -14726,8 +14740,10 @@
JuriOpitz
14400–14406
The Area Under Curve measure (AUC) seems apt to evaluate and compare diverse models, possibly without calibration. An important example of AUC application is the evaluation and benchmarking of models that predict faithfulness of generated text. But we show that the AUC yields an academic and optimistic notion of accuracy that can misalign with the actual accuracy observed in application, yielding significant changes in benchmark rankings. To paint a more realistic picture of downstream model performance (and prepare it for actual application), we explore different calibration modes, testing calibration data and method.
- 2024.lrec-main.1255
+ 2024.lrec-main.1255
opitz-2024-schroedingers-threshold
+
+ Minor update.
SciDMT: A Large-Scale Corpus for Detecting Scientific Mentions
@@ -15126,7 +15142,7 @@
SignBLEU: Automatic Evaluation of Multi-channel Sign Language Translation
Jung-HoKim
- Mathew JohnHuerta-Enochian
+ MathewHuerta-Enochian
ChangyongKo
Du HuiLee
14796–14811
@@ -15901,7 +15917,7 @@
Target-Adaptive Consistency Enhanced Prompt-Tuning for Multi-Domain Stance Detection
- ShangkangWang
+ ShaokangWang
LiPan
15585–15594
+ Stance detection is a fundamental task in Natural Language Processing (NLP). It is challenging due to diverse expressions and topics related to the targets from multiple domains. Recently, prompt-tuning has been introduced to convert the original task into a cloze-style prediction task, achieving impressive results. Many prompt-tuning-based methods focus on one or two classic scenarios with concrete external knowledge enhancement. However, when facing intricate information in multi-domain stance detection, these methods cannot adapt to multi-domain semantics. In this paper, we propose a novel target-adaptive consistency enhanced prompt-tuning method (TCP) for stance detection with multiple domains. TCP incorporates target knowledge and prior knowledge to construct target-adaptive verbalizers for diverse domains and employs pilot experiments distillation to enhance the consistency between verbalizers and model training. Specifically, to capture the knowledge from multiple domains, TCP uses a target-adaptive candidate mining strategy to obtain the domain-related candidates. Then, TCP refines them with prior attributes to ensure prediction consistency. The Pre-trained Language Models (PLMs) used in prompt-tuning have large-scale parameters, while merely changing the verbalizer without corresponding tuning has a limited impact on the training process. Target-aware pilot experiments are conducted to enhance the consistency between the verbalizer and training by distilling the target-adaptive knowledge into prompt-tuning. Extensive experiments and ablation studies demonstrate that TCP outperforms the state-of-the-art methods on nine stance detection datasets from multiple domains.
@@ -16313,6 +16329,7 @@
DejanStosic
SašaMarjanović
DelphineBernhard
+ XavierBach
MyriamBras
LaurentKevers
StellaRetali-Medori
@@ -16368,7 +16385,7 @@
LisaBeinborn
AntskeFokkens
16066–16078
- Post-hoc explanation methods for transformer models tend to disagree with one another. Agreement is generally measured for a small subset of most important tokens. However, the presence of disagreement is often overlooked and the reasons for disagreement insufficiently examined, causing these methods to be utilised without adequate care. In this work, we explain disagreement from a linguistic perspective. We find that different methods systematically select different token types. Additionally, similar methods display similar linguistic preferences, which consequently affect agreement. By estimating the subsets of *k* most important tokens dynamically over sentences, we find that methods better agree on the syntactic span level. Especially the methods that agree the least with other methods benefit most from this dynamic subset estimation. We methodically explore the different settings of the dynamic *k* approach: we observe that its combination with spans yields favourable results in capturing important signals in the sentence, and propose an improved setting of global token importance.
+ Post-hoc explanation methods are an important tool for increasing model transparency for users. Unfortunately, the currently used methods for attributing token importance often yield diverging patterns. In this work, we study potential sources of disagreement across methods from a linguistic perspective. We find that different methods systematically select different classes of words and that methods that agree most with other methods and with humans display similar linguistic preferences. Token-level differences between methods are smoothed out if we compare them on the syntactic span level. We also find higher agreement across methods by estimating the most important spans dynamically instead of relying on a fixed subset of size k. We systematically investigate the interaction between k and spans and propose an improved configuration for selecting important tokens.
2024.lrec-main.1397
kamp-etal-2024-role-syntactic
@@ -17126,6 +17143,8 @@
Triple-R: Automatic Reasoning for Fact Verification Using Language Models
MohammadaminKanaani
+ SajjadDadkhah
+ Ali A.Ghorbani
16831–16840
The rise of online social media platforms has made them a popular source of news. However, they are also prone to misinformation and fake news. To combat this, fact-checking is essential to verify the accuracy of claims made on these platforms. However, the existing methods in this field often lack the use of external sources and human-understandable explanations for system decisions. In this paper, we introduce a framework called Triple-R (Retriever, Ranker, Reasoner) that addresses these challenges. The framework uses the Web as an external knowledge source to retrieve relevant evidence for claims and includes a method to generate reasons based on the retrieved evidence for datasets lacking explanations. We then use this modified dataset to fine-tune a causal language model that generates natural language explanations and labels for pairs of retrieved evidence and claims. Our approach aims to improve the transparency and interpretability of fact-checking systems by providing understandable explanations for decision-making processes. We evaluated our method on a popular dataset and demonstrated its performance through an ablation study. The modified dataset is available on the Canadian Institute for Cybersecurity datasets webpage at https://www.unb.ca/cic/datasets/index.html.
2024.lrec-main.1463
@@ -17212,12 +17231,12 @@
xie-etal-2024-typos-correction
- UCxn: Typologically-Informed Annotation of Constructions Atop Universal Dependencies
+ UCxn: Typologically Informed Annotation of Constructions Atop Universal Dependencies
LeonieWeissweiler
NinaBöbel
KirianGuiller
SantiagoHerrera
- Wesley SamuelScivetti
+ WesleyScivetti
ArthurLorenzi
NuritMelnik
ArchnaBhatia
@@ -18219,7 +18238,7 @@
zhu-etal-2024-zero-shot
-
+
Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024): Tutorial Summaries
RomanKlinger
@@ -18230,13 +18249,12 @@
Torino, Italia
May
2024
- 2024.lrec-tutorials
+ 2024.lrec-tutorials
lrec
- coling
- 2024.lrec-tutorials.0
- lrec-2024-tutorials
+ 2024.lrec-tutorials.0
+ lrec-2024-2024-joint
From Multimodal LLM to Human-level AI: Modality, Instruction, Reasoning, Efficiency and beyond
@@ -18248,7 +18266,7 @@
Tat-SengChua
1–8
Artificial intelligence (AI) encompasses knowledge acquisition and real-world grounding across various modalities. As a multidisciplinary research field, multimodal large language models (MLLMs) have recently garnered growing interest in both academia and industry, showing an unprecedented trend to achieve human-level AI via MLLMs. These large models offer an effective vehicle for understanding, reasoning, and planning by integrating and modeling diverse information modalities, including language, visual, auditory, and sensory data. This tutorial aims to deliver a comprehensive review of cutting-edge research in MLLMs, focusing on four key areas: MLLM architecture design, instructional learning, multimodal reasoning, and the efficiency of MLLMs. We will explore technical advancements, synthesize key challenges, and discuss potential avenues for future research.
- 2024.lrec-tutorials.1
+ 2024.lrec-tutorials.1
fei-etal-2024-multimodal
@@ -18257,7 +18275,7 @@
RidaQadri
9–12
Training and evaluation of language models increasingly rely on semi-structured data that is annotated by humans, along with techniques such as RLHF growing in usage across the board. As a result, both the data and the human perspectives involved in this process play a key role in what is taken as ground truth by our models. As annotation tasks are becoming increasingly subjective and culturally complex, it is unclear how much of their socio-cultural identity annotators use to respond to tasks. We also currently do not have ways to integrate rich and diverse community perspectives into our language technologies. Accounting for such cross-cultural differences in interacting with technology is an increasingly crucial step for evaluating AI harms holistically. Without this, the state of the art of the AI models being deployed is at risk of causing unprecedented biases at a global scale. In this tutorial, we will take an interactive approach by utilizing different types of annotation tasks to investigate together how our different socio-cultural perspectives and lived experiences influence what we consider appropriate representations of global concepts.
- 2024.lrec-tutorials.2
+ 2024.lrec-tutorials.2
dev-qadri-2024-geo
@@ -18270,7 +18288,7 @@
NianwenXue
13–18
This tutorial reviews the design of common meaning representations, SoTA models for predicting meaning representations, and the applications of meaning representations in a wide range of downstream NLP tasks and real-world applications. Presented by a diverse team of NLP researchers from academia and industry with extensive experience in designing, building and using meaning representations, our tutorial has three components: (1) an introduction to common meaning representations, including basic concepts and design challenges; (2) a review of SoTA methods on building models for meaning representations; and (3) an overview of applications of meaning representations in downstream NLP tasks and real-world applications. We propose a cutting-edge, full-day tutorial for all stakeholders in the AI community, including NLP researchers, domain-specific practitioners, and students.
- 2024.lrec-tutorials.3
+ 2024.lrec-tutorials.3
bonn-etal-2024-meaning
@@ -18282,7 +18300,7 @@
GabrielStanovsky
19–25
General-Purpose Language Models have changed the world of Natural Language Processing, if not the world itself. The evaluation of such versatile models, while supposedly similar to evaluation of generation models before them, in fact presents a host of new evaluation challenges and opportunities. In this Tutorial, we will start from the building blocks of evaluation. The tutorial welcomes people from diverse backgrounds and assumes little familiarity with metrics, datasets, prompts and benchmarks. It will lay the foundations and explain the basics and their importance, while touching on the major points and breakthroughs of the recent era of evaluation. It will also compare traditional evaluation methods – which are still widely used – to newly developed methods. We will contrast new to old approaches, from evaluating on many-task benchmarks rather than on dedicated datasets to efficiency constraints, and from testing stability and prompts on in-context learning to using the models themselves as evaluation metrics. Finally, the tutorial will cover practical issues, ranging from reviewing widely-used benchmarks and prompt banks to efficient evaluation.
- 2024.lrec-tutorials.4
+ 2024.lrec-tutorials.4
choshen-etal-2024-navigating
@@ -18293,7 +18311,7 @@
HenningWachsmuth
26–32
Computational argumentation is an interdisciplinary research field, connecting Natural Language Processing (NLP) to other disciplines such as the social sciences. The focus of recent research has concentrated on argument quality assessment: what makes an argument good or bad? We present a tutorial which is an updated edition of the EACL 2023 tutorial presented by the same authors. As in the previous version, the tutorial will have a strong interdisciplinary and interactive nature, and will be structured along three main coordinates: (1) the notions of argument quality (AQ) across disciplines (how do we recognize good and bad arguments?), with a particular focus on the interface between Argument Mining (AM) and Deliberation Theory; (2) the modeling of subjectivity (who argues to whom; what are their beliefs?); and (3) the generation of improved arguments (what makes an argument better?). The tutorial will also touch upon a series of topics that are particularly relevant for the LREC-COLING audience (the issue of resource quality for the assessment of AQ; the interdisciplinary application of AM and AQ in a text-as-data approach to Political Science), in line with the developments in NLP (LLMs for AQ assessment), and relevant for the societal applications of AQ assessment (bias and debiasing). We will involve the participants in two annotation studies on the assessment and the improvement of quality.
- 2024.lrec-tutorials.5
+ 2024.lrec-tutorials.5
lapesa-etal-2024-mining
@@ -18303,7 +18321,7 @@
ShuminDeng
33–41
Even with their impressive abilities, Large Language Models (LLMs) such as ChatGPT are not immune to issues of factual accuracy or logical consistency. Concretely, the key concern is how to seamlessly update those LLMs to correct mistakes without resorting to an exhaustive retraining or continuous training procedure, both of which can demand significant computational resources and time. Thus, the capability to edit LLMs offers an efficient solution to alter a model’s behavior, notably within a distinct area of interest, without negatively impacting its performance on other tasks. Through this tutorial, we strive to acquaint interested NLP researchers with recent and emerging techniques for editing LLMs. Specifically, we aim to present a systematic and current overview of cutting-edge methods, supplemented with practical tools, and unveil new research opportunities for our audiences. All the valuable resources can be accessed at https://github.com/zjunlp/KnowledgeEditingPapers.
- 2024.lrec-tutorials.6
+ 2024.lrec-tutorials.6
zhang-etal-2024-knowledge
@@ -18311,7 +18329,7 @@
MilanDojchinovski
42–44
This tutorial introduces DBpedia Databus (https://databus.dbpedia.org), a FAIR data publishing platform, to address challenges faced by data producers and consumers. It covers data organization, publishing, and consumption on the DBpedia Databus, with an exclusive focus on Linguistic Knowledge Graphs. The tutorial offers practical insights for knowledge graph stakeholders, aiding data integration and accessibility in the Linked Open Data community. Designed for a diverse audience, it fosters hands-on learning to familiarize participants with the DBpedia Databus technology.
- 2024.lrec-tutorials.7
+ 2024.lrec-tutorials.7
dojchinovski-2024-dbpedia
@@ -18320,7 +18338,7 @@
SaberAkhondi
45–49
In this half-day tutorial we will be giving an introductory overview of a number of recent applications of natural language processing to a relatively underrepresented application domain: chemistry. Specifically, we will see how neural language models (transformers) can be applied (oftentimes with near-human performance) to chemical text mining, reaction extraction, or more importantly computational chemistry (forward and backward synthesis of chemical compounds). At the same time, a number of gold standards for experimentation have been made available to the research community, academic and otherwise. Theoretical results will be, whenever possible, supported by system demonstrations in the form of Jupyter notebooks. This tutorial targets an audience interested in bioinformatics and biomedical applications, but pre-supposes no advanced knowledge of either.
- 2024.lrec-tutorials.8
+ 2024.lrec-tutorials.8
thorne-akhondi-2024-nlp
@@ -18330,7 +18348,7 @@
AndréFreitas
50–55
Text embeddings provide a concise representation of the semantics of sentences and larger spans of text, rather than individual words, capturing a wide range of linguistic features. They have found increasing application to a variety of NLP tasks, including machine translation and natural language inference. While most recent breakthroughs in task performance are being achieved by large scale distributional models, there is a growing disconnection between their knowledge representation and traditional semantics, which hinders efforts to capture such knowledge in human interpretable form or explain model inference behaviour. In this tutorial, we examine research on the analysis and control of text representations, from the basics to the cutting edge, aiming to shorten the gap between deep latent semantics and formal symbolics. This includes considerations on knowledge formalisation, the linguistic information that can be extracted and measured from distributional models, and intervention techniques that enable explainable reasoning and controllable text generation, covering methods from pooling to LLM-based approaches.
- 2024.lrec-tutorials.9
+ 2024.lrec-tutorials.9
silva-de-carvalho-etal-2024-formal
@@ -18341,7 +18359,7 @@
TomHope
56–67
Due to the rapid growth of publications varying in quality, there exists a pressing need to help scientists digest and evaluate relevant papers, thereby facilitating scientific discovery. This creates a number of urgent questions; however, computer-human collaboration in the scientific paper lifecycle is still in the exploratory stage and lacks a unified framework for analyzing the relevant tasks. Additionally, with the recent significant success of large language models (LLMs), they have increasingly played an important role in academic writing. In this cutting-edge tutorial, we aim to provide an all-encompassing overview of the paper lifecycle, detailing how machines can augment every stage of the research process for the scientist, including scientific literature understanding, experiment development, manuscript draft writing, and finally draft evaluation. This tutorial is devised for researchers interested in this rapidly-developing field of NLP-augmented paper writing. The tutorial will also feature a session of hands-on exercises during which participants can guide machines in generating ideas and automatically composing key paper elements. Furthermore, we will address current challenges, explore future directions, and discuss potential ethical issues. A toolkit designed for human-computer collaboration throughout the paper lifecycle will also be made publicly available.
- 2024.lrec-tutorials.10
+ 2024.lrec-tutorials.10
wang-etal-2024-towards
@@ -18352,7 +18370,7 @@
AmitavaDas
68–72
In the fast-paced domain of Large Language Models (LLMs), the issue of hallucination is a prominent challenge. Despite continuous endeavors to address this concern, it remains a highly active area of research within the LLM landscape. Grasping the intricacies of this problem can be daunting, especially for those new to the field. This tutorial aims to bridge this knowledge gap by introducing the emerging realm of hallucination in LLMs. It will comprehensively explore the key aspects of hallucination, including benchmarking, detection, and mitigation techniques. Furthermore, we will delve into the specific constraints and shortcomings of current approaches, providing valuable insights to guide future research efforts for participants.
- 2024.lrec-tutorials.11
+ 2024.lrec-tutorials.11
rawte-etal-2024-tutorial
@@ -18365,7 +18383,7 @@
PushpakBhattacharyya
73–79
In the landscape of natural language processing (NLP), addressing the challenges of bias and hallucination is paramount to ensuring the ethical and unbiased development of Large Language Models (LLMs). This tutorial delves into the intricate dimensions of LLMs, shedding light on the critical importance of understanding and mitigating the profound impacts of bias and hallucination. Divided into two parts, the first part delves deep into the complexity of bias propagation in LLM development, where we dissect its origins and far-reaching impacts. We then present innovative methodologies for mitigating diverse forms of bias, including dynamic word embeddings and robust benchmarking strategies. The second part of the tutorial discusses hallucination - a prevalent issue in generative AI systems such as LLMs. Through advanced data-driven techniques, we decode its intricate effects and complexities, followed by factually-driven mitigation strategies. Furthermore, we shed light on the pivotal role of human cognitive behavior in the context of hallucination, drawing insights from cognitive data, including human eye-tracking data. Ultimately, this cutting-edge tutorial serves as a guiding light, equipping participants with indispensable tools and insights to navigate the ethical complexities of LLMs, thus paving the way for the development of unbiased and ethically robust NLP systems.
- 2024.lrec-tutorials.12
+ 2024.lrec-tutorials.12
sahoo-etal-2024-addressing
@@ -18376,7 +18394,7 @@
AsifEkbal
80–87
This tutorial provides an in-depth exploration of Knowledge-enhanced Dialogue Systems (KEDS), diving into their foundational aspects, methodologies, advantages, and practical applications. Topics include the distinction between internal and external knowledge integration, diverse methodologies employed in grounding dialogues, and innovative approaches to leveraging knowledge graphs for enhanced conversation quality. Furthermore, the tutorial touches upon the rise of biomedical text mining, the advent of domain-specific language models, and the challenges and strategies specific to medical dialogue generation. The primary objective is to give attendees a comprehensive understanding of KEDS. By delineating the nuances of these systems, the tutorial aims to elucidate their significance, highlight advancements made using deep learning, and pinpoint the current challenges. Special emphasis is placed on showcasing how KEDS can be fine-tuned for domain-specific requirements, with a spotlight on the healthcare sector. The tutorial is crafted for both beginners and intermediate researchers in the dialogue systems domain, with a focus on those keen on advancing research in KEDS. It will also be valuable for practitioners in sectors like healthcare, seeking to integrate advanced dialogue systems.
- 2024.lrec-tutorials.13
+ 2024.lrec-tutorials.13
priya-etal-2024-knowledge
@@ -18389,5 +18407,43 @@
https://lrec-coling-2024.org
+
+ 2024.bucc-1
+ 2024.cawl-1
+ 2024.cl4health-1
+ 2024.cogalex-1
+ 2024.determit-1
+ 2024.delite-1
+ 2024.dlnld-1
+ 2024.dmr-1
+ 2024.ecnlp-1
+ 2024.eurali-1
+ 2024.finnlp-1
+ 2024.games-1
+ 2024.htres-1
+ 2024.humeval-1
+ 2024.isa-1
+ 2024.ldl-1
+ 2024.legal-1
+ 2024.lt4hala-1
+ 2024.mathnlp-1
+ 2024.mwe-1
+ 2024.neusymbridge-1
+ 2024.nlperspectives-1
+ 2024.osact-1
+ 2024.parlaclarin-1
+ 2024.politicalnlp-1
+ 2024.rail-1
+ 2024.rapid-1
+ 2024.readi-1
+ 2024.rfp-1
+ 2024.safety4convai-1
+ 2024.sigul-1
+ 2024.signlang-1
+ 2024.tdle-1
+ 2024.trac-1
+ 2024.unlp-1
+ 2024.wildre-1
+
diff --git a/data/xml/2024.lt4hala.xml b/data/xml/2024.lt4hala.xml
new file mode 100644
index 0000000000..c3b1ab3921
--- /dev/null
+++ b/data/xml/2024.lt4hala.xml
@@ -0,0 +1,351 @@
+
+
+
+
+ Proceedings of the Third Workshop on Language Technologies for Historical and Ancient Languages (LT4HALA) @ LREC-COLING-2024
+ RacheleSprugnoli
+ MarcoPassarotti
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.lt4hala-1
+ lt4hala
+ ws
+
+
+ 2024.lt4hala-1.0
+ lt4hala-2024-language
+
+
+ Goidelex: A Lexical Resource for Old Irish
+ CormacAnderson
+ SachaBeniamine
+ TheodorusFransen
+ 1–10
+ We introduce Goidelex, a new lexical database resource for Old Irish. Goidelex is an openly accessible relational database in CSV format, linked by formal relationships. The launch version documents 695 headwords with extensive linguistic annotations, including orthographic forms using a normalised orthography, automatically generated phonemic transcriptions, and information about morphosyntactic features, such as gender, inflectional class, etc. Metadata in JSON format, following the Frictionless standard, provides detailed descriptions of the tables and dataset. The database is designed to be fully compatible with the Paralex and CLDF standards and is interoperable with existing lexical resources for Old Irish such as CorPH and eDIL. It is suited to both qualitative and quantitative investigation into Old Irish morphology and lexicon, as well as to comparative research. This paper outlines the creation process, rationale, and resulting structure of the database.
+ 2024.lt4hala-1.1
+ anderson-etal-2024-goidelex
+
+
+ Developing a Part-of-speech Tagger for Diplomatically Edited Old Irish Text
+ AdrianDoyle
+ John P.McCrae
+ 11–21
+ POS-tagging is typically considered a fundamental text preprocessing task, with a variety of downstream NLP tasks and techniques being dependent on the availability of POS-tagged corpora. As such, POS-taggers are important precursors to further NLP tasks, and their accuracy can impact the potential accuracy of these dependent tasks. While a variety of POS-tagging methods have been developed which work well with modern languages, historical languages present orthographic and editorial challenges which require special attention. The effectiveness of POS-taggers developed for modern languages is reduced when applied to Old Irish, with its comparatively complex orthography and morphology. This paper examines some of the obstacles to POS-tagging Old Irish text, and shows that inconsistencies between extant annotated corpora reduce the quantity of data available for use in training POS-taggers. The development of a multi-layer neural network model for POS-tagging Old Irish text is described, and an experiment is detailed which demonstrates that this model outperforms a variety of off-the-shelf POS-taggers. Moreover, this model sets a new benchmark for POS-tagging diplomatically edited Old Irish text.
+ 2024.lt4hala-1.2
+ doyle-mccrae-2024-developing
+
+
+ From YCOE to UD: Rule-based Root Identification in Old English
+ LucaBrigada Villa
+ MartinaGiarda
+ 22–29
+ In this paper we apply a set of rules to identify the root of a dependency tree, following the Universal Dependencies formalism and starting from the constituency annotation of the York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE). This rule-based root-identification task represents the first step towards a rule-based automatic conversion of this valuable resource into the UD format. After presenting Old English and the annotated resources available for this language, we describe the different rules we applied and then we discuss the results and the errors.
+ 2024.lt4hala-1.3
+ brigada-villa-giarda-2024-ycoe
+
+
+ Too Young to NER: Improving Entity Recognition on Dutch Historical Documents
+ VeraProvatorova
+ Mariekevan Erp
+ EvangelosKanoulas
+ 30–35
Named entity recognition (NER) on historical texts is beneficial for the field of digital humanities, as it allows users to easily search for the names of people, places and other entities in digitised archives. While the task of historical NER in different languages has been gaining popularity in recent years, Dutch historical NER remains an underexplored topic. Using a recently released historical dataset from the Dutch Language Institute, we train three BERT-based models and analyse the errors to identify the main challenges. All three models outperform a contemporary multilingual baseline by a large margin on historical test data.
+ 2024.lt4hala-1.4
+ provatorova-etal-2024-young
+
+
+ Towards Named-Entity and Coreference Annotation of the Hebrew Bible
+ Daniel G.Swanson
+ Bryce D.Bussert
+ FrancisTyers
+ 36–40
Named-entity annotation refers to the process of specifying what real-world (or, at least, external-to-the-text) entities various names and descriptions within a text refer to. Coreference annotation, meanwhile, specifies what context-dependent words or phrases, such as pronouns, refer to. This paper describes an ongoing project to apply both of these to the Hebrew Bible, so far covering most of the book of Genesis, fully marking every person, place, object, and point in time which occurs in the text. The annotation process and possible future uses for the data are covered, along with the challenges involved in applying existing annotation guidelines to the Hebrew text.
+ 2024.lt4hala-1.5
+ swanson-etal-2024-towards
+
+
+ LiMe: A Latin Corpus of Late Medieval Criminal Sentences
+ Alessanda Clara CarmelaBassani
+ Beatrice Giovanna MariaDel Bo
+ AlfioFerrara
+ Marta LuiginaMangini
+ SergioPicascia
+ AmbraStefanello
+ 41–49
The Latin language has received attention from the computational linguistics research community, which has built, over the years, several valuable resources, ranging from detailed annotated corpora to sophisticated tools for linguistic analysis. With the recent advent of large language models, researchers have also started developing models capable of generating vector representations of Latin texts. The performance of such models remains behind that of models for modern languages, given the disparity in available data. In this paper, we present the LiMe dataset, a corpus of 325 documents extracted from a series of medieval manuscripts called Libri sententiarum potestatis Mediolani, and thoroughly annotated by experts, in order to be employed for masked language modelling, as well as for supervised natural language processing tasks.
+ 2024.lt4hala-1.6
+ bassani-etal-2024-lime
+
+
+ The Rise and Fall of Dependency Parsing in Dante Alighieri’s Divine Comedy
+ ClaudiaCorbetta
+ MarcoPassarotti
+ GiovanniMoretti
+ 50–56
In this paper, we conduct parsing experiments on Dante Alighieri’s Divine Comedy, an Old Italian poem composed between 1306 and 1321 and organized into three Cantiche: Inferno, Purgatorio, and Paradiso. We perform parsing on subsets of the poem using both a Modern Italian training set and sections of the Divine Comedy itself to evaluate under which scenarios parsers achieve higher scores. We find that employing in-domain training data supports better results, leading to an increase of approximately +17% in Unlabeled Attachment Score (UAS) and +25-30% in Labeled Attachment Score (LAS). Subsequently, we provide brief commentary on the differences in scores achieved among subsections of the Cantiche, and we conduct experimental parsing on a text from the same period and style as the Divine Comedy.
+ 2024.lt4hala-1.7
+ corbetta-etal-2024-rise
+
+
+ Unsupervised Authorship Attribution for Medieval Latin Using Transformer-Based Embeddings
+ LoicDe Langhe
+ OrpheeDe Clercq
+ VeroniqueHoste
+ 57–64
We explore the potential of employing transformer-based embeddings in an unsupervised authorship attribution task for medieval Latin. The development of Large Language Models (LLMs) and recent advances in transfer learning alleviate many of the traditional issues associated with authorship attribution in lower-resourced (ancient) languages. Despite this, these methods remain heavily understudied within this domain. Concretely, we generate strong contextual embeddings using a variety of mono- and multilingual transformer models and use these as input for two unsupervised clustering methods: a standard agglomerative clustering algorithm and a self-organizing map. We show that these transformer-based embeddings can be used to generate high-quality and interpretable clusterings, resulting in an attractive alternative to the traditional feature-based methods.
+ 2024.lt4hala-1.8
+ de-langhe-etal-2024-unsupervised
+
+
+ “To Have the ‘Million’ Readers Yet”: Building a Digitally Enhanced Edition of the Bilingual Irish-English Newspaper an Gaodhal (1881-1898)
+ OksanaDereza
+ DeirdreNí Chonghaile
+ NicholasWolf
+ 65–78
+ This paper introduces the ‘An Gaodhal’ project, which aims to serve the historically under-resourced and endangered language of Irish (known as Gaeilge) by providing new digital tools and resources. The initial goal of the project was the extraction of full text of ‘An Gaodhal’, a monthly bilingual Irish-English newspaper produced from 1881 to 1898, to the highest possible degree of accuracy via Optical Character Recognition (OCR), with a view to making its printed content searchable. The methodology applied toward achieving this goal yielded additional digital outputs including: 1. a new OCR model for the Irish language as printed in Cló Gaelach type; 2. a new OCR model for bilingual Irish-English content printed in Cló Gaelach and Roman types respectively; 3. a BART-based OCR post-correction model for historical bilingual Irish-English data; 4. a historical Irish training set for Named Entity Recognition (NER). All but the first of these four additional outputs appear to be the first of their kind. Each of the project outputs, including the full-text OCR outputs in ALTO XML format, is set for public release to enable open-access research. The paper also identifies the challenges historical Irish data poses to Natural Language Processing (NLP) in general and OCR in particular, and reports on project results and outputs to date. Finally, it contextualises the project within the wider field of NLP and considers its potential impact on under-resourced languages worldwide.
+ 2024.lt4hala-1.9
+ dereza-etal-2024-million
+
+
+ Introducing PaVeDa – Pavia Verbs Database: Valency Patterns and Pattern Comparison in Ancient Indo-European Languages
+ SilviaLuraghi
+ AlessioPalmero Aprosio
+ ChiaraZanchi
+ MartinaGiuliani
+ 79–88
The paper introduces [DATASET], a resource that builds on the ValPaL database of verbs’ valency patterns and alternations by adding a number of ancient languages (completely absent from ValPaL) and a number of new features that enable direct comparison, both diachronic and synchronic. For each verb, ValPaL contains the basic frame and ideally all possible valency alternations allowed by the verb (e.g. passive, causative, reflexive etc.). In order to enable comparison among alternations, an additional level has been added, the alternation class, which overcomes the issue of comparing language-specific alternations added by individual contributors of ValPaL. ValPaL had typological comparison as its main aim, and data collection was variously carried out using questionnaires, secondary sources, and contributors largely drawing on native speakers’ intuition. Working with ancient languages entails a methodological change, as the data is extracted from corpora. This has led to re-thinking the notion of valency as a usage-based feature of verbs and to planning the future addition of corpus data for modern languages in the database. It further shows the impact of ancient languages on theoretical reflection.
+ 2024.lt4hala-1.10
+ luraghi-etal-2024-introducing
+
+
+ Development of Robust NER Models and Named Entity Tagsets for Ancient Greek
+ ChiaraPalladino
+ TariqYousef
+ 89–97
+ This contribution presents a novel approach to the development and evaluation of transformer-based models for Named Entity Recognition and Classification in Ancient Greek texts. We trained two models with annotated datasets by consolidating potentially ambiguous entity types under a harmonized set of classes. Then, we tested their performance with out-of-domain texts, reproducing a real-world use case. Both models performed very well under these conditions, with the multilingual model being slightly superior to the monolingual one. In the conclusion, we emphasize current limitations due to the scarcity of high-quality annotated corpora and to the lack of cohesive annotation strategies for ancient languages.
+ 2024.lt4hala-1.11
+ palladino-yousef-2024-development
+
+
+ Analysis of Glyph and Writing System Similarities Using Siamese Neural Networks
+ ClaireRoman
+ PhilippeMeyer
+ 98–104
+ In this paper we use Siamese neural networks to compare glyphs and writing systems. These deep learning models define distance-like functions and are used to explore and visualize the space of scripts by performing multidimensional scaling and clustering analyses. From 51 historical European, Mediterranean and Middle Eastern alphabets, we use Ward-linkage hierarchical clustering and obtain 10 clusters of scripts, including three isolated writing systems. To collect the glyph database we use the Noto family of fonts, which encode the Unicode character repertoire in a standard form. This approach has the potential to reveal connections among scripts and civilizations and to aid in the decipherment of ancient scripts.
+ 2024.lt4hala-1.12
+ roman-meyer-2024-analysis
+
+
+ How to Annotate Emotions in Historical Italian Novels: A Case Study on I Promessi Sposi
+ RacheleSprugnoli
+ AriannaRedaelli
+ 105–115
+ This paper describes the annotation of a chapter taken from I Promessi Sposi, the most famous Italian novel of the 19th century, written by Alessandro Manzoni, following 3 emotion classifications. The aim of this methodological paper is to understand: i) how the annotation procedure changes depending on the granularity of the classification, ii) how the different granularities impact the inter-annotator agreement, iii) which granularity allows good coverage of emotions, iv) whether the chosen classifications are missing emotions that are important for historical literary texts. The opinion of non-experts is integrated in the present study through an online questionnaire. In addition, preliminary experiments are carried out using the new dataset as a test set to evaluate the performance of different approaches for emotion polarity detection and emotion classification respectively. Annotated data are released both as an aggregated gold standard and with non-aggregated labels (that is, labels before reconciliation between annotators) so as to align with the perspectivist approach, which is an established practice in the Humanities and, more recently, also in NLP.
+ 2024.lt4hala-1.13
+ sprugnoli-redaelli-2024-annotate
+
+
+ Leveraging LLMs for Post-OCR Correction of Historical Newspapers
+ AlanThomas
+ RobertGaizauskas
+ HaipingLu
+ 116–121
+ Poor OCR quality continues to be a major obstacle for humanities scholars seeking to make use of digitised primary sources such as historical newspapers. Typical approaches to post-OCR correction employ sequence-to-sequence models for a neural machine translation task, mapping erroneous OCR texts to accurate reference texts. We shift our focus towards the adaptation of generative LLMs for a prompt-based approach. By instruction-tuning Llama 2 and comparing it to a fine-tuned BART on BLN600, a parallel corpus of 19th century British newspaper articles, we demonstrate the potential of a prompt-based approach in detecting and correcting OCR errors, even with limited training data. We achieve a significant enhancement in OCR quality with Llama 2 outperforming BART, achieving a 54.51% reduction in the character error rate against BART’s 23.30%. This paves the way for future work leveraging generative LLMs to improve the accessibility and unlock the full potential of historical texts for humanities research.
+ 2024.lt4hala-1.14
+ thomas-etal-2024-leveraging
+
+
+ LLM-based Machine Translation and Summarization for Latin
+ MartinVolk
+ Dominic PhilippFischer
+ LukasFischer
+ PatriciaScheurer
+ Phillip BenjaminStröbel
+ 122–128
+ This paper presents an evaluation of machine translation for Latin. We tested multilingual Large Language Models, in particular GPT-4, on letters from the 16th century that are in Latin and Early New High German. Our experiments include translation and cross-language summarization for the two historical languages into modern English and German. We show that LLM-based translation for Latin is clearly superior to previous approaches. We also show that LLM-based paraphrasing of Latin paragraphs from the historical letters produces English and German summaries that are close to human summaries published in the edition.
+ 2024.lt4hala-1.15
+ volk-etal-2024-llm
+
+
+ Exploring Aspect-Based Sentiment Analysis Methodologies for Literary-Historical Research Purposes
+ TessDejaeghere
+ PranaydeepSingh
+ ElsLefever
+ JulieBirkholz
+ 129–143
+ This study explores aspect-based sentiment analysis (ABSA) methodologies for literary-historical research, aiming to address the limitations of traditional sentiment analysis in understanding nuanced aspects of literature. It evaluates three ABSA toolchains: rule-based, machine learning-based (utilizing BERT and MacBERTh embeddings), and a prompt-based workflow with Mixtral 8x7B. Findings highlight challenges and potentials of ABSA for literary-historical analysis, emphasizing the need for context-aware annotation strategies and technical skills. The research contributes by curating a multilingual corpus of travelogues, publishing an annotated dataset for ABSA, creating openly available Jupyter Notebooks with Python code for each modeling approach, conducting pilot experiments on literary-historical texts, and proposing future endeavors to advance ABSA methodologies in this domain.
+ 2024.lt4hala-1.16
+ dejaeghere-etal-2024-exploring
+
+
+ Early Modern Dutch Comedies and Farces in the Spotlight: Introducing EmDComF and Its Emotion Framework
+ FlorianDebaene
+ Korneevan der Haven
+ VeroniqueHoste
+ 144–155
+ As computational drama studies are developing rapidly, the Dutch dramatic tradition still needs to be centralised before it can benefit from state-of-the-art methodologies. This paper presents and evaluates EmDComF, a historical corpus of both manually curated and automatically digitised early modern Dutch comedies and farces authored between 1650 and 1725, and describes the refinement of a historically motivated annotation framework exploring sentiment and emotions in these two dramatic subgenres. Originating from Lodewijk Meyer’s philosophical writings on passions in the dramatic genre (±1670), published in Naauwkeurig onderwys in de tooneel-poëzy (Thorough instruction in the Poetics of Drama) by the literary society Nil Volentibus Arduum in 1765, a historical and genre-specific emotion framework is tested and operationalised for annotating emotions in the domain of early modern Dutch comedies and farces. Based on a frequency and cluster analysis of 782 sentences annotated by 2 expert annotators, the initial 38 emotion labels were restructured into a hierarchical label set of the 5 emotions Hatred, Anxiety, Sadness, Joy and Desire.
+ 2024.lt4hala-1.17
+ debaene-etal-2024-early
+
+
+ When Hieroglyphs Meet Technology: A Linguistic Journey through Ancient Egypt Using Natural Language Processing
+ RicardoMuñoz Sánchez
+ 156–169
+ Knowing our past can help us better understand our future. The explosive development of NLP in these past few decades has allowed us to study ancient languages and cultures in ways that we couldn’t have done in the past. However, not all languages have received the same level of attention. Despite its popularity in pop culture, the languages spoken in Ancient Egypt have been somewhat overlooked in terms of NLP research. In this paper we give an overview of how NLP has been used to study different variations of the Ancient Egyptian languages. This not only includes Old, Middle, and Late Egyptian but also Demotic and Coptic. We begin our survey paper by giving a short introduction to these languages and their writing systems, before talking about the corpora and lexical resources that are available digitally. We then show the different NLP tasks that have been tackled for different variations of Ancient Egyptian, as well as the approaches that have been used. We hope that our work can stoke interest in the study of these languages within the NLP community.
+ 2024.lt4hala-1.18
+ munoz-sanchez-2024-hieroglyphs
+
+
+ Towards a Readability Formula for Latin
+ ThomasLaurs
+ 170–175
+ This research focuses on the development of a readability formula for Latin texts, a much-needed tool to assess the difficulty of Latin texts in educational settings. This study takes a comprehensive approach, exploring more than 100 linguistic variables, including lexical, morphological, syntactical, and discourse-related factors, to capture the multifaceted nature of text difficulty. The study incorporates a corpus of Latin texts that were assessed for difficulty, and their evaluations were used to establish the basis for the model. The research utilizes natural language processing tools to derive linguistic predictors, resulting in a multiple linear regression model that explains about 70% of the variance in text difficulty. While the model’s precision can be enhanced by adding further variables and a larger corpus, it already provides valuable insights into the readability of Latin texts and offers the opportunity to examine how different text genres and contents influence text accessibility. Additionally, the formula’s focus on objective text difficulty paves the way for future research on personal predictors, particularly in educational contexts.
+ 2024.lt4hala-1.19
+ laurs-2024-towards
+
+
+ Automatic Normalisation of Middle French and Its Impact on Productivity
+ RaphaelRubino
+ SandraCoram-Mekkey
+ JohannaGerlach
+ Jonathan DavidMutal
+ PierretteBouillon
+ 176–189
+ This paper presents a study on automatic normalisation of 16th century documents written in Middle French. These documents present a large variety of wordforms which require spelling normalisation to facilitate downstream linguistic and historical studies. We frame the normalisation process as a machine translation task starting with a strong baseline leveraging a pre-trained encoder–decoder model. We propose to improve this baseline by combining synthetic data generation methods and producing artificial training data, thus tackling the lack of parallel corpora relevant to our task. The evaluation of our approach is twofold, in addition to automatic metrics relying on gold references, we evaluate our models through post-editing of their outputs. This evaluation method directly measures the productivity gain brought by our models to experts conducting the normalisation task manually. Results show a 20+ token per minute increase in productivity when using automatic normalisation compared to normalising text from scratch. The manually post-edited dataset resulting from our study is the first parallel corpus of normalised 16th century Middle French to be publicly released, along with the synthetic data and the automatic normalisation models used and trained in the presented work.
+ 2024.lt4hala-1.20
+ rubino-etal-2024-automatic
+
+
+ Overview of the EvaLatin 2024 Evaluation Campaign
+ RacheleSprugnoli
+ FedericaIurescia
+ MarcoPassarotti
+ 190–197
+ This paper describes the organization and the results of the third edition of EvaLatin, the campaign for the evaluation of Natural Language Processing tools for Latin. The two shared tasks proposed in EvaLatin 2024, i.e., Dependency Parsing and Emotion Polarity Detection, aim to foster research in the field of language technologies for Classical languages. The shared datasets are described, and the results obtained by the participants for each task are presented and discussed.
+ 2024.lt4hala-1.21
+ sprugnoli-etal-2024-overview
+
+
+ Behr at EvaLatin 2024: Latin Dependency Parsing Using Historical Sentence Embeddings
+ RufusBehr
+ 198–202
+ This paper describes the system used for my submission to EvaLatin’s shared dependency parsing task as part of the LT4HALA 2024 workshop. EvaLatin presented new Latin prose and poetry dependency test data from potentially different time periods, and imposed no restriction on training data or model selection for the task. This paper, therefore, sought to build a general Latin dependency parser that would perform accurately regardless of the Latin age to which the test data belongs. To train a general parser, all of the available Universal Dependencies treebanks were used, but in order to address the changes in the Latin language over time, this paper introduces historical sentence embeddings. A model was trained to encode sentences of the same Latin age into vectors of high cosine similarity, which are referred to as historical sentence embeddings. The system introduces these historical sentence embeddings into a biaffine dependency parser with the hope of enabling training across the Latin treebanks in a more efficacious manner, but their inclusion shows no improvement over the base model.
+ 2024.lt4hala-1.22
+ behr-2024-behr
+
+
+ KU Leuven / Brepols-CTLO at EvaLatin 2024: Span Extraction Approaches for Latin Dependency Parsing
+ WouterMercelis
+ 203–206
+ This report describes the KU Leuven / Brepols-CTLO submission to EvaLatin 2024. We present the results of two runs, both of which implement a span extraction approach. The first run implements span–span prediction, rooted in Machine Reading Comprehension, making use of LaBERTa, a RoBERTa model pretrained on Latin texts; this run produces meaningful results. The second, more experimental run operates on the token level with a span-extraction approach based on the Question Answering task. This run finetuned a DeBERTa model pretrained on Latin texts. The finetuning was set up as a multitask model, with classification heads for each token’s part-of-speech tag and dependency relation label, while a question answering head handled the dependency head predictions. Through the shared loss function, we tried to capture the link between part-of-speech tags, dependency relation labels, and dependency heads that follows human intuition. The second run did not perform well.
+ 2024.lt4hala-1.23
+ mercelis-2024-ku
+
+
+ ÚFAL LatinPipe at EvaLatin 2024: Morphosyntactic Analysis of Latin
+ MilanStraka
+ JanaStraková
+ FedericaGamba
+ 207–214
+ We present LatinPipe, the winning submission to the EvaLatin 2024 Dependency Parsing shared task. Our system consists of a fine-tuned concatenation of base and large pre-trained LMs, with a dot-product attention head for parsing and softmax classification heads for morphology to jointly learn both dependency parsing and morphological analysis. It is trained by sampling from seven publicly available Latin corpora, utilizing additional harmonization of annotations to achieve a more unified annotation style. Before fine-tuning, we train the system for a few initial epochs with frozen weights. We also add additional local relative contextualization by stacking the BiLSTM layers on top of the Transformer(s). Finally, we ensemble output probability distributions from seven randomly instantiated networks for the final submission. The code is available at https://github.com/ufal/evalatin2024-latinpipe.
+ 2024.lt4hala-1.24
+ straka-etal-2024-ufal
+
+
+ Nostra Domina at EvaLatin 2024: Improving Latin Polarity Detection through Data Augmentation
+ StephenBothwell
+ AbigailSwenor
+ DavidChiang
+ 215–222
+ This paper describes submissions from the team Nostra Domina to the EvaLatin 2024 shared task of emotion polarity detection. Given the low-resource environment of Latin and the complexity of sentiment in rhetorical genres like poetry, we augmented the available data through automatic polarity annotation. We present two methods for doing so on the basis of the k-means algorithm, and we employ a variety of Latin large language models (LLMs) in a neural architecture to better capture the underlying contextual sentiment representations. Our best approach achieved the second-highest macro-averaged F1 score on the shared task’s test set.
+ 2024.lt4hala-1.25
+ bothwell-etal-2024-nostra
+
+
+ TartuNLP at EvaLatin 2024: Emotion Polarity Detection
+ AlekseiDorkin
+ KairitSirts
+ 223–228
+ This technical report describes our submission to the EvaLatin 2024 shared task. We apply knowledge transfer techniques and two distinct approaches to data annotation: one based on heuristics and one based on LLMs.
+ 2024.lt4hala-1.26
+ dorkin-sirts-2024-tartunlp-evalatin
+
+
+ Overview of EvaHan2024: The First International Evaluation on Ancient Chinese Sentence Segmentation and Punctuation
+ BinLi
+ BolinChang
+ ZhixingXu
+ MinxuanFeng
+ ChaoXu
+ WeiguangQu
+ SiShen
+ DongboWang
+ 229–236
+ Ancient Chinese texts have no sentence boundaries or punctuation. Adding modern Chinese punctuation to these texts requires expertise, time and effort. Automatic sentence segmentation and punctuation is considered a basic task in Ancient Chinese processing, but there has been no shared task to evaluate the performance of different systems. This paper presents the results of the first ancient Chinese sentence segmentation and punctuation bakeoff, held at the Third Workshop on Language Technologies for Historical and Ancient Languages (LT4HALA) 2024. The contest uses metrics for detailed evaluation of 4 genres of unpublished texts with 11 punctuation types. Six teams submitted 32 runs. In the closed modality, where participants are only allowed to use the training data, the highest F1 scores obtained are 88.47% for sentence segmentation and 75.29% for sentence punctuation. Performance on the unseen data is about 10 percentage points lower than on the published common data, which means there is still room for improvement. The large language models outperform the traditional models, but LLMs change around 1–2% of the original characters due to over-generation, so post-processing is needed to keep the text consistent.
+ 2024.lt4hala-1.27
+ li-etal-2024-overview
+
+
+ Two Sequence Labeling Approaches to Sentence Segmentation and Punctuation Prediction for Classic Chinese Texts
+ XuebinWang
+ ZhenghuaLi
+ 237–241
+ This paper describes our system for the EvaHan2024 shared task. We design and experiment with two sequence labeling approaches, i.e., a one-stage and a two-stage approach. The one-stage approach directly predicts a label for each character, and the label may contain multiple punctuation marks. The two-stage approach divides punctuation marks into two classes, i.e., pause and non-pause, and handles them separately via two sequence labeling processes, where each label contains at most one punctuation mark. We use pre-trained SikuRoBERTa as a key component of the encoder and employ a conditional random field (CRF) layer on top. According to the evaluation metrics adopted by the organizers, the two-stage approach is superior to the one-stage approach, and our system achieved second place among all participating systems.
+ 2024.lt4hala-1.28
+ wang-li-2024-two
+
+
+ Ancient Chinese Sentence Segmentation and Punctuation on Xunzi LLM
+ ShituHuo
+ WenhuiChen
+ 242–245
+ This paper describes the system submitted for the EvaHan 2024 Task on ancient Chinese sentence segmentation and punctuation. Our study utilizes the Xunzi large language model as the base model to evaluate the overall performance and the performance by record type. The methodologies applied and the prompts utilized in our study have proven helpful and effective in aiding the evaluation of the model’s performance.
+ 2024.lt4hala-1.29
+ huo-chen-2024-ancient
+
+
+ Sentence Segmentation and Sentence Punctuation Based on XunziALLM
+ ZihongChen
+ 246–250
+ In ancient Chinese books, punctuation marks are typically absent from engraved texts. Sentence segmentation and punctuation heavily rely on the meticulous efforts of experts and scholars. Therefore, automatic punctuation and sentence segmentation play a very important role in promoting ancient books, as well as in the inheritance of Chinese culture. In this paper, we present a method for fine-tuning a large language model on this downstream task using the LoRA approach, leveraging the EvaHan2024 dataset. This method ensures robust output and high accuracy while inheriting knowledge from the large pre-trained language model Xunzi.
+ 2024.lt4hala-1.30
+ chen-2024-sentence
+
+
+ Sentence Segmentation and Punctuation for Ancient Books Based on Supervised In-context Training
+ ShiquanWang
+ WeiweiFu
+ MengxiangLi
+ ZhongjiangHe
+ YongxiangLi
+ RuiyuFang
+ LiGuan
+ ShuangyongSong
+ 251–255
+ This paper describes the participation of team “TeleAI” in the third International Ancient Chinese Language Information Processing Evaluation (EvaHan2024). The competition comprises a joint task of sentence segmentation and punctuation, categorized into open and closed tracks based on the models and data used. In the final evaluation, our system achieved significantly better results than the baseline. Specifically, in the closed-track sentence segmentation task, we obtained an F1 score of 0.8885, while in the sentence punctuation task, we achieved an F1 score of 0.7129.
+ 2024.lt4hala-1.31
+ wang-etal-2024-sentence
+
+
+ SPEADO: Segmentation and Punctuation for Ancient Chinese Texts via Example Augmentation and Decoding Optimization
+ TianXia
+ KaiYu
+ QianrongYu
+ XinranPeng
+ 256–260
+ The SPEADO model for sentence segmentation and punctuation tasks in ancient Chinese texts is proposed, which incorporates text chunking and MinHash indexing techniques to realise example augmentation. Additionally, decoding optimization strategies are introduced to direct the attention of the LLM towards punctuation errors and address the issue of uncontrollable output. Experimental results show that the F1 score of the proposed method exceeds the baseline model by 14.18%, indicating a significant improvement in performance.
+ 2024.lt4hala-1.32
+ xia-etal-2024-speado
+
+
+ Ancient Chinese Punctuation via In-Context Learning
+ JieHuang
+ 261–265
+ EvaHan2024 focuses on sentence punctuation in ancient Chinese. The Xunzi large language base model, which is specifically trained for ancient Chinese processing, is recommended in the campaign. In general, we adopted the in-context learning (ICL) paradigm for this task and designed a post-processing scheme to ensure that the final results are standardised. When constructing ICL prompts, we performed feature extraction via LLM question answering and selected demonstrations based on non-parametric metrics. We used Xunzi in two stages without further training, so the model remained generic and its other fundamental abilities were unaffected. Moreover, newly acquired training data can be directly utilized after identical feature extraction, showcasing the scalability of our system. As for the results, we achieved an F1-score of 67.7% on a complex test dataset consisting of multiple types of documents and 77.98% on the Zuozhuan data.
+ 2024.lt4hala-1.33
+ huang-2024-ancient
+
+
+
diff --git a/data/xml/2024.ltedi.xml b/data/xml/2024.ltedi.xml
index 49df4b42cd..ad2223820e 100644
--- a/data/xml/2024.ltedi.xml
+++ b/data/xml/2024.ltedi.xml
@@ -30,6 +30,7 @@
We introduce HATELEXICON, a lexicon of slurs and targets of hate speech for Brazil, Germany, India and Kenya, to aid model development and interpretability. First, we demonstrate how HATELEXICON can be used to interpret model predictions, showing that models developed to classify extreme speech rely heavily on target group names. Further, we propose a culturally-informed method to aid shot selection for training in low-resource settings. In few-shot learning, shot selection is of paramount importance to model performance and we need to ensure we make the most of available data. We work with HASOC German and Hindi data for training and the Multilingual HateCheck (MHC) benchmark for evaluation. We show that selecting shots based on our lexicon leads to models performing better than models trained on shots sampled randomly. Thus, when given only a few training examples, using HATELEXICON to select shots containing more sociocultural information leads to better few-shot performance. With these two use-cases we show how our HATELEXICON can be used for more effective hate speech detection.
2024.ltedi-1.1
maronikolakis-etal-2024-sociocultural
+
A Dataset for the Detection of Dehumanizing Language
@@ -40,6 +41,7 @@
Dehumanization is a mental process that enables the exclusion and ill treatment of a group of people. In this paper, we present two data sets of dehumanizing text, a large, automatically collected corpus and a smaller, manually annotated data set. Both data sets include a combination of political discourse and dialogue from movie subtitles. Our methods give us a broad and varied amount of dehumanization data to work with, enabling further exploratory analysis as well as automatic classification of dehumanization patterns. Both data sets will be publicly released.
2024.ltedi-1.2
engelmann-etal-2024-dataset
+
Beyond the Surface: Spurious Cues in Automatic Media Bias Detection
@@ -49,6 +51,7 @@
This study investigates the robustness and generalization of transformer-based models for automatic media bias detection. We explore the behavior of current bias classifiers by analyzing feature attributions and stress-testing with adversarial datasets. The findings reveal a disproportionate focus on rare but strongly connotated words, suggesting a rather superficial understanding of linguistic bias and challenges in contextual interpretation. This problem is further highlighted by inconsistent bias assessment when stress-tested with different entities and minorities. Enhancing automatic media bias detection models is critical to improving inclusivity in media, ensuring balanced and fair representation of diverse perspectives.
2024.ltedi-1.3
wessel-horych-2024-beyond
+
The Balancing Act: Unmasking and Alleviating ASR Biases in Portuguese
@@ -60,6 +63,7 @@
In the field of spoken language understanding, systems like Whisper and Multilingual Massive Speech (MMS) have shown state-of-the-art performances. This study is dedicated to a comprehensive exploration of the Whisper and MMS systems, with a focus on assessing biases in automatic speech recognition (ASR) inherent to casual conversation speech specific to the Portuguese language. Our investigation encompasses various categories, including gender, age, skin tone color, and geo-location. Alongside traditional ASR evaluation metrics such as Word Error Rate (WER), we have incorporated p-value statistical significance for gender bias analysis. Furthermore, we extensively examine the impact of data distribution and empirically show that oversampling techniques alleviate such stereotypical biases. This research represents a pioneering effort in quantifying biases in the Portuguese language context through the application of MMS and Whisper, contributing to a better understanding of ASR systems’ performance in multilingual settings.
2024.ltedi-1.4
kulkarni-etal-2024-balancing
+
Towards Content Accessibility Through Lexical Simplification for Maltese as a Low-Resource Language
@@ -70,6 +74,7 @@
Natural Language Processing techniques have been developed to assist in simplifying online content while preserving meaning. However, for low-resource languages, like Maltese, there are still numerous challenges and limitations. Lexical Simplification (LS) is a core technique typically adopted to improve content accessibility, and has been widely studied for high-resource languages such as English and French. Motivated by the need to improve access to Maltese content and the limitations in this context, this work set out to develop and evaluate an LS system for Maltese text. An LS pipeline was developed consisting of (1) potential complex word identification, (2) substitute generation, (3) substitute selection, and (4) substitute ranking. An evaluation data set was developed to assess the performance of each step. Results are encouraging and will lead to numerous future work. Finally, a single-blind study was carried out with over 200 participants, where the system’s perceived quality in text simplification was evaluated. Results suggest that meaning is retained about 50% of the time, and when meaning is retained, about 70% of system-generated sentences are either perceived as simpler or of equal simplicity to the original. Challenges remain, and this study proposes a number of areas that may benefit from further research.
2024.ltedi-1.5
meli-etal-2024-towards
+
Prompting Fairness: Learning Prompts for Debiasing Large Language Models
@@ -91,6 +96,7 @@
This study pioneers the use of synthetically generated data for training generative models in document-level text simplification of German texts. We demonstrate the effectiveness of our approach with real-world online texts. Addressing the challenge of data scarcity in language simplification, we crawled professionally simplified German texts and synthesized a corpus using GPT-4. We finetune Large Language Models with up to 13 billion parameters on this data and evaluate their performance. This paper employs various methodologies for evaluation and demonstrates the limitations of currently used rule-based metrics. Both automatic and manual evaluations reveal that our models can significantly simplify real-world online texts, indicating the potential of synthetic data in improving text simplification.
2024.ltedi-1.7
kloser-etal-2024-german
+
ChatGPT Based Data Augmentation for Improved Parameter-Efficient Debiasing of LLMs
@@ -104,6 +110,7 @@
Large Language models (LLMs), while powerful, exhibit harmful social biases. Debiasing is often challenging due to computational costs, data constraints, and potential degradation of multi-task language capabilities. This work introduces a novel approach utilizing ChatGPT to generate synthetic training data, aiming to enhance the debiasing of LLMs. We propose two strategies: Targeted Prompting, which provides effective debiasing for known biases but necessitates prior specification of bias in question; and General Prompting, which, while slightly less effective, offers debiasing across various categories. We leverage resource-efficient LLM debiasing using adapter tuning and compare the effectiveness of our synthetic data to existing debiasing datasets. Our results reveal that: (1) ChatGPT can efficiently produce high-quality training data for debiasing other LLMs; (2) data produced via our approach surpasses existing datasets in debiasing performance while also preserving internal knowledge of a pre-trained LLM; and (3) synthetic data exhibits generalizability across categories, effectively mitigating various biases, including intersectional ones. These findings underscore the potential of synthetic data in advancing the fairness of LLMs with minimal retraining cost.
2024.ltedi-1.8
han-etal-2024-chatgpt
+
DE-Lite - a New Corpus of Easy German: Compilation, Exploration, Analysis
@@ -114,6 +121,7 @@
In this paper, we report on a new corpus of simplified German. It is recently requested from public agencies in Germany to provide information in easy language on their outlets (e.g. websites) so as to facilitate participation in society for people with low-literacy levels related to learning difficulties or low language proficiency (e.g. L2 speakers). While various rule sets and guidelines for Easy German (a specific variant of simplified German) have emerged over time, it is unclear (a) to what extent authors and other content creators, including generative AI tools consistently apply them, and (b) how adequate texts in authentic Easy German really are for the intended audiences. As a first step in gaining insights into these issues and to further LT development for simplified German, we compiled DE-Lite, a corpus of easy-to-read texts including Easy German and comparable Standard German texts, by integrating existing collections and gathering new data from the web. We built n-gram models for an Easy German subcorpus of DE-Lite and comparable Standard German texts in order to identify typical features of Easy German. To this end, we use relative entropy (Kullback-Leibler Divergence), a standard technique for evaluating language models, which we apply here for corpus comparison. Our analysis reveals that some rules of Easy German are fairly dominant (e.g. punctuation) and that text genre has a strong effect on the distinctivity of the two language variants.
2024.ltedi-1.9
jablotschkin-etal-2024-de
+
A Diachronic Analysis of Gender-Neutral Language on wikiHow
@@ -123,6 +131,7 @@
As a large how-to website, wikiHow’s mission is to empower every person on the planet to learn how to do anything. An important part of including everyone also linguistically is the use of gender-neutral language. In this short paper, we study in how far articles from wikiHow fulfill this criterion based on manual annotation and automatic classification. In particular, we employ a classifier to analyze how the use of gender-neutral language has developed over time. Our results show that although about 75% of all articles on wikiHow were written in a gender-neutral way from the outset, revisions have a higher tendency to add gender-specific language than to change it to inclusive wording.
2024.ltedi-1.10
suhr-roth-2024-diachronic
+
Overview of Third Shared Task on Homophobia and Transphobia Detection in Social Media Comments
@@ -144,6 +153,7 @@
This paper provides a comprehensive summary of the “Homophobia and Transphobia Detection in Social Media Comments” shared task, which was held at LT-EDI@EACL 2024. The objective of this task was to develop systems capable of identifying instances of homophobia and transphobia within social media comments. This challenge was extended across ten languages: English, Tamil, Malayalam, Telugu, Kannada, Gujarati, Hindi, Marathi, Spanish, and Tulu. Each comment in the dataset was annotated into three categories. The shared task attracted significant interest, with over 60 teams participating through the CodaLab platform. The participants’ submitted predictions were evaluated using the macro F1 score.
2024.ltedi-1.11
chakravarthi-etal-2024-overview
+
Overview of the Third Shared Task on Speech Recognition for Vulnerable Individuals in Tamil
@@ -156,6 +166,7 @@
The overview of the shared task on speech recognition for vulnerable individuals in Tamil (LT-EDI-2024) is described in this paper. The task comes with a Tamil dataset that was gathered from elderly individuals who identify as male, female, or transgender. The audio samples were taken in public places such as marketplaces, vegetable shops, hospitals, etc. The dataset was made available during the training and testing phases. Participants were required to process the audio signals using various models and techniques and then submit their results as transcriptions of the provided test samples. The participants’ results were assessed using WER (Word Error Rate). Transformer-based approaches were employed by the participants to achieve automatic speech recognition. This overview paper discusses the findings and the various pre-trained transformer-based models that the participants employed.
2024.ltedi-1.12
b-etal-2024-overview
+
Overview of Shared Task on Multitask Meme Classification - Unraveling Misogynistic and Trolls in Online Memes
@@ -174,6 +185,7 @@
This paper offers a detailed overview of the first shared task on “Multitask Meme Classification - Unraveling Misogynistic and Trolls in Online Memes,” organized as part of the LT-EDI@EACL 2024 conference. The task was set to classify misogynistic content and troll memes within online platforms, focusing specifically on memes in Tamil and Malayalam languages. A total of 52 teams registered for the competition, with four submitting systems for the Tamil meme classification task and three for the Malayalam task. The outcomes of this shared task are significant, providing insights into the current state of misogynistic content in digital memes and highlighting the effectiveness of various computational approaches in identifying such detrimental content. The top-performing model achieved a macro F1 score of 0.73 in Tamil and 0.87 in Malayalam.
2024.ltedi-1.13
chakravarthi-etal-2024-overview-shared
+
Overview of Shared Task on Caste and Migration Hate Speech Detection
@@ -188,6 +200,7 @@
We present an overview of the first shared task on “Caste and Migration Hate Speech Detection,” organized as part of LT-EDI@EACL 2024. Systems had to delineate between binary outcomes, ascertaining whether a text is categorized as caste/migration hate speech or not. The dataset presented in this shared task is in Tamil, one of the under-resourced languages. A total of 51 teams participated in this task, of which 15 teams submitted their results. To the best of our knowledge, this is the first time a shared task has been conducted on textual hate speech detection concerning caste and migration. In this study, we conduct a systematic analysis and detailed presentation of all the participants’ contributions as well as the statistics of the dataset, which consists of social media comments in the Tamil language for hate speech detection. We further provide a comprehensive analysis of the participants’ methodologies and their findings.
2024.ltedi-1.14
rajiakodi-etal-2024-overview
+
Pinealai_StressIdent_LT-EDI@EACL2024: Minimal configurations for Stress Identification in Tamil and Telugu
@@ -198,6 +211,7 @@
This paper introduces an approach to stress identification in Tamil and Telugu, leveraging traditional machine learning models—Fasttext for Tamil and Naive Bayes for Telugu—yielding commendable results. The study highlights the scarcity of annotated data and recognizes limitations in phonetic features relevant to these languages, impacting precise information extraction. Our models achieved a macro F1 score of 0.77 for Tamil and 0.72 for Telugu with Fasttext and Naive Bayes, respectively. While the Telugu model secured the second rank in shared tasks, ongoing research is crucial to unlocking the full potential of stress identification in these languages, necessitating the exploration of additional features and advanced techniques specified in the discussions and limitations section.
2024.ltedi-1.15
alex-eponon-etal-2024-pinealai
+
byteLLM@LT-EDI-2024: Homophobia/Transphobia Detection in Social Media Comments - Custom Subword Tokenization with Subword2Vec and BiLSTM
@@ -207,6 +221,7 @@
This research focuses on Homophobia and Transphobia Detection in Dravidian languages, specifically Telugu, Kannada, Tamil, and Malayalam. Leveraging the Homophobia/Transphobia Detection dataset, we propose an innovative approach employing a custom-designed tokenizer with a Bidirectional Long Short-Term Memory (BiLSTM) architecture. Our distinctive contribution lies in a tokenizer that reduces model sizes to below 7MB, improving efficiency and addressing real-time deployment challenges. The BiLSTM implementation demonstrates significant enhancements in hate speech detection accuracy, effectively capturing linguistic nuances. Low-size models efficiently alleviate inference challenges, ensuring swift real-time detection and practical deployment. This work pioneers a framework for hate speech detection, providing insights into model size, inference speed, and real-time deployment challenges in combatting online hate speech within Dravidian languages.
2024.ltedi-1.16
manukonda-kodali-2024-bytellm
+
MasonTigers@LT-EDI-2024: An Ensemble Approach Towards Detecting Homophobia and Transphobia in Social Media Comments
@@ -218,6 +233,7 @@
In this paper, we describe our approaches and results for Task 2 of the LT-EDI 2024 Workshop, aimed at detecting homophobia and/or transphobia across ten languages. Our methodologies include monolingual transformers and ensemble methods, capitalizing on the strengths of each to enhance the performance of the models. The ensemble models worked well, placing our team, MasonTigers, in the top five for eight of the ten languages, as measured by the macro F1 score. Our work emphasizes the efficacy of ensemble methods in multilingual scenarios, addressing the complexities of language-specific tasks.
2024.ltedi-1.17
goswami-etal-2024-masontigers
+
JudithJeyafreeda_StressIdent_LT-EDI@EACL2024: GPT for stress identification
@@ -226,6 +242,7 @@
Stress detection from social media texts has proved to play an important role in mental health assessments. People tend to express their stress on social media more easily. Analysing and classifying these texts allows for improvements in the development of recommender systems and automated mental health assessments. In this paper, a GPT model is used for classification of social media texts into two classes - stressed and not-stressed. The texts used for classification are in two Dravidian languages - Tamil and Telugu. The results, although not very good, show a promising direction for research on using GPT models for classification.
2024.ltedi-1.18
andrew-2024-judithjeyafreeda
+
cantnlp@LT-EDI-2024: Automatic Detection of Anti-LGBTQ+ Hate Speech in Under-resourced Languages
@@ -235,6 +252,7 @@
This paper describes our homophobia/transphobia in social media comments detection system developed as part of the shared task at LT-EDI-2024. We took a transformer-based approach to develop our multiclass classification model for ten language conditions (English, Spanish, Gujarati, Hindi, Kannada, Malayalam, Marathi, Tamil, Tulu, and Telugu). We introduced synthetic and organic instances of script-switched language data during domain adaptation to mirror the linguistic realities of social media language as seen in the labelled training data. Our system ranked second for Gujarati and Telugu with varying levels of performance for other language conditions. The results suggest that incorporating elements of paralinguistic behaviour, such as script-switching, may improve the performance of language detection systems, especially for under-resourced language conditions.
2024.ltedi-1.19
wong-durward-2024-cantnlp
+
Lidoma@LT-EDI 2024: Tamil Hate Speech Detection in Migration Discourse
@@ -247,6 +265,7 @@
The exponential rise in social media users has revolutionized information accessibility and exchange. While these platforms serve various purposes, they also harbor negative elements, including hate speech and offensive behavior. Detecting hate speech in diverse languages has garnered significant attention in Natural Language Processing (NLP). This paper delves into hate speech detection in Tamil, particularly related to migration and refuge, contributing to the Caste/migration hate speech detection shared task. Employing a Convolutional Neural Network (CNN), our model achieved an F1 score of 0.76 in identifying hate speech, showing significant potential in the domain despite encountering complexities. We provide an overview of related research, methodology, and insights into the competition’s diverse performances, showcasing the landscape of hate speech detection nuances in the Tamil language.
2024.ltedi-1.20
tash-etal-2024-lidoma
+
CEN_Amrita@LT-EDI 2024: A Transformer based Speech Recognition System for Vulnerable Individuals in Tamil
@@ -258,6 +277,7 @@
Speech recognition is known to be a specialized application of speech processing. Automatic speech recognition (ASR) systems are designed to perform the speech-to-text task. Although ASR systems have been the subject of extensive research, they still encounter certain challenges when speech variations arise. The speaker’s age, gender, vulnerability, and other factors are the main causes of the variations in speech. In this work, we propose a fine-tuned speech recognition model for recognising the spoken words of vulnerable individuals in Tamil. This research utilizes a dataset sourced from the LT-EDI@EACL2024 shared task. We trained and tested pre-trained ASR models, including XLS-R and Whisper. The findings highlight that the fine-tuned Whisper ASR model surpasses XLS-R, achieving a word error rate (WER) of 24.452, signifying its superior performance in recognizing speech from diverse individuals.
2024.ltedi-1.21
r-etal-2024-cen
+
kubapok@LT-EDI 2024: Evaluating Transformer Models for Hate Speech Detection in Tamil
@@ -267,6 +287,7 @@
We describe the second-place submission for the shared task organized at the Fourth Workshop on Language Technology for Equality, Diversity, and Inclusion (LT-EDI-2024). The task focuses on detecting caste/migration hate speech in Tamil. The included texts involve the Tamil language in both Tamil script and transliterated into Latin script, with some texts also in English. Considering different scripts, we examined the performance of 12 transformer language models on the dev set. Our analysis revealed that for the whole dataset, the model google/muril-large-cased performs the best. We used an ensemble of several models for the final challenge submission, achieving 0.81 for the test dataset.
2024.ltedi-1.22
pokrywka-jassem-2024-kubapok
+
KEC-AI-NLP@LT-EDI-2024: Homophobia and Transphobia Detection in Social Media Comments using Machine Learning
@@ -280,6 +301,7 @@
Our work addresses the growing concern of abusive comments on online platforms, particularly focusing on the identification of Homophobia and Transphobia in social media comments. The goal is to categorize comments into three classes: Homophobia, Transphobia, and non-anti-LGBT+ comments. Utilizing machine learning techniques and a deep learning model, our work involves training on an English dataset with a designated training set and testing on a validation set. This approach aims to contribute to the understanding and detection of Homophobia and Transphobia within the realm of social media interactions. Our team participated in the shared task organized by LT-EDI@EACL 2024 and secured seventh rank in the task of Homophobia/Transphobia Detection in social media comments in Tamil with a macro F1 score of 0.315. Our run for the English language secured eighth rank with a macro F1 score of 0.369, and the run submitted for the Malayalam language secured fourth rank with a macro F1 score of 0.883 using the Random Forest model.
2024.ltedi-1.23
shanmugavadivel-etal-2024-kec
+
KEC AI DSNLP@LT-EDI-2024: Caste and Migration Hate Speech Detection using Machine Learning Techniques
@@ -292,6 +314,7 @@
Commonly used language defines “hate speech” as objectionable statements that may jeopardize societal harmony by singling out a group or a person based on fundamental traits (including gender, caste, or religion). Using machine learning techniques, our research focuses on identifying hate speech in social media comments. Using a variety of machine learning methods, we created machine learning models to detect hate speech. An approximate Macro F1 of 0.60 was attained by the created models.
2024.ltedi-1.24
shanmugavadivel-etal-2024-kec-ai
+
Quartet@LT-EDI 2024: A Support Vector Machine Approach For Caste and Migration Hate Speech Detection
@@ -304,6 +327,7 @@
Hate speech refers to offensive remarks against a community or individual based on inherent characteristics. Hate speech against a community based on its caste and native origin is unfortunately prevalent in society. Especially with social media platforms being a very popular tool for communication and sharing ideas, people post hate speech against castes or migrants on social media. The Shared Task LT–EDI 2024: Caste and Migration Hate Speech Detection was created with the objective of building an automatic classification system that detects and classifies hate speech posted on social media targeting a community belonging to a particular caste and migrants. Datasets in the Tamil language were provided along with the shared task. We experimented with several traditional models such as Naive Bayes, Support Vector Machine (SVM), Logistic Regression, Random Forest Classifier and Decision Tree Classifier, out of which the Support Vector Machine yielded the best results, placing us 8th in the rank list released by the organizers.
2024.ltedi-1.25
h-etal-2024-quartet
+
SSN-Nova@LT-EDI 2024: Leveraging Vectorisation Techniques in an Ensemble Approach for Stress Identification in Low-Resource Languages
@@ -315,6 +339,7 @@
This paper presents our submission for Shared task on Stress Identification in Dravidian Languages: StressIdent LT-EDI@EACL2024. The objective of this task is to identify stress levels in individuals based on their social media content. The system is tasked with analysing posts written in a code-mixed language of Tamil and Telugu and categorising them into two labels: “stressed” or “not stressed.” Our approach aimed to leverage feature extraction and juxtapose the performance of widely used traditional, deep learning and transformer models. Our research highlighted that building a pipeline with traditional classifiers proved to significantly improve their performance (0.98 and 0.93 F1-scores in Telugu and Tamil respectively), surpassing the baseline as well as deep learning and transformer models.
2024.ltedi-1.26
reddy-etal-2024-ssn
+
Quartet@LT-EDI 2024: A SVM-ResNet50 Approach For Multitask Meme Classification - Unraveling Misogynistic and Trolls in Online Memes
@@ -327,6 +352,7 @@
Meme is a very popular term prevailing across almost all social media platforms in recent days. A meme can be a combination of text and image whose sole purpose is to be funny and entertain people. Memes can sometimes promote misogynistic content expressing hatred, contempt, or prejudice against women. The Shared Task LT–EDI 2024: Multitask Meme Classification: Unraveling Misogynistic and Trolls in Online Memes Task 1 was created with the purpose of classifying social media memes as “misogynistic” and “non-misogynistic”. The task encompassed Tamil and Malayalam datasets. We separately classified the textual data using Multinomial Naive Bayes and the pictorial data using the ResNet50 model. The results from both modalities were combined to yield an overall result. We were ranked 2nd for both languages in this task.
2024.ltedi-1.27
h-etal-2024-quartet-lt
+
Quartet@LT-EDI 2024: Support Vector Machine Based Approach For Homophobia/Transphobia Detection In Social Media Comments
@@ -339,6 +365,7 @@
Homophobia and transphobia are terms used to describe fear or hatred towards people who are attracted to the same sex or people whose psychological gender differs from their biological sex. People use social media to exert this behaviour. The increased amount of abusive content negatively affects people in a lot of ways. It makes the environment toxic and unpleasant for LGBTQ+ people. This paper describes our classification model for classifying the contents into 3 categories: homophobic, transphobic and non-homophobic/transphobic. We used many traditional models like Support Vector Machine, Random Classifier, Logistic Regression and K-Nearest Neighbour to achieve this. The macro average F1 scores for Malayalam, Telugu, English, Marathi, Kannada, Tamil, Gujarati, and Hindi are 0.88, 0.94, 0.96, 0.78, 0.93, 0.77, 0.94, and 0.47, and the ranks for these languages are 5, 6, 9, 6, 8, 6, 6, and 4.
2024.ltedi-1.28
h-etal-2024-quartet-lt-edi
+
SSN-Nova@LT-EDI 2024: POS Tagging, Boosting Techniques and Voting Classifiers for Caste And Migration Hate Speech Detection
@@ -350,6 +377,7 @@
This paper presents our submission for the shared task on Caste and Migration Hate Speech Detection: LT-EDI@EACL 2024. This text classification task aims to foster the creation of models capable of identifying hate speech related to caste and migration. The dataset comprises social media comments, and the goal is to categorize them into negative and positive sentiments. Our approach explores back-translation for data augmentation to address sparse datasets in low-resource Dravidian languages. While Part-of-Speech (POS) tagging is valuable in natural language processing, our work highlights its ineffectiveness in Dravidian languages, with model performance drastically reducing from 0.73 to 0.67 on application. In analyzing boosting and ensemble methods, the voting classifier with traditional models outperforms the others and the boosting techniques, underscoring the efficacy of simpler models on low-resource data despite augmentation.
2024.ltedi-1.29
reddy-etal-2024-ssn-nova
+
CUET_NLP_Manning@LT-EDI 2024: Transformer-based Approach on Caste and Migration Hate Speech Detection
@@ -362,6 +390,7 @@
The widespread use of online communication has caused a significant increase in the spread of hate speech on social media. However, there are also hate crimes based on caste and migration status. Despite several nations’ efforts to bring equality among their citizens, numerous crimes occur based solely on caste. Migration-based hostility happens both in India and in developed countries. A shared task was arranged to address this issue in a low-resourced language such as Tamil. This paper aims to improve the detection of hate speech and hostility based on caste and migration status on social media. To achieve this, this work investigated several Machine Learning (ML), Deep Learning (DL), and transformer-based models, including M-BERT, XLM-R, and Tamil BERT. Experimental results revealed the highest macro F1-score of 0.80 using the M-BERT model, which enabled us to rank 3rd on the shared task.
2024.ltedi-1.30
alam-etal-2024-cuet
+
DRAVIDIAN LANGUAGE@LT-EDI 2024: Pretrained Transformer based Automatic Speech Recognition system for Elderly People
@@ -375,6 +404,7 @@
j-etal-2024-dravidian
This revision corrects the mistakes in the format of the names of the authors.
+
Transformers@LT-EDI-EACL2024: Caste and Migration Hate Speech Detection in Tamil Using Ensembling on Transformers
@@ -384,6 +414,7 @@
In recent years, there has been a persistent focus on developing systems that can automatically identify the hate speech content circulating on diverse social media platforms. This paper describes the team “Transformers” submission to the Caste and Migration Hate Speech Detection in Tamil shared task by LT-EDI 2024 workshop at EACL 2024. We used an ensemble approach in the shared task, combining various transformer-based pre-trained models using majority voting. The best macro average F1-score achieved was 0.82. We secured the 1st rank in the Caste and Migration Hate Speech in Tamil shared task.
2024.ltedi-1.32
singhal-bedi-2024-transformers-lt
+
Algorithm Alliance@LT-EDI-2024: Caste and Migration Hate Speech Detection
@@ -396,6 +427,7 @@
Caste and migration hate speech refers to the use of language that inflicts offense, violence, and distress based on social, caste, and migration status. Here, caste hate speech targets the imbalance of an individual’s social status and focuses mainly on the degradation of their caste group, while migration hate speech targets differences in nationality, culture, and individual status. These speeches are meant to affront the social status of these people. To detect this hate in speech, the task on Caste and Migration Hate Speech Detection was created, which classifies human speech into genuine or stimulate categories. For this task, we used multiple classification models, splitting the dataset into train and test data: Logistic Regression, Support Vector Machine, MLP (Multi-Layer Perceptron) classifier, Random Forest classifier, KNN classifier, and Decision Tree classification. Among these models, the SVM gave the highest macro average F1 score of 0.77, and the average accuracy across these models is around 0.75.
2024.ltedi-1.33
sangeetham-etal-2024-algorithm
+
MEnTr@LT-EDI-2024: Multilingual Ensemble of Transformer Models for Homophobia/Transphobia Detection
@@ -408,6 +440,7 @@
Detection of Homophobia and Transphobia in social media comments serves as an important step in the overall development of Equality, Diversity and Inclusion (EDI). In this research, we describe the system we formulated while participating in the shared task of Homophobia/Transphobia detection as a part of the Fourth Workshop On Language Technology For Equality, Diversity, Inclusion (LT-EDI-2024) at EACL 2024. We used an ensemble of three state-of-the-art multilingual transformer models, namely Multilingual BERT (mBERT), Multilingual Representations for Indic Languages (MuRIL) and XLM-RoBERTa, to detect the presence of Homophobia or Transphobia in YouTube comments. The task comprised datasets in ten languages - Hindi, English, Telugu, Tamil, Malayalam, Kannada, Gujarati, Marathi, Spanish and Tulu. Our system achieved rank 1 for the Spanish and Tulu tasks, 2 for Telugu, 3 for Marathi and Gujarati, 4 for Tamil, 5 for Hindi and Kannada, 6 for English and 8 for Malayalam. These results speak for the efficacy of our ensemble model as well as the data augmentation strategy we adopted for the detection of anti-LGBT+ language in social media data.
2024.ltedi-1.34
arora-etal-2024-mentr
+
CUET_DUO@StressIdent_LT-EDI@EACL2024: Stress Identification Using Tamil-Telugu BERT
@@ -422,6 +455,7 @@
The pervasive impact of stress on individuals necessitates proactive identification and intervention measures, especially in social media interaction. This research paper addresses the imperative need for proactive identification and intervention concerning the widespread influence of stress on individuals. This study focuses on the shared task, “Stress Identification in Dravidian Languages,” specifically emphasizing Tamil and Telugu code-mixed languages. The primary objective of the task is to classify social media messages into two categories: stressed and non-stressed. We employed various methodologies, from traditional machine-learning techniques to state-of-the-art transformer-based models. Notably, the Tamil-BERT and Telugu-BERT models exhibited exceptional performance, achieving noteworthy macro F1-scores of 0.71 and 0.72, respectively, and securing the 15^{th} position in the Tamil code-mixed language and the 9^{th} position in the Telugu code-mixed language. These findings underscore the effectiveness of these models in recognizing stress signals within social media content composed in Tamil and Telugu.
2024.ltedi-1.35
raihan-etal-2024-cuet
+
dkit@LT-EDI-2024: Detecting Homophobia and Transphobia in English Social Media Comments
@@ -432,6 +466,7 @@
Machine learning and deep learning models have shown great potential in detecting hate speech from social media posts. This study focuses on the homophobia and transphobia detection task of LT-EDI-2024 in English. Several machine learning models, a Deep Neural Network (DNN), and the Bidirectional Encoder Representations from Transformers (BERT) model have been trained on the provided dataset using different feature vectorization techniques. We secured top rank with the best macro-F1 score of 0.4963, which was achieved by fine-tuning the BERT model on the English test set.
2024.ltedi-1.36
yadav-etal-2024-dkit
+
KEC_AI_MIRACLE_MAKERS@LT-EDI-2024: Stress Identification in Dravidian Languages using Machine Learning Techniques
@@ -444,6 +479,7 @@
Identifying whether an individual is stressed or not stressed is the topic of our shared task. We have used several machine learning models for identifying stress. This paper presents our system submission for tasks 1 and 2, covering both the Tamil and Telugu datasets, focusing on supervised approaches. For the Tamil dataset, we achieved the highest accuracy with the Support Vector Machine model, with an F1-score of 0.98, and for the Telugu dataset, we achieved the highest accuracy with the Random Forest algorithm, with an F1-score of 0.99. By using this model, the Stress Identification System can help individuals improve their mental health in an optimistic manner.
2024.ltedi-1.37
shanmugavadivel-etal-2024-kec-ai-miracle
+
MUCS@LT-EDI-2024: Exploring Joint Representation for Memes Classification
@@ -457,6 +493,7 @@
Misogynistic memes are a category of memes which contain disrespectful language targeting women on social media platforms. Hence, detecting such memes is necessary in order to maintain a healthy social media environment. To address the challenges of detecting misogynistic memes, the “Multitask Meme classification - Unraveling Misogynistic and Trolls in Online Memes: LT-EDI@EACL 2024” shared task, organized at the European Chapter of the Association for Computational Linguistics (EACL) 2024, invites researchers to develop models to detect misogynistic memes in Tamil and Malayalam. The shared task has two subtasks, and in this paper, we - team MUCS - describe the learning models submitted to Task 1 - Identification of Misogynistic Memes in Tamil and Malayalam. As memes represent multi-modal data of image and text, three models: i) Bidirectional Encoder Representations from Transformers (BERT)+Residual Network (ResNet)-50, ii) Multilingual Representations for Indian Languages (MuRIL)+ResNet-50, and iii) multilingual BERT (mBERT)+ResNet-50, are proposed based on joint representation of text and image, for detecting misogynistic memes in Tamil and Malayalam. Among the proposed models, the mBERT+ResNet-50 and MuRIL+ResNet-50 models obtained macro F1 scores of 0.73 and 0.87 for the Tamil and Malayalam datasets respectively, securing 1st rank for both datasets in the shared task.
2024.ltedi-1.38
mahesh-etal-2024-mucs
+
MUCS@LT-EDI-2024: Learning Approaches to Empower Homophobic/Transphobic Comment Identification
@@ -470,6 +507,7 @@
Homophobic/Transphobic (H/T) content includes hatred and discriminatory comments directed at Lesbian, Gay, Bisexual, Transgender, Queer (LGBTQ) individuals on social media platforms. As this unfavourable perception towards LGBTQ individuals may affect them physically and mentally, it is necessary to detect H/T content on social media. This demands automated tools to identify and address H/T content. In view of this, in this paper, we - team MUCS - describe the learning models submitted to the “Homophobia/Transphobia Detection in social media comments: LT-EDI@EACL 2024” shared task at the European Chapter of the Association for Computational Linguistics (EACL) 2024. The learning models: i) Homo_Ensemble - an ensemble of Machine Learning (ML) algorithms trained with Term Frequency-Inverse Document Frequency (TFIDF) of syllable n-grams in the range (1, 3), ii) Homo_TL - a model based on a Transfer Learning (TL) approach with Bidirectional Encoder Representations from Transformers (BERT) models, iii) Homo_probfuse - an ensemble of ML classifiers with soft voting trained using sentence embeddings (except for Hindi), and iv) Homo_FSL - Few-Shot Learning (FSL) models using Sentence Transformer (ST) (only for Tulu), are proposed to detect H/T content in the given languages. Among the models submitted to the shared task, the models that performed better for each language include: i) the Homo_Ensemble model obtained a macro F1 score of 0.95, securing 4th rank for the Telugu language, ii) the Homo_TL model obtained macro F1 scores of 0.49, 0.53, 0.45, 0.94, and 0.95, securing 2nd, 2nd, 1st, 1st, and 4th ranks for the English, Marathi, Hindi, Kannada, and Gujarati languages, respectively, iii) the Homo_probfuse model obtained macro F1 scores of 0.86, 0.87, and 0.53, securing 2nd, 6th, and 2nd ranks for the Tamil, Malayalam, and Spanish languages respectively, and iv) the Homo_FSL model obtained a macro F1 score of 0.62, securing 2nd rank for the Tulu dataset.
2024.ltedi-1.39
kulal-etal-2024-mucs
+
ASR TAMIL SSN@ LT-EDI-2024: Automatic Speech Recognition system for Elderly People
@@ -479,6 +517,7 @@
The results of the Shared Task on Speech Recognition for Vulnerable Individuals in Tamil (LT-EDI-2024) are discussed in this paper. The goal is to create an automated system for Tamil voice recognition. The older population that speaks Tamil is the source of the dataset used in this task. The proposed ASR system is designed with the pre-trained model akashsivanandan/wav2vec2-large-xls-r300m-tamil-colab-final. The Tamil common speech dataset is utilized to fine-tune the pre-trained model that powers our system. The proposed system receives the test data released for the task; transcriptions are then created for the test samples and submitted to the task. Word Error Rate (WER) is the evaluation metric used to assess the submitted results. Our proposed system attained a WER of 29.297%.
2024.ltedi-1.40
s-b-2024-asr
+
diff --git a/data/xml/2024.mathnlp.xml b/data/xml/2024.mathnlp.xml
new file mode 100644
index 0000000000..74ff252d4d
--- /dev/null
+++ b/data/xml/2024.mathnlp.xml
@@ -0,0 +1,69 @@
+
+
+
+
+ Proceedings of the 2nd Workshop on Mathematical Natural Language Processing @ LREC-COLING 2024
+ MarcoValentino
+ DeborahFerreira
+ MokanaranganThayaparan
+ AndreFreitas
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.mathnlp-1
+ mathnlp
+ ws
+
+
+ 2024.mathnlp-1.0
+ mathnlp-2024-mathematical
+
+
+ An Approach to Co-reference Resolution and Formula Grounding for Mathematical Identifiers Using Large Language Models
+ AaminDev
+ TakutoAsakura
+ RuneSætre
+ 1–10
+ This paper outlines an automated approach to annotate mathematical identifiers in scientific papers — a process historically laborious and costly. We employ state-of-the-art LLMs, including GPT-3.5 and GPT-4, and open-source alternatives to generate a dictionary for annotating mathematical identifiers, linking each identifier to its conceivable descriptions and then assigning these definitions to the respective identifier instances based on context. Evaluation metrics include the CoNLL score for co-reference cluster quality and semantic correctness of the annotations.
+ 2024.mathnlp-1.1
+ dev-etal-2024-approach
+
+
+ Fluid Dynamics-Inspired Emotional Analysis in Shakespearean Tragedies: A Novel Computational Linguistics Methodology
+ DavidePicca
+ 11–18
+ This study introduces an innovative method for analyzing emotions in texts, drawing inspiration from the principles of fluid dynamics, particularly the Navier-Stokes equations. It applies this framework to analyze Shakespeare’s tragedies “Hamlet” and “Romeo and Juliet”, treating emotional expressions as entities akin to fluids. By mapping linguistic characteristics onto fluid dynamics components, this approach provides a dynamic perspective on how emotions are expressed and evolve in narrative texts. The results, when compared with conventional sentiment analysis methods, reveal a more detailed and subtle grasp of the emotional arcs within these works. This interdisciplinary strategy not only enriches emotion analysis in computational linguistics but also paves the way for potential integrations with machine learning in NLP.
+ 2024.mathnlp-1.2
+ picca-2024-fluid
+
+
+ Math Problem Solving: Enhancing Large Language Models with Semantically Rich Symbolic Variables
+ Ali EmreNarin
+ 19–24
+ The advent of Large Language Models (LLMs) based on the Transformer architecture has led to remarkable advancements in various domains, including reasoning tasks. However, accurately assessing the performance of Large Language Models, particularly in the reasoning domain, remains a challenge. In this paper, we propose the Semantically Rich Variable Substitution Method (SemRiVas) as an enhancement to existing symbolic methodologies for evaluating LLMs on Mathematical Word Problems (MWPs). Unlike previous approaches that utilize generic symbols for variable substitution, SemRiVas employs descriptive variable names, aiming to improve the problem-solving abilities of LLMs. Our method aims to eliminate the need for LLMs to possess programming proficiency and perform arithmetic operations, to be universally applicable. Our experimental results demonstrate the superior accuracy of SemRiVas compared to prior symbolic methods, particularly in resolving longer and more complex MWP questions. However, LLMs’ performance with SemRiVas and symbolic methods that utilize one-character variables still falls short compared to notable techniques like CoT and PaL.
+ 2024.mathnlp-1.3
+ narin-2024-math
+
+
+ Data Driven Approach for Mathematical Problem Solving
+ ByungjuKim
+ WonseokLee
+ JaehongKim
+ JungbinIm
+ 25–34
+ In this paper, we investigate and introduce a novel Llama-2 based model, fine-tuned with an original dataset designed to mirror real-world mathematical challenges. The dataset was collected through a question-answering platform, incorporating solutions generated by both a rule-based solver and question answering, to cover a broad spectrum of mathematical concepts and problem-solving techniques. Experimental results demonstrate significant performance improvements when the models are fine-tuned with our dataset. The results suggest that the integration of contextually rich and diverse problem sets into the training substantially enhances the problem-solving capability of language models across various mathematical domains. This study showcases the critical role of curated educational content in advancing AI research.
+ 2024.mathnlp-1.4
+ kim-etal-2024-data
+
+
+ Exploring Internal Numeracy in Language Models: A Case Study on ALBERT
+ UlmeWennberg
+ Gustav EjeHenter
+ 35–40
+ It has been found that Transformer-based language models have the ability to perform basic quantitative reasoning. In this paper, we propose a method for studying how these models internally represent numerical data, and use our proposal to analyze the ALBERT family of language models. Specifically, we extract the learned embeddings these models use to represent tokens that correspond to numbers and ordinals, and subject these embeddings to Principal Component Analysis (PCA). PCA results reveal that ALBERT models of different sizes, trained and initialized separately, consistently learn to use the axes of greatest variation to represent the approximate ordering of various numerical concepts. Numerals and their textual counterparts are represented in separate clusters, but increase along the same direction in 2D space. Our findings illustrate that language models, trained purely to model text, can intuit basic mathematical concepts, opening avenues for NLP applications that intersect with quantitative reasoning.
+ 2024.mathnlp-1.5
+ wennberg-henter-2024-exploring
+
+
+
diff --git a/data/xml/2024.mwe.xml b/data/xml/2024.mwe.xml
new file mode 100644
index 0000000000..df9f1b3e16
--- /dev/null
+++ b/data/xml/2024.mwe.xml
@@ -0,0 +1,322 @@
+
+
+
+
+ Proceedings of the Joint Workshop on Multiword Expressions and Universal Dependencies (MWE-UD) @ LREC-COLING 2024
+ ArchnaBhatia
+ GosseBouma
+ A. SezaDoğruöz
+ KilianEvang
+ MarcosGarcia
+ VoulaGiouli
+ LifengHan
+ JoakimNivre
+ AlexandreRademaker
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.mwe-1
+ mwe
+ udw
+ ws
+
+
+ 2024.mwe-1.0
+ mwe-2024-joint
+
+
+ Every Time We Hire an LLM, the Reasoning Performance of the Linguists Goes Up
+ HarishTayyar Madabushi
+ 1
+
+ 2024.mwe-1.1
+ tayyar-madabushi-2024-every
+
+
+ Using Universal Dependencies for testing hypotheses about communicative efficiency
+ NataliaLevshina
+ 2–3
+
+ 2024.mwe-1.2
+ levshina-2024-using
+
+
+ Automatic Manipulation of Training Corpora to Make Parsers Accept Real-world Text
+ HiroshiKanayama
+ RanIwamoto
+ MasayasuMuraoka
+ TakuyaOhko
+ KohtarohMiyamoto
+ 4–13
+ This paper discusses how to build a practical syntactic analyzer, and addresses the distributional differences between existing corpora and actual documents in applications. As a case study we focus on noun phrases that are not headed by a main verb and sentences without punctuation at the end, which are rare in a number of Universal Dependencies corpora but frequently appear in the real-world use cases of syntactic parsers. We converted the training corpora so that their distribution is closer to that in realistic inputs, and obtained better scores in both general syntax benchmarking and a sentiment detection task, a typical application of dependency analysis.
+ 2024.mwe-1.3
+ kanayama-etal-2024-automatic
+
+
+ Assessing BERT’s sensitivity to idiomaticity
+ LiLiu
+ FrancoisLareau
+ 14–23
+ BERT-like language models have been demonstrated to capture the idiomatic meaning of multiword expressions. Linguists have also shown that idioms have varying degrees of idiomaticity. In this paper, we assess CamemBERT’s sensitivity to the degree of idiomaticity within idioms, as well as the dependency of this sensitivity on part of speech and idiom length. We used a demasking task on tokens from 3127 idioms and 22551 tokens corresponding to simple lexemes taken from the French Lexical Network (LN-fr), and observed that CamemBERT performs distinctly on tokens embedded within idioms compared to simple ones. When demasking tokens within idioms, the model is not proficient in discerning their level of idiomaticity. Moreover, regardless of idiomaticity, CamemBERT excels at handling function words. The length of idioms also impacts CamemBERT’s performance to a certain extent. The last two observations partly explain the difference between the model’s performance on idioms versus simple lexemes. We conclude that the model treats idioms differently from simple lexemes, but that it does not capture the difference in compositionality between subclasses of idioms.
+ 2024.mwe-1.4
+ liu-lareau-2024-assessing
+
+
+ Identification and Annotation of Body Part Multiword Expressions in Old Egyptian
+ RobertoDíaz Hernández
+ 24–32
+ This paper presents the preliminary results of an ongoing study on the diachronic and synchronic use of multiword expressions (MWEs) in Egyptian, begun when I joined the COST Action Universality, Diversity and Idiosyncrasy in Language Technology (UniDive, CA21167). It analyzes, as a case study, Old Egyptian body part MWEs based on lexicographic and textual resources, and its aim is both to open up a research line in Egyptology, where the study of MWEs has been neglected, and to contribute to Natural Language Processing studies by determining the rules governing the morpho-syntactic formation of Old Egyptian body part MWEs in order to facilitate the identification of other types of MWEs.
+ 2024.mwe-1.5
+ diaz-hernandez-2024-identification
+
+
+ Fitting Fixed Expressions into the UD Mould: Swedish as a Use Case
+ LarsAhrenberg
+ 33–42
+ Fixed multiword expressions are common in many, if not all, natural languages. In the Universal Dependencies framework, UD, a subset of these expressions are modelled with the dependency relation ‘fixed’ targeting the most grammaticalized cases of functional multiword items. In this paper we perform a detailed analysis of 439 expressions modelled with ‘fixed’ in two Swedish UD treebanks in order to reduce their numbers and fit the definition better. We identify a large number of dimensions of variation for fixed multiword expressions that can be used for the purpose. We also point out several problematic aspects of the current UD approach to multiword expressions and discuss different alternative solutions for modelling fixed expressions. We suggest that insights from Constructional Grammar (CxG) can help with a more systematic treatment of fixed expressions in UD.
+ 2024.mwe-1.6
+ 2024.mwe-1.6.OptionalSupplementaryMaterial.zip
+ ahrenberg-2024-fitting
+
+
+ Synthetic-Error Augmented Parsing of Swedish as a Second Language: Experiments with Word Order
+ AriannaMasciolini
+ EmilieFrancis
+ Maria IrenaSzawerna
+ 43–49
+ Ungrammatical text poses significant challenges for off-the-shelf dependency parsers. In this paper, we explore the effectiveness of using synthetic data to improve performance on essays written by learners of Swedish as a second language. Due to their relevance and ease of annotation, we restrict our initial experiments to word order errors. To do that, we build a corrupted version of the standard Swedish Universal Dependencies (UD) treebank Talbanken, mimicking the error patterns and frequency distributions observed in the Swedish Learner Language (SweLL) corpus. We then use the MaChAmp (Massive Choice, Ample tasks) toolkit to train an array of BERT-based dependency parsers, fine-tuning on different combinations of original and corrupted data. We evaluate the resulting models not only on their respective test sets but also, most importantly, on a smaller collection of sentence-correction pairs derived from SweLL. Results show small but significant performance improvements on the target domain, with minimal decline on normative data.
+ 2024.mwe-1.7
+ masciolini-etal-2024-synthetic
+
+
+ The Vedic Compound Dataset
+ SvenSellmer
+ OliverHellwig
+ 50–55
+ This paper introduces the Vedic Compound Dataset (VCD), the first resource providing annotated compounds from Vedic Sanskrit, a South Asian Indo-European language used from ca. 1500 to 500 BCE. The VCD aims at facilitating the study of language change in early Indo-Iranian and offers comparative material for quantitative cross-linguistic research on compounds. The process of annotating Vedic compounds is complex as they contain five of the six basic types of compounds defined by Scalise & Bisetto (2005), which are, however, not consistently marked in morphosyntax, making their automatic classification a significant challenge. The paper details the process of collecting and preprocessing the relevant data, with a particular focus on the question of how to distinguish exocentric from endocentric usage. It further discusses experiments with a simple ML classifier that uses compound internal syntactic relations, outlines the composition of the dataset, and sketches directions for future research.
+ 2024.mwe-1.8
+ sellmer-hellwig-2024-vedic
+
+
+ A Universal Dependencies Treebank for Gujarati
+ MayankJobanputra
+ MaitreyMehta
+ ÇağrıÇöltekin
+ 56–62
+ The Universal Dependencies (UD) project has presented itself as a valuable platform to develop various resources for the languages of the world. We present and release a sample treebank for the Indo-Aryan language of Gujarati – a widely spoken language with little linguistic resources. This treebank is the first labeled dataset for dependency parsing in the language and the script (the Gujarati script). The treebank contains 187 part-of-speech and dependency annotated sentences from diverse genres. We discuss various idiosyncratic examples, annotation choices and present an elaborate corpus along with agreement statistics. We see this work as a valuable resource and a stepping stone for research in Gujarati Computational Linguistics.
+ 2024.mwe-1.9
+ jobanputra-etal-2024-universal
+
+
+ Overcoming Early Saturation on Low-Resource Languages in Multilingual Dependency Parsing
+ JiannanMao
+ ChenchenDing
+ HourKaing
+ HidekiTanaka
+ MasaoUtiyama
+ TadahiroMatsumoto
+ 63–69
+ UDify is a multilingual and multi-task parser fine-tuned on mBERT that achieves remarkable performance in high-resource languages. However, the performance saturates early and decreases gradually in low-resource languages as training proceeds. This work applies a data augmentation method and conducts experiments on seven few-shot and four zero-shot languages. The unlabeled attachment scores were improved on the zero-shot languages’ dependency parsing tasks, with the average score rising from 67.1% to 68.7%. Meanwhile, dependency parsing tasks for high-resource languages and other tasks were hardly affected. Experimental results indicate that the data augmentation method is effective for low-resource languages in multilingual dependency parsing.
+ 2024.mwe-1.10
+ mao-etal-2024-overcoming
+
+
+ Part-of-Speech Tagging for Northern Kurdish
+ PeshmergeMorad
+ SinaAhmadi
+ LorenzoGatti
+ 70–80
+ In the growing domain of natural language processing, low-resourced languages like Northern Kurdish remain largely unexplored due to the lack of resources needed to be part of this growth. In particular, the tasks of part-of-speech tagging and tokenization for Northern Kurdish are still insufficiently addressed. In this study, we aim to bridge this gap by evaluating a range of statistical, neural, and fine-tuned models specifically tailored for Northern Kurdish. Leveraging limited but valuable datasets, including the Universal Dependency Kurmanji treebank and a novel manually annotated and tokenized gold-standard dataset consisting of 136 sentences (2,937 tokens), we evaluate several POS tagging models and report that the fine-tuned transformer-based model outperforms the others, achieving an accuracy of 0.87 and a macro-averaged F1 score of 0.77. Data and models are publicly available under an open license at https://github.com/peshmerge/northern-kurdish-pos-tagging
+ 2024.mwe-1.11
+ morad-etal-2024-part
+
+
+ Diachronic Analysis of Multi-word Expression Functional Categories in Scientific English
+ DiegoAlves
+ StefaniaDegaetano-Ortlieb
+ ElenaSchmidt
+ ElkeTeich
+ 81–87
+ We present a diachronic analysis of multi-word expressions (MWEs) in English based on the Royal Society Corpus, a dataset containing 300+ years of the scientific publications of the Royal Society of London. Specifically, we investigate the functions of MWEs, such as stance markers (“it is interesting”) or discourse organizers (“in this section”), and their development over time. Our approach is multi-disciplinary: to detect MWEs we use Universal Dependencies, to classify them functionally we use an approach from register linguistics, and to assess their role in diachronic development we use an information-theoretic measure, relative entropy.
+ 2024.mwe-1.12
+ alves-etal-2024-diachronic
+
+
+ Lexicons Gain the Upper Hand in Arabic MWE Identification
+ NajetHadj Mohamed
+ AgataSavary
+ CherifaBen Khelil
+ Jean-YvesAntoine
+ IskandarKeskes
+ LamiaHadrich-Belguith
+ 88–97
+ This paper highlights the importance of integrating MWE identification with the development of syntactic MWE lexicons. It suggests that lexicons with minimal morphosyntactic information can amplify current MWE-annotated datasets and refine identification strategies. To our knowledge, this work represents the first attempt to focus on both seen and unseen VMWEs for Arabic. It also deals with the challenge of differentiating between literal and figurative interpretations of idiomatic expressions. The approach involves a dual-phase procedure: first projecting a VMWE lexicon onto a corpus to identify candidate occurrences, then disambiguating these occurrences to distinguish idiomatic from literal instances. Experiments outlined in the paper aim to assess the efficacy of this technique, utilizing a lexicon known as LEXAR and the “parseme-ar” corpus. The findings suggest that lexicon-driven strategies have the potential to refine MWE identification, particularly for unseen occurrences.
+ 2024.mwe-1.13
+ hadj-mohamed-etal-2024-lexicons
+
+
+ Revisiting VMWEs in Hindi: Annotating Layers of Predication
+ KanishkaJain
+ AshwiniVaidya
+ 98–105
+ Multiword expressions in languages like Hindi are both productive and challenging. Hindi not only uses a variety of verbal multiword expressions (VMWEs) but also employs different combinatorial strategies to create new types of multiword expressions. In this paper we investigate two such strategies that are quite common in the language. First, we show that VMWEs in Hindi are not just lexical but also morphological: causatives are formed morphologically in Hindi. Second, we examine stacked VMWEs, i.e., cases where at least two VMWEs occur together. We suggest that the existing PARSEME annotation framework can be extended to these two phenomena without changing the existing guidelines. We also propose rule-based heuristics using existing Universal Dependency annotations to automatically identify and annotate some of the VMWEs in the language. The goal of this paper is to refine the existing PARSEME corpus of Hindi for VMWEs while expanding its scope, giving a more comprehensive picture of VMWEs in Hindi.
+ 2024.mwe-1.14
+ jain-vaidya-2024-revisiting
+
+
+ Towards the semantic annotation of SR-ELEXIS corpus: Insights into Multiword Expressions and Named Entities
+ CvetanaKrstev
+ RankaStanković
+ Aleksandra M.Marković
+ Teodora SofijaMihajlov
+ 106–114
+ This paper presents the work in progress on ELEXIS-sr corpus, the Serbian addition to the ELEXIS multilingual annotated corpus ElexisWSD, comprising semantic annotations and word sense repositories. The ELEXIS corpus has parallel annotations in ten European languages, serving as a cross-lingual benchmark for evaluating low and medium-resourced European languages. The focus in this paper is on multiword expressions (MWEs) and named entities (NEs), their recognition in the ELEXIS-sr sentence set, and comparison with annotations in other languages. The first steps in building the Serbian sense inventory are discussed, and some results concerning MWEs and NEs are analysed. Once completed, the ELEXIS-sr corpus will be the first sense annotated corpus using the Serbian WordNet (SrpWN). Finally, ideas to represent MWE lexicon entries as Linguistic Linked-Open Data (LLOD) and connect them with occurrences in the corpus are presented.
+ 2024.mwe-1.15
+ krstev-etal-2024-towards
+
+
+ To Leave No Stone Unturned: Annotating Verbal Idioms in the Parallel Meaning Bank
+ RafaelEhren
+ KilianEvang
+ LauraKallmeyer
+ 115–124
+ Idioms present many challenges to semantic annotation in a lexicalized framework, which leads to them being underrepresented or inadequately annotated in sembanks. In this work, we address this problem with respect to verbal idioms in the Parallel Meaning Bank (PMB), specifically in its German part, where only some idiomatic expressions have been annotated correctly. We first select candidate idiomatic expressions, then determine their idiomaticity status and whether they are decomposable or not, and then we annotate their semantics using WordNet senses and VerbNet semantic roles. Overall, inter-annotator agreement is very encouraging. A difficulty, however, is to choose the correct word sense. This is not surprising, given that English synsets are many and there is often no unique mapping from German idioms and words to them. Besides this, there are many subtle differences and interesting challenging cases. We discuss some of them in this paper.
+ 2024.mwe-1.16
+ ehren-etal-2024-leave
+
+
+ Universal Feature-based Morphological Trees
+ FedericaGamba
+ AbishekStephen
+ ZdeněkŽabokrtský
+ 125–137
+ The paper proposes a novel data representation inspired by Universal Dependencies (UD) syntactic trees, which are extended to capture the internal morphological structure of word forms. As a result, morphological segmentation is incorporated within the UD representation of syntactic dependencies. To derive the proposed data structure we leverage existing annotation of UD treebanks as well as available resources for segmentation, and we select 10 languages to work with in the presented case study. Additionally, statistical analysis reveals a robust correlation between morphs and sets of morphological features of words. We thus align the morphs to the observed feature inventories capturing the morphological meaning of morphs. Through the beneficial exploitation of cross-lingual correspondence of morphs, the proposed syntactic representation based on morphological segmentation proves to enhance the comparability of sentence structures across languages.
+ 2024.mwe-1.17
+ gamba-etal-2024-universal
+
+
+ Combining Grammatical and Relational Approaches. A Hybrid Method for the Identification of Candidate Collocations from Corpora
+ DamianoPerri
+ IreneFioravanti
+ OsvaldoGervasi
+ StefaniaSpina
+ 138–146
+ We present an evaluation of three different methods for the automatic identification of candidate collocations in corpora, part of a research project focused on the development of a learner dictionary of Italian collocations. We compare the commonly used POS-based method and the syntactic dependency-based method with a hybrid method integrating both approaches. We conduct a statistical analysis on a sample corpus of written and spoken texts of different registers. Results show that the hybrid method can correctly detect more candidate collocations against a human annotated benchmark. The scores are particularly high in adjectival modifier relations. A hybrid approach to candidate collocation identification seems to lead to an improvement in the quality of results.
+ 2024.mwe-1.18
+ perri-etal-2024-combining
+
+
+ Multiword Expressions between the Corpus and the Lexicon: Universality, Idiosyncrasy, and the Lexicon-Corpus Interface
+ VerginicaBarbu Mititelu
+ VoulaGiouli
+ KilianEvang
+ DanielZeman
+ PetyaOsenova
+ CaroleTiberius
+ SimonKrek
+ StellaMarkantonatou
+ IvelinaStoyanova
+ RankaStanković
+ ChristianChiarcos
+ 147–153
+ We present ongoing work towards defining a lexicon-corpus interface to serve as a benchmark in the representation of multiword expressions (of various parts of speech) in dedicated lexica and the linking of these entries to their corpus occurrences. The final aim is the harnessing of such resources for the automatic identification of multiword expressions in a text. The involvement of several natural languages aims at the universality of a solution not centered on a particular language, and also accommodating idiosyncrasies. Challenges in the lexicographic description of multiword expressions are discussed, the current status of lexica dedicated to this linguistic phenomenon is outlined, as well as the solution we envisage for creating an ecosystem of interlinked lexica and corpora containing and, respectively, annotated with multiword expressions.
+ 2024.mwe-1.19
+ barbu-mititelu-etal-2024-multiword
+
+
+ Annotation of Multiword Expressions in the SUK 1.0 Training Corpus of Slovene: Lessons Learned and Future Steps
+ JakaČibej
+ PolonaGantar
+ MijaBon
+ 154–162
+ Recent progress within the UniDive COST Action on the compilation of universal guidelines for the annotation of non-verbal multiword expressions (MWEs) has provided an opportunity to improve and expand the work previously done within the PARSEME COST Action on the annotation of verbal multiword expressions in the SUK 1.0 Training Corpus of Slovene. A segment of the training corpus had already been annotated with verbal MWEs during PARSEME. As a follow-up and part of the New Grammar of Modern Standard Slovene (NSSSS) project, the same segment was annotated with non-verbal MWEs, resulting in approximately 6,500 sentences annotated by at least three annotators (described in Gantar et al., 2019). Since then, the entire SUK 1.0 was also manually annotated with UD part-of-speech tags. In the paper, we present an analysis of the MWE annotations exported from the corpus along with their part-of-speech structures through the lens of Universal Dependencies. We discuss the usefulness of the data in terms of potential insight for the further compilation and fine-tuning of guidelines particularly for non-verbal MWEs, and conclude with our plans for future work.
+ 2024.mwe-1.20
+ cibej-etal-2024-annotation
+
+
+ Light Verb Constructions in Universal Dependencies for South Asian Languages
+ AbishekStephen
+ DanielZeman
+ 163–177
+ We conduct a morphosyntactic investigation into the light verb constructions (LVCs) or the verbo-nominal predicates in South Asian languages. This work spans the Indo-Aryan and Dravidian language families in treebanks based on Universal Dependencies (UD). For the selected languages we show how well the existing annotation guidelines fare for the LVCs. We also reiterate the importance of the core and oblique distinction in UD and how informative it is for making accurate morphosyntactic annotation judgments for such predicates.
+ 2024.mwe-1.21
+ stephen-zeman-2024-light
+
+
+ Sign of the Times: Evaluating the use of Large Language Models for Idiomaticity Detection
+ DylanPhelps
+ Thomas M. R.Pickard
+ MaggieMi
+ EdwardGow-Smith
+ AlineVillavicencio
+ 178–187
+ Despite the recent ubiquity of large language models and their high zero-shot prompted performance across a wide range of tasks, it is still not known how well they perform on tasks which require processing of potentially idiomatic language. In particular, how well do such models perform in comparison to encoder-only models fine-tuned specifically for idiomaticity tasks? In this work, we attempt to answer this question by looking at the performance of a range of LLMs (both local and software-as-a-service models) on three idiomaticity datasets: SemEval 2022 Task 2a, FLUTE, and MAGPIE. Overall, we find that whilst these models do give competitive performance, they do not match the results of fine-tuned task-specific models, even at the largest scales (e.g. for GPT-4). Nevertheless, we do see consistent performance improvements across model scale. Additionally, we investigate prompting approaches to improve performance, and discuss the practicalities of using LLMs for these tasks.
+ 2024.mwe-1.22
+ phelps-etal-2024-sign
+
+
+ Universal Dependencies for Saraiki
+ MeesumAlam
+ FrancisTyers
+ EmilyHanink
+ SandraKübler
+ 188–197
+ We present the first treebank of the Saraiki/Siraiki [ISO 639-3 skr] language, using the Universal Dependency annotation scheme (de Marneffe et al., 2021). The treebank currently comprises 587 annotated sentences and 7597 tokens. We explain the most relevant syntactic and morphological features of Saraiki, along with the decisions we have made for a range of language-specific constructions, namely compounds, verbal structures including light verb and serial verb constructions, and relative clauses.
+ 2024.mwe-1.23
+ alam-etal-2024-universal
+
+
+ Domain-Weighted Batch Sampling for Neural Dependency Parsing
+ JacobStriebel
+ DanielDakota
+ SandraKübler
+ 198–206
+ In neural dependency parsing, as well as in the broader field of NLP, domain adaptation remains a challenging problem. When adapting a parser to a target domain, there is a fundamental tension between the need to make use of out-of-domain data and the need to ensure that syntactic characteristics of the target domain are learned. In this work we explore a way to balance these two competing concerns, namely using domain-weighted batch sampling, which allows us to use all available training data, while controlling the probability of sampling in- and out-of-domain data when constructing training batches. We conduct experiments using ten natural language domains and find that domain-weighted batch sampling yields substantial performance improvements in all ten domains compared to a baseline of conventional randomized batch sampling.
+ 2024.mwe-1.24
+ striebel-etal-2024-domain
+
+
+ Strategies for the Annotation of Pronominalised Locatives in Turkic Universal Dependency Treebanks
+ JonathanWashington
+ ÇağrıÇöltekin
+ FurkanAkkurt
+ BermetChontaeva
+ SoudabehEslami
+ GulnuraJumalieva
+ AidaKasieva
+ AslıKuzgun
+ BüşraMarşan
+ ChihiroTaguchi
+ 207–219
+ As part of our efforts to develop unified Universal Dependencies (UD) guidelines for Turkic languages, we evaluate multiple approaches to a difficult morphosyntactic phenomenon, pronominal locative expressions formed by a suffix -ki. These forms result in multiple syntactic words, with potentially conflicting morphological features, and participating in different dependency relations. We describe multiple approaches to the problem in current (and upcoming) Turkic UD treebanks, and show that none of them offers a solution that satisfies a number of constraints we consider (including constraints imposed by UD guidelines). This calls for a compromise with the ‘least damage’ that should be adopted by most, if not all, Turkic treebanks. Our discussion of the phenomenon and various annotation approaches may also help treebanking efforts for other languages or language families with similar constructions.
+ 2024.mwe-1.25
+ washington-etal-2024-strategies
+
+
+ BERT-based Idiom Identification using Language Translation and Word Cohesion
+ ArnavYayavaram
+ SiddharthYayavaram
+ Prajna DeviUpadhyay
+ ApurbaDas
+ 220–230
+ An idiom refers to a special type of multi-word expression whose meaning is figurative and cannot be deduced from the literal interpretation of its components. Idioms are prevalent in almost all languages and text genres, necessitating explicit handling by comprehensive NLP systems. Such phrases are referred to as Potentially Idiomatic Expressions (PIEs) and automatically identifying them in text is a challenging task. In this paper, we propose using a BERT-based model fine-tuned with custom objectives, to improve the accuracy of detecting PIEs in text. Our custom loss functions capture two important properties (word cohesion and language translation) to distinguish PIEs from non-PIEs. We conducted several experiments on 7 datasets and showed that incorporating custom objectives while training the model leads to substantial gains. Our models trained using this approach also have better sequence accuracy over DISC, a state-of-the-art PIE detection technique, along with good transfer capabilities.
+ 2024.mwe-1.26
+ yayavaram-etal-2024-bert
+
+
+ Ad Hoc Compounds for Stance Detection
+ QiYu
+ FabianSchlotterbeck
+ HeningWang
+ NaomiReichmann
+ BrittaStolterfoht
+ RegineEckardt
+ MiriamButt
+ 231–242
+ In this paper we focus on a subclass of multi-word expressions, namely compound formation in German. The automatic detection of compounds is a known problem and we argue that its resolution should be given more urgency in light of a new role we uncovered with respect to ad hoc compound formation: the systematic expression of attitudinal meaning and its potential importance for the down-stream NLP task of stance detection. We demonstrate that ad hoc compounds in German indeed systematically express attitudinal meaning by adducing corpus linguistic and psycholinguistic experimental data. However, an investigation of state-of-the-art dependency parsers and Universal Dependency treebanks shows that German compounds are parsed and annotated very unevenly, so that currently one cannot reliably identify or access ad hoc compounds with attitudinal meaning in texts. Moreover, we report initial experiments with large language models underlining the challenges in capturing attitudinal meanings conveyed by ad hoc compounds. We consequently suggest a systematized way of annotating (and thereby also parsing) ad hoc compounds that is based on positive experiences from within the multilingual ParGram grammar development effort.
+ 2024.mwe-1.27
+ yu-etal-2024-ad
+
+
+
diff --git a/data/xml/2024.neusymbridge.xml b/data/xml/2024.neusymbridge.xml
new file mode 100644
index 0000000000..2d8ad8b37e
--- /dev/null
+++ b/data/xml/2024.neusymbridge.xml
@@ -0,0 +1,78 @@
+
+
+
+
+ Proceedings of the Workshop: Bridging Neurons and Symbols for Natural Language Processing and Knowledge Graphs Reasoning (NeusymBridge) @ LREC-COLING-2024
+ TiansiDong
+ ErhardHinrichs
+ ZhenHan
+ KangLiu
+ YangqiuSong
+ YixinCao
+ Christian F.Hempelmann
+ RafetSifa
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.neusymbridge-1
+ neusymbridge
+ ws
+
+
+ 2024.neusymbridge-1.0
+ neusymbridge-ws-2024-bridging
+
+
+ Probing Large Language Models from a Human Behavioral Perspective
+ XintongWang
+ XiaoyuLi
+ XingshanLi
+ ChrisBiemann
+ 1–7
+ Large Language Models (LLMs) have emerged as dominant foundational models in modern NLP. However, the understanding of their prediction processes and internal mechanisms, such as feed-forward networks (FFN) and multi-head self-attention (MHSA), remains largely unexplored. In this work, we probe LLMs from a human behavioral perspective, correlating values from LLMs with eye-tracking measures, which are widely recognized as meaningful indicators of human reading patterns. Our findings reveal that LLMs exhibit a similar prediction pattern with humans but distinct from that of Shallow Language Models (SLMs). Moreover, with the escalation of LLM layers from the middle layers, the correlation coefficients also increase in FFN and MHSA, indicating that the logits within FFN increasingly encapsulate word semantics suitable for predicting tokens from the vocabulary.
+ 2024.neusymbridge-1.1
+ wang-etal-2024-probing
+
+
+ The Semantic Relations in LLMs: An Information-theoretic Compression Approach
+ Yu-HsiangTseng
+ Pin-ErChen
+ Da-ChenLian
+ Shu-KaiHsieh
+ 8–21
+ Compressibility is closely related to the predictability of the texts from the information theory viewpoint. As large language models (LLMs) are trained to maximize the conditional probabilities of upcoming words, they may capture the subtlety and nuances of the semantic constraints underlying the texts, and texts aligning with the encoded semantic constraints are more compressible than those that do not. This paper systematically tests whether and how LLMs can act as compressors of semantic pairs. Using semantic relations from English and Chinese Wordnet, we empirically demonstrate that texts with correct semantic pairings are more compressible than incorrect ones, measured by the proposed compression advantages index. We also show that, with the Pythia model suite and a fine-tuned model on Chinese Wordnet, compression capacities are modulated by the model’s seen data. These findings are consistent with the view that LLMs encode the semantic knowledge as underlying constraints learned from texts and can act as compressors of semantic information or potentially other structured knowledge.
+ 2024.neusymbridge-1.2
+ tseng-etal-2024-semantic
+
+
+ Word Sense Disambiguation as a Game of Neurosymbolic Darts
+ TiansiDong
+ RafetSifa
+ 22–32
+ Word Sense Disambiguation (WSD) is one of the hardest tasks in natural language understanding and knowledge engineering. The glass ceiling of the 80% F1 score is recently achieved through supervised learning, enriched by knowledge graphs. Here, we propose a novel neurosymbolic methodology that may push the F1 score above 90%. The core of our methodology is a neurosymbolic sense embedding, in terms of a configuration of nested n-dimensional balls. The central point of a ball well preserves pre-trained word embeddings learned from data, which partially fixes the locations of balls. Inclusion relations among balls precisely encode symbolic hypernym relations among senses, and enable simple logic deduction among sense embeddings. We trained a Transformer to learn the mapping from a contextualized word embedding to its sense ball embedding, just like playing the game of darts (a game of shooting darts into a dartboard). A series of experiments are carried out using pre-trained n-ball embeddings, which cover around 70% training data and 75% testing data in the benchmark WSD corpus. Euclidean distance and cosine similarity functions are used as objective functions, separately, and each reaches >95.0% F1 score in the ALL-n-ball dataset. This substantially breaks the glass ceiling of deep learning methods. Future work is discussed to develop a full-fledged neurosymbolic WSD system that substantially outperforms deep learning approaches.
+ 2024.neusymbridge-1.3
+ dong-sifa-2024-word
+
+
+ Open Event Causality Extraction by the Assistance of LLM in Task Annotation, Dataset, and Method
+ KunLuo
+ TongZhou
+ YuboChen
+ JunZhao
+ KangLiu
+ 33–44
+ Event Causality Extraction (ECE) aims to extract explicit causal relations between event pairs from the text. However, the event boundary deviation and the causal event pair mismatching are two crucial challenges that remain unaddressed. To address the above issues, we propose a paradigm to utilize LLM to optimize the task definition, evolve the datasets, and strengthen our proposed customized Contextual Highlighting Event Causality Extraction framework (CHECE). Specifically in CHECE, we propose an Event Highlighter and an Event Concretization Module, guiding the model to represent the event by a higher-level cluster and consider its causal counterpart in event boundary prediction to deal with event boundary deviation. We also propose a Contextual Event Causality Matching mechanism, applying LLM to diversify the content templates, forcing the model to learn causality from context and targeting causal event pair mismatching. Experimental results on two ECE datasets demonstrate the effectiveness of our method.
+ 2024.neusymbridge-1.4
+ luo-etal-2024-open
+
+
+ The Need for Grounding in LLM-based Dialogue Systems
+ KristiinaJokinen
+ 45–52
+ Grounding is a pertinent part of the design of LLM-based dialogue systems. Although research on grounding has a long tradition, the paradigm shift caused by LLMs has brought the concept onto the foreground, in particular in the context of cognitive robotics. To avoid generation of irrelevant or false information, the system needs to ground its utterances into real-world events, and to avoid the statistical parrot effect, the system needs to construct shared understanding of the dialogue context and of the partner’s intents. Grounding and construction of the shared context enables cooperation between the participants, and thus supports trustworthy interaction. This paper discusses grounding using neural LLM technology. It aims to bridge neural and symbolic computing on the cognitive architecture level, so as to contribute to a better understanding of how conversational reasoning and collaboration can be linked to LLM implementations to support trustworthy and flexible interaction.
+ 2024.neusymbridge-1.5
+ jokinen-2024-need
+
+
+
diff --git a/data/xml/2024.nlp4hr.xml b/data/xml/2024.nlp4hr.xml
index b25518ee99..d317fe9467 100644
--- a/data/xml/2024.nlp4hr.xml
+++ b/data/xml/2024.nlp4hr.xml
@@ -29,6 +29,7 @@
Recent years have brought significant advances to Natural Language Processing (NLP), which enabled fast progress in the field of computational job market analysis. Core tasks in this application domain are skill extraction and classification from job postings. Because of its quick growth and its interdisciplinary nature, there is no exhaustive assessment of this field. This survey aims to fill this gap by providing a comprehensive overview of deep learning methodologies, datasets, and terminologies specific to NLP-driven skill extraction. Our comprehensive cataloging of publicly available datasets addresses the lack of consolidated information on dataset creation and characteristics. Finally, the focus on terminology addresses the current lack of consistent definitions for important concepts, such as hard and soft skills, and terms relating to skill extraction and classification.
2024.nlp4hr-1.1
senger-etal-2024-deep
+
Aspect-Based Sentiment Analysis for Open-Ended HR Survey Responses
@@ -39,6 +40,7 @@
Understanding preferences, opinions, and sentiment of the workforce is paramount for effective employee lifecycle management. Open-ended survey responses serve as a valuable source of information. This paper proposes a machine learning approach for aspect-based sentiment analysis (ABSA) of Dutch open-ended responses in employee satisfaction surveys. Our approach aims to overcome the inherent noise and variability in these responses, enabling a comprehensive analysis of sentiments that can support employee lifecycle management. Through response clustering we identify six key aspects (salary, schedule, contact, communication, personal attention, agreements), which we validate by domain experts. We compile a dataset of 1,458 Dutch survey responses, revealing label imbalance in aspects and sentiments. We propose few-shot approaches for ABSA based on Dutch BERT models, and compare them against bag-of-words and zero-shot baselines. Our work significantly contributes to the field of ABSA by demonstrating the first successful application of Dutch pre-trained language models to aspect-based sentiment analysis in the domain of human resources (HR).
2024.nlp4hr-1.2
rink-etal-2024-aspect
+
Rethinking Skill Extraction in the Job Market Domain using Large Language Models
@@ -50,6 +52,7 @@
Skill Extraction involves identifying skills and qualifications mentioned in documents such as job postings and resumes. The task is commonly tackled by training supervised models using a sequence labeling approach with BIO tags. However, the reliance on manually annotated data limits the generalizability of such approaches. Moreover, the common BIO setting limits the ability of the models to capture complex skill patterns and handle ambiguous mentions. In this paper, we explore the use of in-context learning to overcome these challenges, on a benchmark of 6 uniformized skill extraction datasets. Our approach leverages the few-shot learning capabilities of large language models (LLMs) to identify and extract skills from sentences. We show that LLMs, despite not being on par with traditional supervised models in terms of performance, can better handle syntactically complex skill mentions in skill extraction tasks.
2024.nlp4hr-1.3
nguyen-etal-2024-rethinking
+
JobSkape: A Framework for Generating Synthetic Job Postings to Enhance Skill Matching
@@ -62,6 +65,7 @@
Recent approaches in skill matching, employing synthetic training data for classification or similarity model training, have shown promising results, reducing the need for time-consuming and expensive annotations. However, previous synthetic datasets have limitations, such as featuring only one skill per sentence and generally comprising short sentences. In this paper, we introduce JobSkape, a framework to generate synthetic data that tackles these limitations, specifically designed to enhance skill-to-taxonomy matching. Within this framework, we create SkillSkape, a comprehensive open-source synthetic dataset of job postings tailored for skill-matching tasks. We introduce several offline metrics that show that our dataset resembles real-world data. Additionally, we present a multi-step pipeline for skill extraction and matching tasks using large language models (LLMs), benchmarking against known supervised methodologies. We outline that the downstream evaluation results on real-world data can beat baselines, underscoring its efficacy and adaptability.
2024.nlp4hr-1.4
magron-etal-2024-jobskape
+
HR-MultiWOZ: A Task Oriented Dialogue (TOD) Dataset for HR LLM Agent
@@ -86,6 +90,7 @@
Large language models have emerged as a useful technology for job matching, for both candidates and employers. Job matching is often based on a particular geographic location, such as a city or region. However, LMs have known biases, commonly derived from their training data. In this work, we aim to quantify the metropolitan size bias encoded within large language models, evaluating zero-shot salary, employer presence, and commute duration predictions in 384 of the United States’ metropolitan regions. Across all benchmarks, we observe correlations between metropolitan population and the accuracy of predictions, with the smallest 10 metropolitan regions showing upwards of 300% worse benchmark performance than the largest 10.
2024.nlp4hr-1.6
campanella-goot-2024-big
+
diff --git a/data/xml/2024.nlperspectives.xml b/data/xml/2024.nlperspectives.xml
new file mode 100644
index 0000000000..69f8d88042
--- /dev/null
+++ b/data/xml/2024.nlperspectives.xml
@@ -0,0 +1,190 @@
+
+
+
+
+ Proceedings of the 3rd Workshop on Perspectivist Approaches to NLP (NLPerspectives) @ LREC-COLING 2024
+ GavinAbercrombie
+ ValerioBasile
+ DavideBernardi
+ ShiranDudy
+ SimonaFrenda
+ LucyHavens
+ SaraTonelli
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.nlperspectives-1
+ nlperspectives
+ ws
+
+
+ 2024.nlperspectives-1.0
+ nlperspectives-2024-perspectivist
+
+
+ Is a picture of a bird a bird? A mixed-methods approach to understanding diverse human perspectives and ambiguity in machine vision models
+ AliciaParrish
+ SusanHao
+ SarahLaszlo
+ LoraAroyo
+ 1–18
+ Human experiences are complex and subjective. This subjectivity is reflected in the way people label images for machine vision models. While annotation tasks are often assumed to deliver objective results, this assumption does not allow for the subjectivity of human experience. This paper examines the implications of subjective human judgments in the behavioral task of labeling images used to train machine vision models. We identify three primary sources of ambiguity: (1) depictions of labels in the images can be simply ambiguous, (2) raters’ backgrounds and experiences can influence their judgments and (3) the way the labeling task is defined can also influence raters’ judgments. By taking steps to address these sources of ambiguity, we can create more robust and reliable machine vision models.
+ 2024.nlperspectives-1.1
+ parrish-etal-2024-picture
+
+
+ Wisdom of Instruction-Tuned Language Model Crowds. Exploring Model Label Variation
+ Flor MiriamPlaza-del-Arco
+ DeboraNozza
+ DirkHovy
+ 19–30
+ Large Language Models (LLMs) exhibit remarkable text classification capabilities, excelling in zero- and few-shot learning (ZSL and FSL) scenarios. However, since they are trained on different datasets, performance varies widely across tasks between those models. Recent studies emphasize the importance of considering human label variation in data annotation. However, how this human label variation also applies to LLMs remains unexplored. Given this likely model specialization, we ask: Do aggregate LLM labels improve over individual models (as for human annotators)? We evaluate four recent instruction-tuned LLMs as “annotators” on five subjective tasks across four languages. We use ZSL and FSL setups and label aggregation from human annotation. Aggregations are indeed substantially better than any individual model, benefiting from specialization in diverse tasks or languages. Surprisingly, FSL does not surpass ZSL, as it depends on the quality of the selected examples. However, there seems to be no good information-theoretical strategy to select those. We find that no LLM method rivals even simple supervised models. We also discuss the tradeoffs in accuracy, cost, and moral/ethical considerations between LLM and human annotation.
+ 2024.nlperspectives-1.2
+ plaza-del-arco-etal-2024-wisdom
+
+
+ Revisiting Annotation of Online Gender-Based Violence
+ GavinAbercrombie
+ NikolasVitsakis
+ AiqiJiang
+ IoannisKonstas
+ 31–41
+ Online Gender-Based Violence is an increasing problem, but existing datasets fail to capture the plurality of possible annotator perspectives or ensure representation of affected groups. In a pilot study, we revisit the annotation of a widely used dataset to investigate the relationship between annotator identities and underlying attitudes and the responses they give to a sexism labelling task. We collect demographic and attitudinal information about crowd-sourced annotators using two validated surveys from Social Psychology. While we do not find any correlation between underlying attitudes and annotation behaviour, ethnicity does appear to be related to annotator responses for this pool of crowd-workers. We also conduct initial classification experiments using Large Language Models, finding that a state-of-the-art model trained with human feedback benefits from our broad data collection to perform better on the new labels. This study represents the initial stages of a wider data collection project, in which we aim to develop a taxonomy of GBV in partnership with affected stakeholders.
+ 2024.nlperspectives-1.3
+ abercrombie-etal-2024-revisiting
+
+
+ A Perspectivist Corpus of Numbers in Social Judgements
+ MarlonMay
+ LucieFlek
+ CharlesWelch
+ 42–48
+ With growing interest in the use of large language models, it is becoming increasingly important to understand whose views they express. These models tend to generate output that conforms to majority opinion and are not representative of diverse views. As a step toward building models that can take differing views into consideration, we build a novel corpus of social judgements. We crowdsourced annotations of a subset of the Commonsense Norm Bank that contained numbers in the situation descriptions and asked annotators to replace the number with a range defined by a start and end value that, in their view, correspond to the given verdict. Our corpus contains unaggregated annotations and annotator demographics. We describe our annotation process for social judgements and will release our dataset to support future work on numerical reasoning and perspectivist approaches to natural language processing.
+ 2024.nlperspectives-1.4
+ may-etal-2024-perspectivist
+
+
+ An Overview of Recent Approaches to Enable Diversity in Large Language Models through Aligning with Human Perspectives
+ BenedettaMuscato
+ Chandana SreeMala
+ MartaMarchiori Manerba
+ GizemGezici
+ FoscaGiannotti
+ 49–55
+ The varied backgrounds and experiences of human annotators inject different opinions and potential biases into the data, inevitably leading to disagreements. Yet, traditional aggregation methods fail to capture individual judgments since they rely on the notion of a single ground truth. Our aim is to review prior contributions to pinpoint the shortcomings that might cause stereotypical content generation. As a preliminary study, our purpose is to investigate state-of-the-art approaches, primarily focusing on the following two research directions. First, we investigate how adding subjectivity aspects to LLMs might guarantee diversity. We then look into the alignment between humans and LLMs and discuss how to measure it. Considering existing gaps, our review explores possible methods to mitigate the perpetuation of biases targeting specific communities. However, we recognize the potential risk of disseminating sensitive information due to the utilization of socio-demographic data in the training process. These considerations underscore the inclusion of diverse perspectives while taking into account the critical importance of implementing robust safeguards to protect individuals’ privacy and prevent the inadvertent propagation of sensitive information.
+ 2024.nlperspectives-1.5
+ muscato-etal-2024-overview
+
+
+ Disagreement in Argumentation Annotation
+ AnnaLindahl
+ 56–66
+ Disagreement, perspective or error? There is a growing discussion against the idea of a unified ground truth in annotated data, as well as the usefulness of such a ground truth and resulting gold standard. In data perspectivism, this issue is exemplified with tasks such as hate speech or sentiment classification in which annotators’ different perspectives are important to include. In this paper we turn to argumentation, a related field which has had less focus from this point of view. Argumentation is difficult to annotate for several reasons, from the more practical parts of deciding where the argumentation begins and ends to questions of how argumentation is defined and what it consists of. Learning more about disagreement is therefore important in order to improve argument annotation and to better utilize argument annotated data. Because of this, we examine disagreement in two corpora annotated with argumentation both manually and computationally. We find that disagreement is often not because of annotation errors or mistakes but due to the possibility of multiple possible interpretations. More specifically, these interpretations can be over boundaries, label or existence of argumentation. These results emphasize the need for more thorough analysis of disagreement in data, outside of the more common inter-annotator agreement measures.
+ 2024.nlperspectives-1.6
+ lindahl-2024-disagreement
+
+
+ Moral Disagreement over Serious Matters: Discovering the Knowledge Hidden in the Perspectives
+ Anny D.Alvarez Nogales
+ OscarAraque
+ 67–77
+ Moral values significantly define decision-making processes, notably on contentious issues like global warming. The Moral Foundations Theory (MFT) delineates morality and aims to reconcile moral expressions across cultures, yet different interpretations arise, posing challenges for computational modeling. This paper addresses the need to incorporate diverse moral perspectives into the learning systems used to estimate morality in text. To do so, it explores how training language models with varied annotator perspectives affects the performance of the learners. Building on top of this, this work also proposes an ensemble method that exploits the diverse perspectives of annotators to construct a more robust moral estimation model. Additionally, we investigate the automated identification of texts that pose annotation challenges, enhancing the understanding of linguistic cues towards annotator disagreement. To evaluate the proposed models we use the Moral Foundations Twitter Corpus (MFTC), a resource that is currently the reference for modeling moral values in computational social sciences. We observe that incorporating the diverse perspectives of annotators into an ensemble model benefits the learning process, showing large improvements in the classification performance. Finally, the results also indicate that instances that convey strong moral meaning are more challenging to annotate.
+ 2024.nlperspectives-1.7
+ alvarez-nogales-araque-2024-moral
+
+
+ Perspectives on Hate: General vs. Domain-Specific Models
+ GiuliaRizzi
+ MicheleFontana
+ ElisabettaFersini
+ 78–83
+ The rise of online hostility, combined with broad social media use, leads to the necessity of the comprehension of its human impact. However, the process of hate identification is challenging because, on the one hand, the line between healthy disagreement and poisonous speech is not well defined, and, on the other hand, multiple socio-cultural factors or prior beliefs shape people’s perceptions of potentially harmful text. To address disagreements in hate speech identification, Natural Language Processing (NLP) models must capture several perspectives. This paper introduces a strategy based on the Contrastive Learning paradigm for detecting disagreements in hate speech using pre-trained language models. Two approaches are proposed: the General Model, a comprehensive framework, and the Domain-Specific Model, which focuses on more specific hate-related tasks. The source code is available at https://anonymous.4open.science/r/Disagreement-530C.
+ 2024.nlperspectives-1.8
+ rizzi-etal-2024-perspectives
+
+
+ Soft metrics for evaluation with disagreements: an assessment
+ GiuliaRizzi
+ ElisaLeonardelli
+ MassimoPoesio
+ AlexandraUma
+ MajaPavlovic
+ SilviuPaun
+ PaoloRosso
+ ElisabettaFersini
+ 84–94
+ The move towards preserving judgement disagreements in NLP requires the identification of adequate evaluation metrics. We identify a set of key properties that such metrics should have, and assess the extent to which natural candidates for soft evaluation such as Cross Entropy satisfy such properties. We employ a theoretical framework, supported by a visual approach, by practical examples, and by the analysis of a real case scenario. Our results indicate that Cross Entropy can result in fairly paradoxical results in some cases, whereas other measures, such as Manhattan distance and Euclidean distance, exhibit a more intuitive behavior, at least for the case of binary classification.
+ 2024.nlperspectives-1.9
+ rizzi-etal-2024-soft
+
+
+ Designing NLP Systems That Adapt to Diverse Worldviews
+ ClaudiuCreanga
+ Liviu P.Dinu
+ 95–99
+ Natural Language Inference (NLI) is foundational for evaluating language understanding in AI. However, progress has plateaued, with models failing on ambiguous examples and exhibiting poor generalization. We argue that this stems from disregarding the subjective nature of meaning, which is intrinsically tied to an individual’s weltanschauung (which roughly translates to worldview). Existing NLP datasets often obscure this by aggregating labels or filtering out disagreement. We propose a perspectivist approach: building datasets that capture annotator demographics, values, and justifications for their labels. Such datasets would explicitly model diverse worldviews. Our initial experiments with a subset of the SBIC dataset demonstrate that even limited annotator metadata can improve model performance.
+ 2024.nlperspectives-1.10
+ creanga-dinu-2024-designing
+
+
+ The Effectiveness of LLMs as Annotators: A Comparative Overview and Empirical Analysis of Direct Representation
+ MajaPavlovic
+ MassimoPoesio
+ 100–110
+ Recent studies focus on exploring the capability of Large Language Models (LLMs) for data annotation. Our work, firstly, offers a comparative overview of twelve such studies that investigate labelling with LLMs, particularly focusing on classification tasks. Secondly, we present an empirical analysis that examines the degree of alignment between the opinion distributions returned by GPT and those provided by human annotators across four subjective datasets. Our analysis supports a minority of studies that are considering diverse perspectives when evaluating data annotation tasks and highlights the need for further research in this direction.
+ 2024.nlperspectives-1.11
+ pavlovic-poesio-2024-effectiveness
+
+
+ What Does Perspectivism Mean? An Ethical and Methodological Countercriticism
+ MathieuValette
+ 111–115
+ In this paper, we address the epistemological and ethical break of perspectivism in NLP. First, we propose to consider data annotation from the point of view of the scientific management of annotation work - which is part of the automation process inherent in NLP, in order to ideologically situate the perspectivist paradigm. We then analyze some of the concepts of perspectivism (in particular, truth). Finally, based on this analysis, we formulate a set of proposals aimed at overcoming the observed limitations of corpus annotation in general and perspectivism in particular.
+ 2024.nlperspectives-1.12
+ valette-2024-perspectivism
+
+
+ OrigamIM: A Dataset of Ambiguous Sentence Interpretations for Social Grounding and Implicit Language Understanding
+ LiesbethAllein
+ Marie-FrancineMoens
+ 116–122
+ Sentences elicit different interpretations and reactions among readers, especially when there is ambiguity in their implicit layers. We present a first-of-its kind dataset of sentences from Reddit, where each sentence is annotated with multiple interpretations of its meanings, understandings of implicit moral judgments about mentioned people, and reader impressions of its author. Scrutiny of the dataset proves the evoked variability and polarity in reactions. It further shows that readers strongly disagree on both the presence of implied judgments and the social acceptability of the behaviors they evaluate. In all, the dataset offers a valuable resource for socially grounding language and modeling the intricacies of implicit language understanding from multiple reader perspectives.
+ 2024.nlperspectives-1.13
+ allein-moens-2024-origamim
+
+
+ Linguistic Fingerprint in Transformer Models: How Language Variation Influences Parameter Selection in Irony Detection
+ MicheleMastromattei
+ Fabio MassimoZanzotto
+ 123–130
+ This paper explores the correlation between linguistic diversity, sentiment analysis and transformer model architectures. We aim to investigate how different English variations impact transformer-based models for irony detection. To conduct our study, we used the EPIC corpus to extract five diverse English variation-specific datasets and applied the KEN pruning algorithm on five different architectures. Our results reveal several similarities between optimal subnetworks, which provide insights into the linguistic variations that share strong resemblances and those that exhibit greater dissimilarities. We discovered that optimal subnetworks across models share at least 60% of their parameters, emphasizing the significance of parameter values in capturing and interpreting linguistic variations. This study highlights the inherent structural similarities between models trained on different variants of the same language and also the critical role of parameter values in capturing these nuances.
+ 2024.nlperspectives-1.14
+ mastromattei-zanzotto-2024-linguistic
+
+
+ Intersectionality in AI Safety: Using Multilevel Models to Understand Diverse Perceptions of Safety in Conversational AI
+ ChristopherHoman
+ GregorySerapio-Garcia
+ LoraAroyo
+ MarkDiaz
+ AliciaParrish
+ VinodkumarPrabhakaran
+ AlexTaylor
+ DingWang
+ 131–141
+ State-of-the-art conversational AI exhibits a level of sophistication that promises to have profound impacts on many aspects of daily life, including how people seek information, create content, and find emotional support. It has also shown a propensity for bias, offensive language, and false information. Consequently, understanding and moderating safety risks posed by interacting with AI chatbots is a critical technical and social challenge. Safety annotation is an intrinsically subjective task, where many factors—often intersecting—determine why people may express different opinions on whether a conversation is safe. We apply Bayesian multilevel models to surface factors that best predict rater behavior to a dataset of 101,286 annotations of conversations between humans and an AI chatbot, stratified by rater gender, age, race/ethnicity, and education level. We show that intersectional effects involving these factors play significant roles in validating safety in conversational AI data. For example, race/ethnicity and gender show strong intersectional effects, particularly among South Asian and East Asian women. We also find that conversational degree of harm impacts raters of all race/ethnicity groups, but that Indigenous and South Asian raters are particularly sensitive. Finally, we discover that the effect of education is uniquely intersectional for Indigenous raters. Our results underscore the utility of multilevel frameworks for uncovering underrepresented social perspectives.
+ 2024.nlperspectives-1.15
+ homan-etal-2024-intersectionality
+
+
+ A Dataset for Multi-Scale Film Rating Inference from Reviews
+ FrankieRobertson
+ StefanoLeone
+ 142–150
+ This resource paper introduces a dataset for multi-scale rating inference of film review scores based upon review summaries. The dataset and task are unique in pairing a text regression problem with ratings given on multiple scales, e.g. the A-F letter scale and the 4-point star scale. It retains entity identifiers such as film and reviewer names. The paper describes the construction of the dataset before exploring potential baseline architectures for the task, and evaluating their performance. Baselines based on classifier-per-scale, affine-per-scale, and ordinal regression models are presented and evaluated with the BERT-base backbone. Additional experiments are used to ground a discussion of the different architectures’ merits and drawbacks with regards to explainability and model interpretation.
+ 2024.nlperspectives-1.16
+ robertson-leone-2024-dataset
+
+
+
diff --git a/data/xml/2024.osact.xml b/data/xml/2024.osact.xml
new file mode 100644
index 0000000000..8b1279748c
--- /dev/null
+++ b/data/xml/2024.osact.xml
@@ -0,0 +1,202 @@
+
+
+
+
+ Proceedings of the 6th Workshop on Open-Source Arabic Corpora and Processing Tools (OSACT) with Shared Tasks on Arabic LLMs Hallucination and Dialect to MSA Machine Translation @ LREC-COLING 2024
+ HendAl-Khalifa
+ KareemDarwish
+ HamdyMubarak
+ MonaAli
+ TamerElsayed
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.osact-1
+ osact
+ ws
+
+
+ 2024.osact-1.0
+ osact-2024-open
+
+
+ AraTar: A Corpus to Support the Fine-grained Detection of Hate Speech Targets in the Arabic Language
+ SehamAlghamdi
+ YoucefBenkhedda
+ BasmaAlharbi
+ RizaBatista-Navarro
+ 1–12
+ We are currently witnessing a concerning surge in the spread of hate speech across various social media platforms, targeting individuals or groups based on their protected characteristics such as race, religion, nationality and gender. This paper focuses on the detection of hate type (Task 1) and hate target (Task 2) in the Arabic language. To comprehensively address this problem, we have combined and re-annotated hate speech tweets from existing publicly available corpora, resulting in the creation of AraTar, the first and largest Arabic corpus annotated with support for multi-label classification for both hate speech types and target detection with a high inter-annotator agreement. Additionally, we sought to determine the most effective machine learning-based approach for addressing this issue. To achieve this, we compare and evaluate different approaches, including: (1) traditional machine learning-based models, (2) deep learning-based models fed with contextual embeddings, and (3) fine-tuning language models (LMs). Our results demonstrate that fine-tuning LMs, specifically using AraBERTv0.2-twitter (base), achieved the highest performance, with a micro-averaged F1-score of 84.5% and 85.03%, and a macro-averaged F1-score of 77.46% and 73.15%, for Tasks 1 and 2, respectively.
+ 2024.osact-1.1
+ alghamdi-etal-2024-aratar
+
+
+ CLEANANERCorp: Identifying and Correcting Incorrect Labels in the ANERcorp Dataset
+ MashaelAlDuwais
+ HendAl-Khalifa
+ AbdulmalikAlSalman
+ 13–19
+ Label errors are a common issue in machine learning datasets, particularly for tasks such as Named Entity Recognition. Such label errors might hurt model training, affect evaluation results, and lead to an inaccurate assessment of model performance. In this study, we dived deep into one of the widely adopted Arabic NER benchmark datasets (ANERcorp) and found a significant number of annotation errors, missing labels, and inconsistencies. Therefore, in this study, we conducted empirical research to understand these errors, correct them and propose a cleaner version of the dataset named CLEANANERCorp. CLEANANERCorp will serve the research community as a more accurate and consistent benchmark.
+ 2024.osact-1.2
+ alduwais-etal-2024-cleananercorp
+
+
+ Munazarat 1.0: A Corpus of Arabic Competitive Debates
+ Mohammad M.Khader
+ AbdulGabbarAl-Sharafi
+ Mohamad HamzaAl-Sioufy
+ WajdiZaghouani
+ AliAl-Zawqari
+ 20–30
+ This paper introduces the Corpus of Arabic Competitive Debates (Munazarat). Despite the significance of competitive debating as an activity of fostering critical thinking and promoting dialogue, researchers within the fields of Arabic Natural Language Processing (NLP), linguistics, argumentation studies, and education have access to very limited datasets about competitive debating. At this study stage, we introduce Munazarat 1.0, which combines recordings of approximately 50 hours collected from 73 debates at QatarDebate-recognized tournaments, where all of those debates were available on YouTube. Munazarat is a novel specialized speech Arabic corpus, mostly in Modern Standard Arabic (MSA), consisting of diverse debating topics and showing rich metadata for each debate. The transcription of debates was done using Fenek, a speech-to-text Kanari AI tool, and three native Arabic speakers reviewed each transcription file to enhance the quality provided by the machine. The Munazarat 1.0 dataset can be used to train Arabic NLP tools, develop an argumentation mining machine, and analyze Arabic argumentation and rhetoric styles. Keywords: Arabic Speech Corpus, Modern Standard Arabic, Debates
+ 2024.osact-1.3
+ khader-etal-2024-munazarat
+
+
+ Leveraging Corpus Metadata to Detect Template-based Translation: An Exploratory Case Study of the Egyptian Arabic Wikipedia Edition
+ SaiedAlshahrani
+ Hesham HaroonMohammed
+ AliElfilali
+ MariamaNjie
+ JeannaMatthews
+ 31–45
+ Wikipedia articles (content pages) are commonly used corpora in Natural Language Processing (NLP) research, especially in low-resource languages other than English. Yet, a few research studies have studied the three Arabic Wikipedia editions, Arabic Wikipedia (AR), Egyptian Arabic Wikipedia (ARZ), and Moroccan Arabic Wikipedia (ARY), and documented issues in the Egyptian Arabic Wikipedia edition regarding the massive automatic creation of its articles using template-based translation from English to Arabic without human involvement, overwhelming the Egyptian Arabic Wikipedia with articles that do not only have low-quality content but also with articles that do not represent the Egyptian people, their culture, and their dialect. In this paper, we aim to mitigate the problem of template translation that occurred in the Egyptian Arabic Wikipedia by identifying these template-translated articles and their characteristics through exploratory analysis and building automatic detection systems. We first explore the content of the three Arabic Wikipedia editions in terms of density, quality, and human contributions and utilize the resulting insights to build multivariate machine learning classifiers leveraging articles’ metadata to detect the template-translated articles automatically. We then publicly deploy and host the best-performing classifier as an online application called ‘Egyptian Wikipedia Scanner’ and release the extracted, filtered, labeled, and preprocessed datasets to the research community to benefit from our datasets and the online, web-based detection system.
+ 2024.osact-1.4
+ alshahrani-etal-2024-leveraging
+
+
+ A Novel Approach for Root Selection in the Dependency Parsing
+ Sharefah AhmedAl-Ghamdi
+ HendAl-Khalifa
+ AbdulmalikAlSalman
+ 46–49
+ Although syntactic analysis using the sequence labeling method is promising, it can be problematic when the labels sequence does not contain a root label. This can result in errors in the final parse tree when the postprocessing method assumes the first word as the root. In this paper, we present a novel postprocessing method for BERT-based dependency parsing as sequence labeling. Our method leverages the root’s part of speech tag to select a more suitable root for the dependency tree, instead of using the default first token. We conducted experiments on nine dependency treebanks from different languages and domains, and demonstrated that our technique consistently improves the labeled attachment score (LAS) on most of them.
+ 2024.osact-1.5
+ al-ghamdi-etal-2024-novel
+
+
+ AraMed: Arabic Medical Question Answering using Pretrained Transformer Language Models
+ AshwagAlasmari
+ SarahAlhumoud
+ WaadAlshammari
+ 50–56
+ Medical Question Answering systems have gained significant attention in recent years due to their potential to enhance medical decision-making and improve patient care. However, most of the research in this field has focused on English-language datasets, limiting the generalizability of MQA systems to non-English speaking regions. This study introduces AraMed, a large-scale Arabic Medical Question Answering dataset addressing the limited resources available for Arabic medical question answering. AraMed comprises 270k question-answer pairs based on health consumer questions submitted to an online medical forum. Experiments using various deep learning models showcase the dataset’s effectiveness, particularly with AraBERT models achieving the highest results; specifically, AraBERTv2 obtained an F1 score of 96.73% in the answer selection task. The comparative analysis of different deep learning models provides insights into their strengths and limitations. These findings highlight the potential of AraMed for advancing Arabic medical question answering research and development.
+ 2024.osact-1.6
+ alasmari-etal-2024-aramed
+
+
+ The Multilingual Corpus of World’s Constitutions (MCWC)
+ MoEl-Haj
+ SaadEzzini
+ 57–66
+ The “Multilingual Corpus of World’s Constitutions” (MCWC) serves as a valuable resource for the NLP community, offering a comprehensive collection of constitutions from around the world. Its focus on data quality and breadth of coverage enables advanced research in constitutional analysis, machine translation, and cross-lingual legal studies. The MCWC prepares its data to ensure high quality and minimal noise, while also providing valuable mappings of constitutions to their respective countries and continents, facilitating comparative analysis. Notably, the corpus offers pairwise sentence alignments across languages, supporting machine translation experiments. We utilise a leading Machine Translation model, fine-tuned on the MCWC to achieve accurate and context-aware translations. Additionally, we introduce an independent Machine Translation model as a comparative baseline. Fine-tuning the model on the MCWC improves accuracy, highlighting the significance of such a legal corpus for NLP and Machine Translation. The MCWC’s rich multilingual content and rigorous data quality standards raise the bar for legal text analysis and inspire innovation in the NLP community, opening new avenues for studying constitutional texts and multilingual data analysis.
+ 2024.osact-1.7
+ el-haj-ezzini-2024-multilingual
+
+
+ TafsirExtractor: Text Preprocessing Pipeline preparing Classical Arabic Literature for Machine Learning Applications
+ CarlKruse
+ SajawelAhmed
+ 67–73
+ In this paper, we present a comprehensive tool for preprocessing Classical Arabic (CA) literature in the field of historical exegetical studies for machine learning (ML) evaluations. Most recent ML models require the training data to be in a specific format (e.g. XML, TEI, CoNLL) to use it afterwards for ML applications such as Named Entity Recognition (NER) or Topic Modeling (TM). We report on how our method works and can be applied by other researchers with similar endeavors. Thereby, the importance of this comprehensive preprocessing tool is demonstrated, as this novel approach has no predecessors for CA yet. We achieve results that enable the training of current ML models leading to state-of-the-art performance for NER and TM on CA literature. We make our tool, along with its source code and data, freely available for the Natural Language Processing (NLP) research community.
+ 2024.osact-1.8
+ kruse-ahmed-2024-tafsirextractor
+
+
+ Advancing the Arabic WordNet: Elevating Content Quality
+ Abed AlhakimFreihat
+ Hadi MahmoudKhalilia
+ GáborBella
+ FaustoGiunchiglia
+ 74–83
+ High-quality WordNets are crucial for achieving high-quality results in NLP applications that rely on such resources. However, the wordnets of most languages suffer from serious issues of correctness and completeness with respect to the words and word meanings they define, such as incorrect lemmas, missing glosses and example sentences, or an inadequate, Western-centric representation of the morphology and the semantics of the language. Previous efforts have largely focused on increasing lexical coverage while ignoring other qualitative aspects. In this paper, we focus on the Arabic language and introduce a major revision of the Arabic WordNet that addresses multiple dimensions of lexico-semantic resource quality. As a result, we updated more than 58% of the synsets of the existing Arabic WordNet by adding missing information and correcting errors. In order to address issues of language diversity and untranslatability, we also extended the wordnet structure by new elements: phrasets and lexical gaps.
+ 2024.osact-1.9
+ freihat-etal-2024-advancing
+
+
+ Arabic Speech Recognition of zero-resourced Languages: A case of Shehri (Jibbali) Language
+ Norah A.Alrashoudi
+ Omar SaidAlshahri
+ HendAl-Khalifa
+ 84–92
+ Many under-resourced languages lack computational resources for automatic speech recognition (ASR) due to data scarcity issues. This makes developing accurate ASR models challenging. Shehri or Jibbali, spoken in Oman, lacks extensive annotated speech data. This paper aims to improve an ASR model for this under-resourced language. We collected a Shehri (Jibbali) speech corpus and utilized transfer learning by fine-tuning pre-trained ASR models on this dataset. Specifically, models like Wav2Vec2.0, HuBERT and Whisper were fine-tuned using techniques like parameter-efficient fine-tuning. Evaluation using word error rate (WER) and character error rate (CER) showed that the Whisper model, fine-tuned on the Shehri (Jibbali) dataset, significantly outperformed other models, with the best results from Whisper-medium achieving 3.5% WER. This demonstrates the effectiveness of transfer learning for resource-constrained tasks, showing high zero-shot performance of pre-trained models.
+ 2024.osact-1.10
+ alrashoudi-etal-2024-arabic
+
+
+ OSACT6 Dialect to MSA Translation Shared Task Overview
+ Ashraf HatimElneima
+ AhmedElmogtaba Abdelmoniem AliAbdelaziz
+ KareemDarwish
+ 93–97
+ This paper presents the Dialectal Arabic (DA) to Modern Standard Arabic (MSA) Machine Translation (MT) shared task in the sixth Workshop on Open-Source Arabic Corpora and Processing Tools (OSACT6). The paper describes the creation of the validation and test data and the metrics used; and provides a brief overview of the submissions to the shared task. In all, 29 teams signed up and 6 teams made actual submissions. The teams used a variety of datasets and approaches to build their MT systems. The most successful submission involved using zero-shot and n-shot prompting of chatGPT.
+ 2024.osact-1.11
+ elneima-etal-2024-osact6
+
+
+ OSACT 2024 Task 2: Arabic Dialect to MSA Translation
+ HaninAtwany
+ NourRabih
+ IbrahimMohammed
+ AbdulWaheed
+ BhikshaRaj
+ 98–103
+ We present the results of the Shared Task “Dialect to MSA Translation”, which tackles challenges posed by the diverse Arabic dialects in machine translation. Covering Gulf, Egyptian, Levantine, Iraqi and Maghrebi dialects, the task offers 1001 sentences in both MSA and dialects for fine-tuning, alongside 1888 blind test sentences. Leveraging GPT-3.5, a state-of-the-art language model, our method achieved a BLEU score of 29.61. This endeavor holds significant implications for Neural Machine Translation (NMT) systems targeting low-resource languages with linguistic variation. Additionally, negative experiments involving fine-tuning AraT5 and No Language Left Behind (NLLB) using the MADAR Dataset resulted in BLEU scores of 10.41 and 11.96, respectively. Future directions include expanding the dataset to incorporate more Arabic dialects and exploring alternative NMT architectures to further enhance translation capabilities.
+ 2024.osact-1.12
+ atwany-etal-2024-osact
+
+
+ ASOS at OSACT6 Shared Task: Investigation of Data Augmentation in Arabic Dialect-MSA Translation
+ OmerNacar
+ AbdullahAlharbi
+ SerrySibaee
+ SamarAhmed
+ LahouariGhouti
+ AnisKoubaa
+ 104–111
+ The translation between Modern Standard Arabic (MSA) and the various Arabic dialects presents unique challenges due to the significant linguistic, cultural, and contextual variations across the regions where Arabic is spoken. This paper presents a system description of our participation in the OSACT 2024 Dialect to MSA Translation Shared Task. We explain our comprehensive approach which combines data augmentation techniques using generative pre-trained transformer models (GPT-3.5 and GPT-4) with fine-tuning of AraT5 V2, a model specifically designed for Arabic translation tasks. Our methodology has significantly expanded the training dataset, thus improving the model’s performance across five major Arabic dialects, namely Gulf, Egyptian, Levantine, Iraqi, and Maghrebi. We have rigorously evaluated our approach, using BLEU score, to ensure translation accuracy, fluency, and the preservation of meaning. Our results showcase the effectiveness of our refined models in addressing the challenges posed by diverse Arabic dialects and Modern Standard Arabic (MSA), achieving a BLEU score of 80% on the validation test set and 22.25% on the blind test set. However, it’s important to note that while utilizing a larger dataset, such as Madar + Dev, resulted in significantly higher evaluation BLEU scores, the performance on the blind test set was relatively lower. This observation underscores the importance of dataset size in model training, revealing potential limitations in generalization to unseen data due to variations in data distribution and domain mismatches.
+ 2024.osact-1.13
+ nacar-etal-2024-asos
+
+
+ LLM-based MT Data Creation: Dialectal to MSA Translation Shared Task
+ AhmedElmogtaba Abdelmoniem AliAbdelaziz
+ Ashraf HatimElneima
+ KareemDarwish
+ 112–116
+ This paper presents our approach to the Dialect to Modern Standard Arabic (MSA) Machine Translation shared task, conducted as part of the sixth Workshop on Open-Source Arabic Corpora and Processing Tools (OSACT6). Our primary contribution is the development of a novel dataset derived from The Saudi Audio Dataset for Arabic (SADA) an Arabic audio corpus. By employing an automated method utilizing ChatGPT 3.5, we translated the dialectal Arabic texts to their MSA equivalents. This process not only yielded a unique and valuable dataset but also showcased an efficient method for leveraging language models in dataset generation. Utilizing this dataset, alongside additional resources, we trained a machine translation model based on the Transformer architecture. Through systematic experimentation with model configurations, we achieved notable improvements in translation quality. Our findings highlight the significance of LLM-assisted dataset creation methodologies and their impact on advancing machine translation systems, particularly for languages with considerable dialectal diversity like Arabic.
+ 2024.osact-1.14
+ abdelaziz-etal-2024-llm
+
+
+ Sirius_Translators at OSACT6 2024 Shared Task: Fin-tuning Ara-T5 Models for Translating Arabic Dialectal Text to Modern Standard Arabic
+ Salwa SaadAlahmari
+ 117–123
+ This paper presents the findings from our participation in the 6th Workshop on Open-Source Arabic Corpora and Processing Tools (OSACT6) in 2024. Our specific focus was on the second task (Task 2), which involved translating text at the sentence level from five distinct Dialectal Arabic (DA) varieties (Gulf, Egyptian, Levantine, Iraqi, and Maghrebi) into Modern Standard Arabic (MSA). Our team, Sirius_Translators, fine-tuned four AraT5 models, namely AraT5 base, AraT5v2-base-1024, AraT5-MSA-Small, and AraT5-MSA-Base, for the Arabic machine translation (MT) task. These models were fine-tuned using a variety of parallel corpora containing Dialectal Arabic and Modern Standard Arabic. Based on the evaluation results of the OSACT6 2024 Shared Task 2, our fine-tuned AraT5v2-base-1024 model achieved an overall BLEU score of 21.0 on the development (Dev) set and 9.57 on the test set.
+ 2024.osact-1.15
+ alahmari-2024-sirius
+
+
+ AraT5-MSAizer: Translating Dialectal Arabic to MSA
+ MurhafFares
+ 124–129
+ This paper outlines the process of training the AraT5-MSAizer model, a transformer-based neural machine translation model aimed at translating five regional Arabic dialects into Modern Standard Arabic (MSA). Developed for Task 2 of the 6th Workshop on Open-Source Arabic Corpora and Processing Tools, the model attained a BLEU score of 21.79% on the test set associated with this task.
+ 2024.osact-1.16
+ fares-2024-arat5
+
+
+ ASOS at Arabic LLMs Hallucinations 2024: Can LLMs detect their Hallucinations :)
+ Serry TaiseerSibaee
+ AbdullahI. Alharbi
+ SamarAhmed
+ OmarNacar
+ LahouriGhouti
+ AnisKoubaa
+ 130–134
+ This research delves into the issue of hallucination detection in Large Language Models (LLMs) using Arabic language datasets. As LLMs are increasingly being used in various applications, the phenomenon of hallucination, which refers to generating factually inaccurate content despite grammatical coherence, poses significant challenges. We participate in the OSACT 2024 Shared-task (Detection of Hallucination in Arabic Factual Claims Generated by ChatGPT and GPT4). We explore various approaches for detecting and mitigating hallucination, using models such as GPT-4, Mistral, and Gemini within a novel experimental framework. Our research findings reveal that the effectiveness of these models in classifying claims into Fact-Claim, Fact-Improvement, and Non-Fact categories varies greatly, underscoring the complexities of addressing hallucination in morphologically rich languages. The study emphasizes the need for advanced modelling and training strategies to enhance the reliability and factual accuracy of LLM-generated content, laying the groundwork for future explorations in mitigating hallucination risks. In our experiments, we achieved an F1 score of 0.54 with the GPT-4 LLM.
+ 2024.osact-1.17
+ sibaee-etal-2024-asos
+
+
+
diff --git a/data/xml/2024.parlaclarin.xml b/data/xml/2024.parlaclarin.xml
new file mode 100644
index 0000000000..0faac55ae9
--- /dev/null
+++ b/data/xml/2024.parlaclarin.xml
@@ -0,0 +1,272 @@
+
+
+
+
+ Proceedings of the IV Workshop on Creating, Analysing, and Increasing Accessibility of Parliamentary Corpora (ParlaCLARIN) @ LREC-COLING 2024
+ DarjaFiser
+ MariaEskevich
+ DavidBordon
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.parlaclarin-1
+ parlaclarin
+ ws
+
+
+ 2024.parlaclarin-1.0
+ parlaclarin-ws-2024-iv
+
+
+ Parliamentary Discourse Research in Political Science: Literature Review
+ JureSkubic
+ DarjaFišer
+ 1–11
+ One of the major research interests for political science has always been the study of political discourse and parliamentary debates. This literature review offers an overview of the most prominent research methods used in political science when studying political discourse. We identify the commonalities and the differences of the political science and corpus-driven approaches and show how parliamentary corpora and corpus-based approaches could be successfully integrated in political science research.
+ 2024.parlaclarin-1.1
+ skubic-fiser-2024-parliamentary
+
+
+ Compiling and Exploring a Portuguese Parliamentary Corpus: ParlaMint-PT
+ JoséAires
+ AidaCardoso
+ RuiPereira
+ AmaliaMendes
+ 12–20
+ As part of the project ParlaMint II, a new corpus of the sessions of the Portuguese Parliament from 2015 to 2022 has been compiled, encoded and annotated following the ParlaMint guidelines. We report on the contents of the corpus and on the specific nature of the political settings in Portugal during the time period covered. Two subcorpora were designed that would enable comparisons of the political speeches between pre and post covid-19 pandemic. We discuss the pipeline applied to download the original texts, ensure their preprocessing and encoding in XML, and the final step of annotation. This new resource covers a period of changes in the political system in Portugal and will be an important source of data for political and social studies. Finally, we have explored the political stance on immigration in the ParlaMint-PT corpus.
+ 2024.parlaclarin-1.2
+ aires-etal-2024-compiling
+
+
+ Gender, Speech, and Representation in the Galician Parliament: An Analysis Based on the ParlaMint-ES-GA Dataset
+ Adina I.Vladu
+ ElisaFernández Rei
+ CarmenMagariños
+ NoeliaGarcía Díaz
+ 21–29
+ This paper employs the ParlaMint-ES-GA dataset to scrutinize the intersection of gender, speech, and representation within the Parliament of Galicia, an autonomous region located in North-western Spain. The research questions center around the dynamics of women’s participation in parliamentary proceedings. Contrary to numerical parity, we explore whether increased female presence in the parliament correlates with equitable access to the floor. Analyzing parliamentary proceedings from 2015 to 2022, our quantitative study investigates the relationship between the legislative body’s composition, the number of speeches by Members of Parliament (MPs), and references made by MPs in their speeches. The findings reveal nuances in gender representation and participation, challenging assumptions about proportional access to parliamentary discourse.
+ 2024.parlaclarin-1.3
+ vladu-etal-2024-gender
+
+
+ Bulgarian ParlaMint 4.0 corpus as a testset for Part-of-speech tagging and Named Entity Recognition
+ PetyaOsenova
+ KirilSimov
+ 30–35
+ The paper discusses some fine-tuned models for the tasks of part-of-speech tagging and named entity recognition. The fine-tuning was performed on the basis of an existing BERT pre-trained model and two newly pre-trained BERT models for Bulgarian that are cross-tested on the domain of the Bulgarian part of the ParlaMint corpora as a new domain. In addition, a comparison has been made between the performance of the new fine-tuned BERT models and the available results from the Stanza-based model which the Bulgarian part of the ParlaMint corpora has been annotated with. The observations show the weaknesses in each model as well as the common challenges.
+ 2024.parlaclarin-1.4
+ osenova-simov-2024-bulgarian
+
+
+ Resources and Methods for Analysing Political Rhetoric and Framing in Parliamentary Debates
+ InesRehbein
+ 36–37
+ Recent work in political science has made extensive use of NLP methods to produce evidential support for a variety of analyses, for example, inferring an actor’s ideological positions from textual data or identifying the polarisation of the political discourse over the last decades. Most work has employed variations of lexical features extracted from text or has learned latent representations in a mostly unsupervised manner. While such approaches have the potential to enable political analyses at scale, they are often limited by their lack of interpretability. In the talk, I will instead look at semantic and pragmatic representations of political rhetoric and ideological framing and present several case studies that showcase how linguistic annotation and the use of NLP methods can help to investigate different framing strategies in parliamentary debates. The first part of the talk investigates populist framing strategies, specifically, the use of pronouns to create in- and out-groups and the identification of people-centric messages. The second part of the presentation focusses on framing strategies on the pragmatic level.
+ 2024.parlaclarin-1.5
+ rehbein-2024-resources
+
+
+ PTPARL-V: Portuguese Parliamentary Debates for Voting Behaviour Study
+ AfonsoSousa
+ HenriqueLopes Cardoso
+ 38–42
+ We present a new dataset, PTPARL-V, that provides valuable insight for advancing discourse analysis of parliamentary debates in Portuguese. This is achieved by processing the open-access information available at the official Portuguese Parliament website and scraping the information from the debate minutes’ PDFs contained therein. Our dataset includes interventions from 547 different deputies of all major Portuguese parties, from 736 legislative initiatives spanning five legislatures from 2005 to 2021. We present a statistical analysis of the dataset compared to other publicly available Portuguese parliamentary debate corpora. Finally, we provide baseline performance analysis for voting behaviour classification.
+ 2024.parlaclarin-1.6
+ sousa-lopes-cardoso-2024-ptparl
+
+
+ Polish Round Table Corpus
+ MaciejOgrodniczuk
+ RyszardTuora
+ BeataWójtowicz
+ 43–47
+ The paper describes the process of preparation of the Polish Round Table Corpus (Pol. Korpus Okrągłego Stołu), a new resource documenting negotiations taking place in 1989 between the representatives of the communist government of the People’s Republic of Poland and the Solidarity opposition. The process consisted of OCR of graphical transcripts of the talks stored in the form of parliament-like stenographic transcripts, carrying out their manual correction and making them available for search in a concordancer currently used for standard parliamentary transcripts.
+ 2024.parlaclarin-1.7
+ ogrodniczuk-etal-2024-polish
+
+
+ Investigating Multilinguality in the Plenary Sessions of the Parliament of Finland with Automatic Language Identification
+ TommiJauhiainen
+ JussiPiitulainen
+ ErikAxelson
+ UteDieckmann
+ MiettaLennes
+ JyrkiNiemi
+ JackRueter
+ KristerLindén
+ 48–56
+ In this paper, we use automatic language identification to investigate the usage of different languages in the plenary sessions of the Parliament of Finland. Finland has two national languages, Finnish and Swedish. The plenary sessions are published as transcriptions of speeches in Parliament, reflecting the language the speaker used. In addition to charting out language use, we demonstrate how language identification can be used to audit the quality of the dataset. On the one hand, we made slight improvements to our language identifier; on the other hand, we made a list of improvement suggestions for the next version of the dataset.
+ 2024.parlaclarin-1.8
+ jauhiainen-etal-2024-investigating
+
+
+ Exploring Word Formation Trends in Written, Spoken, Translated and Interpreted European Parliament Data – A Case Study on Initialisms in English and German
+ KatrinMenzel
+ 57–65
+ This paper demonstrates the research potential of a unique European Parliament dataset for register studies, contrastive linguistics, translation and interpreting studies. The dataset consists of parallel data for several European languages, including written source texts and their translations as well as spoken source texts and the transcripts of their simultaneously interpreted versions. The paper presents a cross-linguistic, corpus-based case study on a word formation phenomenon in these European Parliament data that are enriched with various linguistic annotations and metadata as well as with information-theoretic surprisal scores. It addresses the questions of how initialisms are used across languages and production modes in the English and German corpus sections of these European Parliament data, whether there is a correlation between the use of initialisms and the use of their corresponding multiword full forms in the analysed corpus sections and what insights on the informativity and possible processing difficulties of initialisms we can gain from an analysis of information-theoretic surprisal values. The results show that English written originals and German translations are the corpus sections with the highest frequencies of initialisms. The majority of cross-language transfer situations lead to fewer initialisms in the target texts than in the source texts. In the English data, there is a positive correlation between the frequency of initialisms and the frequency of the respective full forms. There is a similar correlation in the German data, apart from the interpreted data. Additionally, the results show that initialisms represent peaks of information with regard to their surprisal values within their segments. Particularly the German data show higher surprisal values of initialisms in mediated language than in non-mediated discourse types, which indicates that in German mediated discourse, initialisms tend to be used in less conventionalised textual contexts than in English.
+ 2024.parlaclarin-1.9
+ 2024.parlaclarin-1.9.OptionalSupplementaryMaterial.docx
+ menzel-2024-exploring
+
+
+ Quantitative Analysis of Editing in Transcription Process in Japanese and European Parliaments and its Diachronic Changes
+ TatsuyaKawahara
+ 66–69
+ In making official transcripts for meeting records in Parliament, some edits are made to faithful transcripts of utterances for linguistic correction and formality. A classification of these edits is provided in this paper, and quantitative analysis is conducted for Japanese and European Parliamentary meetings by comparing the faithful transcripts of audio recordings against the official meeting records. Different trends are observed between the two Parliaments due to the nature of the language used and the meeting style. Moreover, diachronic changes in the Japanese transcripts are presented, showing a significant decrease in edits over the past decades. It was found that a majority of edits in the Japanese Parliament (Diet) simply remove fillers and redundant words, keeping the transcripts as verbatim as possible. This property is useful for the evaluation of the automatic speech transcription system, which we developed and which has been used in the Japanese Parliament.
+ 2024.parlaclarin-1.10
+ kawahara-2024-quantitative
+
+
+ Automated Emotion Annotation of Finnish Parliamentary Speeches Using GPT-4
+ OttoTarkka
+ JaakkoKoljonen
+ MarkusKorhonen
+ JuusoLaine
+ KristianMartiskainen
+ KimmoElo
+ VeronikaLaippala
+ 70–76
+ In this paper, we test the efficacy of using GPT-4 to annotate a dataset that is then used to train a BERT classifier for emotion analysis. Manual data annotation is often a laborious and expensive task, and emotion annotation, specifically, has proved difficult even for expert annotators. We show that using GPT-4 can produce results as good as manual data annotation while saving a lot of time and money. We train a BERT classifier on our automatically annotated dataset and get results that outperform a BERT classifier trained on machine translated data. Our paper shows how Large Language Models can be used to work with and analyse parliamentary corpora.
+ 2024.parlaclarin-1.11
+ tarkka-etal-2024-automated
+
+
+ Making Parliamentary Debates More Accessible: Aligning Video Recordings with Text Proceedings in Open Parliament TV
+ OlivierAubert
+ JoschaJäger
+ 77–83
+ We describe the Open Parliament TV project and, more specifically, our work on aligning video recordings with the text proceedings of the German Bundestag. This has allowed us to create a comprehensive and accessible platform for citizens and journalists to engage with parliamentary proceedings. Through our diligent work, we have ensured that the video recordings accurately correspond to the associated text, providing a seamless and synchronised experience for users. In this article, we describe the issues we faced and the methods we used to solve them, along with the visualisations we developed to investigate and assess the content.
+ 2024.parlaclarin-1.12
+ aubert-jager-2024-making
+
+
+ Russia and Ukraine through the Eyes of ParlaMint 4.0: A Collocational CADS Profile of Spanish and British Parliamentary Discourses
+ MariaCalzada Perez
+ 84–93
+ This article resorts to mixed methods to examine British and Spanish parliamentary discourse. The quantitative corpus-assisted (lexical priming) theory and data are complemented by the qualitative discourse historical approach. Two CLARIN ParlaMint corpora – ParlaMint-GB and ParlaMint-ES – are queried in the analysis, which focuses on English (“Russia” and “Ukraine”) and Spanish (“Rusia” and “Ucrania”) nodes and collocations. In sum, the analysis sketches a brief profile of each corpus. The British House of Commons is more homogenous, strongly associating “Russia” and “Ukraine” with their participation in the war. Furthermore, this chamber shows a greater interest in “Russia”. The Spanish Congreso de los Diputados indicates greater quantitative differences (heterogeneity). Here, “Russia” clearly transcends its role as a military contender and is also portrayed as an economic competitor for the West. Unlike in Britain, the Spanish lower house shows more mentions of “Ucrania”, which is assigned just one role – as an invasion victim. In conclusion, the productivity of corpus-assisted mixed methods is confirmed along with the precious value of the ParlaMint constellation.
+ 2024.parlaclarin-1.13
+ calzada-perez-2024-russia
+
+
+ Multilingual Power and Ideology identification in the Parliament: a reference dataset and simple baselines
+ ÇağrıÇöltekin
+ MatyášKopp
+ KatjaMeden
+ VaidasMorkevicius
+ NikolaLjubešić
+ TomažErjavec
+ 94–100
+ We introduce a dataset on political orientation and power position identification. The dataset is derived from ParlaMint, a set of comparable corpora of transcribed parliamentary speeches from 29 national and regional parliaments. We introduce the dataset, provide the reasoning behind some of the choices made during its creation, present statistics on the dataset, and, using a simple classifier, give baseline results on predicting political orientation on the left-to-right axis and on power position identification, i.e., distinguishing speeches delivered by governing coalition party members from those of opposition party members.
+ 2024.parlaclarin-1.14
+ coltekin-etal-2024-multilingual
+
+
+ IMPAQTS: a multimodal corpus of parliamentary and other political speeches in Italy (1946-2023), annotated with implicit strategies
+ FedericaCominetti
+ LorenzoGregori
+ EdoardoLombardi Vallauri
+ AlessandroPanunzi
+ 101–109
+ The paper introduces the IMPAQTS corpus of Italian political discourse, a multimodal corpus of around 2.65 million tokens comprising 1,500 speeches uttered by 150 prominent politicians spanning from 1946 to 2023. Covering the entire history of the Italian Republic, the collection is non-homogeneous in size, progressively increasing in quantity towards the present. The corpus is balanced according to textual and socio-linguistic criteria and includes different types of speeches. The sociolinguistic features of the speakers are carefully considered to ensure representation of Republican Italian politicians. For each speaker, the corpus contains 4 parliamentary speeches, 2 rallies, 1 party assembly, and 3 statements (in person or broadcast). Parliamentary speeches therefore constitute the largest section of the corpus (40% of the total), enabling direct comparison with other types of political speeches. The collection procedure, including details relevant to the transcription protocols, and the processing pipeline are described. The corpus has been pragmatically annotated to include information about implicitly conveyed questionable contents, paired with their explicit paraphrases, providing the largest Italian collection of ecological examples of implicit linguistic strategies. The adopted ontology of linguistic implicitness and the fine-grained annotation scheme are presented in detail.
+ 2024.parlaclarin-1.15
+ cominetti-etal-2024-impaqts
+
+
+ ParlaMint Ngram viewer: Multilingual Comparative Diachronic Search Across 26 Parliaments
+ Asherde Jong
+ TajaKuzman
+ MaikLarooij
+ MaartenMarx
+ 110–115
+ We demonstrate the multilingual search engine and Ngram viewer that was built on top of the ParlaMint dataset using the recently available translations. The user interface and SERP are carefully designed for querying parliamentary proceedings and for the intended use by citizens, journalists and political scholars. Demo at https://debateabase.wooverheid.nl. Keywords: Multilingual Search, Parliamentary Proceedings, Ngram Viewer, Machine Translation
+ 2024.parlaclarin-1.16
+ de-jong-etal-2024-parlamint
+
+
+ Investigating Political Ideologies through the Greek ParlaMint corpus
+ MariaGavriilidou
+ DimitrisGkoumas
+ SteliosPiperidis
+ ProkopisProkopidis
+ 116–120
+ This paper has two objectives: to present (a) the creation of ParlaMint-GR, the Greek part of the ParlaMint corpora of debates in the parliaments of Europe, and (b) preliminary results on its comparison with a corpus of Greek party manifestos, aiming at the investigation of the ideologies of the Greek political parties and members of the Parliament. Additionally, a gender-related comparison is explored. The creation of the ParlaMint-GR corpus is discussed, together with the solutions adopted for various challenges faced. The corpus of party manifestos, available through CLARIN:EL, serves for a comparative study with the corpus of speeches delivered by the members of the Greek Parliament, with the aim of identifying the ideological positions of parties and politicians.
+ 2024.parlaclarin-1.17
+ gavriilidou-etal-2024-investigating
+
+
+ ParlaMint in TEITOK
+ MaartenJanssen
+ MatyášKopp
+ 121–126
+ This paper describes the ParlaMint 4.0 parliamentary corpora as made available in TEITOK at LINDAT. The TEITOK interface makes it possible to search through the corpus, to view each session in a readable manner, and to explore the names in the corpus. The interface does not present any new data, but provides an access point to the ParlaMint corpus that is less oriented to linguistic use only, and more accessible for the general public or researchers from other fields.
+ 2024.parlaclarin-1.18
+ janssen-kopp-2024-parlamint
+
+
+ Historical Parliamentary Corpora Viewer
+ AlenkaKavčič
+ MartinStojanoski
+ MatijaMarolt
+ 127–132
+ Historical parliamentary debates offer a window into the past and provide valuable insights for academic research and historical analysis. This paper presents a novel web application tailored to the exploration of historical parliamentary corpora in the context of Slovenian national identity. The developed web viewer enables advanced search functions within collections of historical parliamentary records and has an intuitive and user-friendly interface. Users can enter search terms and apply filters to refine their search results. The search function allows keyword and phrase searching, including the ability to search by delegate and place names. It is also possible to search for translations of the text by selecting the desired languages. The search results are displayed with a preview of the proceedings and highlighted phrases that match the search query. To review a specific record, the full PDF document can be displayed in a separate view, allowing the user to scroll through the PDF document and search the content. In addition, the two corpora of Slovenian historical records integrated into the viewer—the Carniolan Provincial Assembly Corpus and the Parliamentary Corpus of the First Yugoslavia—are described and an insight into the corresponding preparation processes is provided.
+ 2024.parlaclarin-1.19
+ kavcic-etal-2024-historical
+
+
+ The dbpedia R Package: An Integrated Workflow for Entity Linking (for ParlaMint Corpora)
+ ChristophLeonhardt
+ AndreasBlaette
+ 133–144
+ Entity Linking is a powerful approach for linking textual data to established structured data such as survey data or administrative data. However, in the realm of social science, the approach is not widely adopted. We argue that this is, at least in part, due to specific setup requirements which constitute high barriers for usage and workflows which are not well integrated into analytical scenarios commonly deployed in social science research. We introduce the dbpedia R package to make the approach more accessible. It focuses on functionality that is easily adaptable to the needs of social scientists working with textual data, including the support of different input formats, limited setup costs and various output formats. Using a ParlaMint corpus, we show the applicability and flexibility of the approach for parliamentary debates.
+ 2024.parlaclarin-1.20
+ leonhardt-blaette-2024-dbpedia
+
+
+ Video Retrieval System Using Automatic Speech Recognition for the Japanese Diet
+ MikitakaMasuyama
+ TatsuyaKawahara
+ KenjiroMatsuda
+ 145–148
+ The Japanese House of Representatives, one of the two houses of the Diet, has adopted an Automatic Speech Recognition (ASR) system, which directly transcribes parliamentary speech with an accuracy of 95 percent. The ASR system also provides a timestamp for every word, which enables retrieval of the video segments of the Parliamentary meetings. The video retrieval system we have developed allows one to pinpoint and play the parliamentary video clips corresponding to the meeting minutes by keyword search. In this paper, we provide its overview and suggest various ways we can utilize the system. The system is currently extended to cover meetings of local governments, which will allow us to investigate dialectal linguistic variations.
+ 2024.parlaclarin-1.21
+ masuyama-etal-2024-video
+
+
+ One Year of Continuous and Automatic Data Gathering from Parliaments of European Union Member States
+ OtaMikušek
+ 149–153
+ This paper provides insight into automatic parliamentary corpora development. One year ago, I created a simple set of tools designed to continuously and automatically download, process, and create corpora from speeches in the parliaments of European Union member states. Although numerous corpora providing speeches from European Union parliaments already exist, these tools focus on collecting and building such corpora with minimal human interaction. They have been operating continuously for over a year, gathering parliamentary data and extending corpora that together contain more than one billion words. However, maintaining these tools has brought unforeseen challenges, including being blocked by some parliaments due to overloading them with requests, the inability to access the most recent data of a parliament, and effectively managing interrupted connections. Additionally, potential problems that may arise in the future are discussed, along with possible solutions. These include data loss prevention and adaptation to changes in the sources from which speeches are downloaded.
+ 2024.parlaclarin-1.22
+ mikusek-2024-one
+
+
+ Government and Opposition in Danish Parliamentary Debates
+ CostanzaNavarretta
+ DorteHaltrup Hansen
+ 154–162
+ In this paper, we address government and opposition speeches made by members of the Danish Parliament from 2014 to 2022. We use the linguistic annotations and metadata in ParlaMint-DK, one of the ParlaMint corpora, to investigate some characteristics of the transcribed speeches made by government and opposition and test how well classifiers can identify the speeches delivered by these groups. Our analyses confirm that there are differences in the speeches made by government and opposition, e.g., in the frequency of some modality expressions. In our study, we also include parties that neither directly support nor oppose the government, the “other” group. The best performing classifier for identifying speeches made by parties in government, in opposition or in the “other” group is a transformer with a pre-trained Danish BERT model, which gave an F1-score of 0.64. The same classifier obtained an F1-score of 0.77 on the binary identification of speeches made by government or opposition parties.
+ 2024.parlaclarin-1.23
+ navarretta-haltrup-hansen-2024-government
+
+
+ A new Resource and Baselines for Opinion Role Labelling in German Parliamentary Debates
+ InesRehbein
+ Simone PaoloPonzetto
+ 163–170
+ Detecting opinions, their holders and targets in parliamentary debates provides an interesting layer of analysis, for example, to identify frequent targets of opinions for specific topics, actors or parties. In the paper, we present GePaDe-ORL, a new dataset for German parliamentary debates where subjective expressions, their opinion holders and targets have been annotated. We describe the annotation process and report baselines for predicting those annotations in our new dataset.
+ 2024.parlaclarin-1.24
+ rehbein-ponzetto-2024-new
+
+
+ ParlaMint Widened: a European Dataset of Freedom of Information Act Documents (Position Paper)
+ GerdaViira
+ MaartenMarx
+ MaikLarooij
+ 171–172
+ This position paper makes an argument for creating a corpus similar to that of ParlaMint, consisting not of parliamentary proceedings but of documents released under Freedom of Information Acts. Over 100 countries, including almost all European countries, have such an act. Bringing these now dispersed document collections together in a uniform format in one portal will result in a valuable language resource. Besides that, our Dutch experience shows that such wider exposure of these documents leads to efforts to improve their quality at the sources. Keywords: Freedom of Information Act, ParlaMint, Government Data
+ 2024.parlaclarin-1.25
+ viira-etal-2024-parlamint
+
+
+
diff --git a/data/xml/2024.politicalnlp.xml b/data/xml/2024.politicalnlp.xml
new file mode 100644
index 0000000000..c09e828d14
--- /dev/null
+++ b/data/xml/2024.politicalnlp.xml
@@ -0,0 +1,138 @@
+
+
+
+
+ Proceedings of the Second Workshop on Natural Language Processing for Political Sciences @ LREC-COLING 2024
+ HaithemAfli
+ HoudaBouamor
+ Cristina BlasiCasagran
+ SaharGhannay
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.politicalnlp-1
+ politicalnlp
+ ws
+
+
+ 2024.politicalnlp-1.0
+ politicalnlp-2024-natural
+
+
+ Deciphering Political Entity Sentiment in News with Large Language Models: Zero-Shot and Few-Shot Strategies
+ AlapanKuila
+ SudeshnaSarkar
+ 1–11
+ Sentiment analysis plays a pivotal role in understanding public opinion, particularly in the political domain where the portrayal of entities in news articles influences public perception. In this paper, we investigate the effectiveness of Large Language Models (LLMs) in predicting entity-specific sentiment from political news articles. Leveraging zero-shot and few-shot strategies, we explore the capability of LLMs to discern sentiment towards political entities in news content. Employing a chain-of-thought (COT) approach augmented with rationale in few-shot in-context learning, we assess whether this method enhances sentiment prediction accuracy. Our evaluation on sentiment-labeled datasets demonstrates that LLMs outperform fine-tuned BERT models in capturing entity-specific sentiment. We find that in-context learning significantly improves model performance, while the self-consistency mechanism enhances consistency in sentiment prediction. Despite the promising results, we observe inconsistencies in the effectiveness of the COT prompting method. Overall, our findings underscore the potential of LLMs in entity-centric sentiment analysis within the political news domain and highlight the importance of suitable prompting strategies and model architectures.
+ 2024.politicalnlp-1.1
+ kuila-sarkar-2024-deciphering
+
+
+ Event Detection in the Socio Political Domain
+ EmmanuelCartier
+ HristoTanev
+ 12–21
+ In this paper we present two approaches for the detection of socio-political events: the first is based on manually crafted keyword combinations and the second is based on a BERT classifier. We compare the performance of the two systems on a dataset of socio-political events. Interestingly, the systems demonstrate complementary performance, each showing its best accuracy on non-overlapping sets of event types. In the evaluation section we provide insights on the effect of taxonomy mapping on event detection evaluation. In the related work section we also review the most important resources and approaches for event extraction in recent years.
+ 2024.politicalnlp-1.2
+ cartier-tanev-2024-event
+
+
+ Multi-Dimensional Insights: Annotated Dataset of Stance, Sentiment, and Emotion in Facebook Comments on Tunisia’s July 25 Measures
+ SanaaLaabar
+ WajdiZaghouani
+ 22–32
+ On July 25, 2021, Tunisian President Kais Saied announced the suspension of parliament and dismissal of Prime Minister Hichem Mechichi, a move that sparked intense public debate. This study investigates Tunisian public opinion regarding these events by analyzing a corpus of 7,535 Facebook comments collected from the official Tunisian presidency page, specifically the post announcing the July 25 measures. A team of three annotators labeled a subset of 5,000 comments, categorizing each comment’s political stance (supportive, opposing, or neutral), sentiment (positive, negative, or neutral), emotions, presence of hate speech, aggressive tone, and racism. The inter-annotator agreement, measured by Cohen’s kappa, was 0.61, indicating substantial consensus. The analysis reveals that a majority of commenters supported President Saied’s actions, outnumbering those who opposed or took a neutral stance. Moreover, the overall sentiment expressed in the comments was predominantly positive. This study provides valuable insights into the complex landscape of public opinion in Tunisia during a crucial moment in the country’s ongoing political transformation, highlighting the role of social media as a platform for political discourse and engagement.
+ 2024.politicalnlp-1.3
+ laabar-zaghouani-2024-multi
+
+
+ Masking Explicit Pro-Con Expressions for Development of a Stance Classification Dataset on Assembly Minutes
+ TomoyosiAkiba
+ YukiGato
+ YasutomoKimura
+ YuzuUchida
+ KeiichiTakamaru
+ 33–38
+ In this paper, a new dataset for Stance Classification based on assembly minutes is introduced. We develop it using publicly available minutes from diverse Japanese local governments, including prefectural, city, and town assemblies. In order to make the task one of predicting a stance from the content of a politician’s utterance without explicit stance expressions, predefined words that directly convey the speaker’s stance in the utterance are replaced by a special token. Those masked words are also used to assign a gold label, either agreement or disagreement, to the utterance. Finally, we constructed a total of 15,018 instances automatically from 47 Japanese local governments. The dataset was used in the shared Stance Classification task evaluated in NTCIR-17 QA-Lab-PoliInfo-4 and is now publicly available. Since the construction method is automatic, we can still apply it to obtain more instances from other Japanese local governments.
+ 2024.politicalnlp-1.4
+ akiba-etal-2024-masking
+
+
+ Analysing Pathos in User-Generated Argumentative Text
+ NataliaEvgrafova
+ VeroniqueHoste
+ ElsLefever
+ 39–44
+ While persuasion has been extensively examined in the context of politicians’ speeches, there exists a notable gap in the understanding of the pathos role in user-generated argumentation. This paper presents an exploratory study into the pathos dimension of user-generated arguments and formulates ideas on how pathos could be incorporated in argument mining. Using existing sentiment and emotion detection tools, this research aims to obtain insights into the role of emotion in argumentative public discussion on controversial topics, explores the connection between sentiment and stance, and detects frequent emotion-related words for a given topic.
+ 2024.politicalnlp-1.5
+ evgrafova-etal-2024-analysing
+
+
+ Knowledge Graph Representation for Political Information Sources
+ TinatinOsmonova
+ AlexeyTikhonov
+ Ivan P.Yamshchikov
+ 45–54
+ With the rise of computational social science, many scholars utilize data analysis and natural language processing tools to analyze social media, news articles, and other accessible data sources for examining political and social discourse. In particular, the study of the emergence of echo-chambers due to the dissemination of specific information has become a topic of interest in mixed methods research areas. In this paper, we analyze data collected from two news portals, Breitbart News (BN) and the New York Times (NYT), to test the hypothesis that the formation of echo-chambers can be partially explained at the level of individual information consumption rather than the collective topology of individuals’ social networks. Our research findings are presented through knowledge graphs, utilizing a dataset spanning 11.5 years gathered from the BN and NYT media portals. We demonstrate that the application of knowledge representation techniques to the aforementioned news streams, contrary to common assumptions, shows the relative “internal” neutrality of both sources and a polarizing attitude towards a small fraction of entities. Additionally, we argue that such characteristics in information sources lead to fundamental disparities in audience worldviews, potentially acting as a catalyst for the formation of echo-chambers.
+ 2024.politicalnlp-1.6
+ osmonova-etal-2024-knowledge
+
+
+ Analyzing Conflict Through Data: A Dataset on the Digital Framing of Sheikh Jarrah Evictions
+ AnatoliiShestakov
+ WajdiZaghouani
+ 55–67
+ This study empirically investigates the role of social media in tracing the evolution of the May 2021 Israeli-Palestinian crisis, centered on the Sheikh Jarrah evictions. Analyzing a dataset of 370,747 English tweets from 120,173 users from May 9-21, 2021, the research employs a mixed-methods approach combining computational techniques and qualitative content analysis. Findings support the hypothesis that social media interactions reliably map crisis dynamics, as evidenced by hashtags like #SaveSheikhJarrah corresponding to critical shifts, though virality did not correlate with hashtag use. In contrast to prior sentiment-focused studies, the context-driven analysis reveals influencers and state actors shaping polarized narratives along geopolitical lines, with high-profile voices backing Palestinian solidarity while Israeli state accounts endorsed military operations. Evidence of a transcontinental cybercampaign emerged, albeit with limitations due to the English language scope and potential biases from data collection and keyword choices. The study contributes empirical insights into the mediatization of armed conflicts through social media’s competing narratives and information flows within the Israeli-Palestinian context. Recommendations for future multilingual, multi-platform analyses are provided to address limitations.
+ 2024.politicalnlp-1.7
+ shestakov-zaghouani-2024-analyzing
+
+
+ Semi-Automatic Topic Discovery and Classification for Epidemic Intelligence via Large Language Models
+ FedericoBorazio
+ DaniloCroce
+ GiorgioGambosi
+ RobertoBasili
+ DanieleMargiotta
+ AntonioScaiella
+ MartinaDel Manso
+ DanielePetrone
+ AndreaCannone
+ Alberto M.Urdiales
+ ChiaraSacco
+ PatrizioPezzotti
+ FlaviaRiccardo
+ DanieleMipatrini
+ FedericaFerraro
+ SobhaPilati
+ 68–84
+ This paper introduces a novel framework to harness Large Language Models (LLMs) for Epidemic Intelligence, focusing on identifying and categorizing emergent socio-political phenomena within health crises, with a spotlight on the COVID-19 pandemic. Our approach diverges from traditional methods, such as Topic Models, by providing explicit support to analysts through the identification of distinct thematic areas and the generation of clear, actionable statements for each topic. This supports a Zero-shot Classification mechanism, enabling effective matching of news articles to fine-grained topics without the need for model fine-tuning. The framework is designed to be as transparent as possible, producing linguistically informed insights to make the analysis more accessible to analysts who may not be familiar with every subject matter of inherently emerging phenomena. This process not only enhances the precision and relevance of the extracted Epidemic Intelligence but also fosters a collaborative environment where system linguistic abilities and the analyst’s domain expertise are integrated.
+ 2024.politicalnlp-1.8
+ borazio-etal-2024-semi
+
+
+ Towards quantifying politicization in foreign aid project reports
+ SidiWang
+ GustavEggers
+ Alexiade Roode Torres Georgiadis
+ Tuan AnhĐo
+ LéaGontard
+ RuthCarlitz
+ JelkeBloem
+ 85–90
+ We aim to develop a metric of politicization by investigating whether this concept can be operationalized computationally using document embeddings. We are interested in measuring the extent to which foreign aid is politicized. Textual reports of foreign aid projects are often made available by donor governments, but these are large and unstructured. By embedding them in vector space, we can compute similarities between sets of known politicized keywords and the foreign aid reports. We present a pilot study where we apply this metric to USAID reports.
+ 2024.politicalnlp-1.9
+ wang-etal-2024-towards-quantifying
+
+
+ Echo-chambers and Idea Labs: Communication Styles on Twitter
+ AleksandraSorokovikova
+ MichaelBecker
+ Ivan P.Yamshchikov
+ 91–95
+ This paper investigates the communication styles and structures of Twitter (X) communities within the vaccination context. While mainstream research primarily focuses on the echo-chamber phenomenon, wherein certain ideas are reinforced and participants are isolated from opposing opinions, this study reveals the presence of diverse communication styles across various communities. In addition to the communities exhibiting echo-chamber behavior, this research uncovers communities with distinct communication patterns. By shedding light on the nuanced nature of communication within social networks, this study emphasizes the significance of understanding the diversity of perspectives within online communities.
+ 2024.politicalnlp-1.10
+ sorokovikova-etal-2024-echo
+
+
+
diff --git a/data/xml/2024.propor.xml b/data/xml/2024.propor.xml
index 0262ddcf2e..1e3759f240 100644
--- a/data/xml/2024.propor.xml
+++ b/data/xml/2024.propor.xml
@@ -2,7 +2,7 @@
- Proceedings of the 16th International Conference on Computational Processing of Portuguese
+ Proceedings of the 16th International Conference on Computational Processing of Portuguese - Vol. 1
PabloGamallo
DanielaClaro
AntónioTeixeira
@@ -620,7 +620,7 @@
Exploring Multimodal Models for Humor Recognition in Portuguese
- MarcioInácio
+ MarcioLima Inácio
Hugo GonçaloOliveira
568–574
2024.propor-1.62
@@ -756,4 +756,333 @@
freitas-2024-text
+
+
+ Proceedings of the 16th International Conference on Computational Processing of Portuguese - Vol. 2
+ PabloGamallo
+ DanielaClaro
+ AntónioTeixeira
+ LivyReal
+ MarcosGarcia
+ Hugo GonçaloOliveira
+ RaquelAmaro
+ Association for Computational Linguistics
+ Santiago de Compostela, Galicia/Spain
+ March
+ 2024
+ 2024.propor-2
+ propor
+
+
+ 2024.propor-2.0
+ propor-2024-international-processing
+
+
+ PROPOR’24 Competition on Automatic Essay Scoring of Portuguese Narrative Essays
+ Rafael FerreiraMello
+ HilárioOliveira
+ MoésioWenceslau
+ HyanBatista
+ ThiagoCordeiro
+ Ig IbertBittencourt
+ SeijiIsotani
+ 1–5
+ 2024.propor-2.1
+ mello-etal-2024-propor24
+
+
+ AESVoting: Automatic Essay Scoring with Bert and Voting Classifiers
+ Tiago Barbosade Lima
+ ElydaFreitas
+ ValmirMacario
+ 6–9
+ 2024.propor-2.2
+ de-lima-etal-2024-aesvoting
+
+
+ PiLN at PROPOR: A BERT-Based Strategy for Grading Narrative Essays
+ Rogério F.de Sousa
+ Jeziel C.Marinho
+ Francisco A. R.Neto
+ Rafael T.Anchiêta
+ Raimundo S.Moura
+ 10–13
+ 2024.propor-2.3
+ de-sousa-etal-2024-piln
+
+
+ Exploring the Automated Scoring of Narrative Essays in Brazilian Portuguese using Transformer Models
+ EugénioRibeiro
+ NunoMamede
+ JorgeBaptista
+ 14–17
+ 2024.propor-2.4
+ ribeiro-etal-2024-exploring
+
+
+ Building a Language-Learning Game for Brazilian Indigenous Languages: A Case of Study
+ GustavoPolleti
+ 18–22
+ 2024.propor-2.5
+ polleti-2024-building
+
+
+ Computational Model for Yoruba Aroko Communication System
+ Adéwuyì AdétáyòAdégbìté
+ Odétúnjí ÀjàdíOdéjobí
+ 23–31
+ 2024.propor-2.6
+ adegbite-odejobi-2024-computational
+
+
+ Human Evaluation of the Usefulness of Fine-Tuned English Translators for the Guarani Mbya and Nheengatu Indigenous Languages
+ ClaudioPinhanez
+ PauloCavalin
+ JulioNogima
+ 32–36
+ 2024.propor-2.7
+ pinhanez-etal-2024-human
+
+
+ A Universal Dependencies Treebank for Nheengatu
+ Leonel Figueiredode Alencar
+ 37–54
+ 2024.propor-2.8
+ de-alencar-2024-universal
+
+
+ Network-based Approach for Stopwords Detection
+ Felermino D. M. A.Ali
+ Gabrielde Jesus
+ Henrique LopesCardoso
+ SérgioNunes
+ RuiSousa-Silva
+ 55–63
+ 2024.propor-2.9
+ ali-etal-2024-network
+
+
+ Grammar Induction for Brazilian Indigenous Languages
+ Diego Pedro Gonçalvesda Silva
+ Thiago Alexandre SalgueiroPardo
+ 64–72
+ 2024.propor-2.10
+ da-silva-pardo-2024-grammar
+
+
+ NLP Tools for African Languages: Overview
+ JoaquimMussandi
+ AndreasWichert
+ 73–82
+ 2024.propor-2.11
+ mussandi-wichert-2024-nlp
+
+
+ Can rules still beat neural networks? The case of automatic normalisation for 18th-century Portuguese texts
+ LeonardoZilio
+ Rafaela R.Lazzari
+ Maria José B.Finatto
+ 83–92
+ 2024.propor-2.12
+ zilio-etal-2024-rules
+
+
+ Revealing Public Opinion Sentiment Landscape: Eurovision Song Contest Sentiment Analysis
+ KlaraKozolic
+ GaurishThakkar
+ Nives MikelicPreradovic
+ 93–102
+ 2024.propor-2.13
+ kozolic-etal-2024-revealing
+
+
+ Could Style Help Plagiarism Detection? - A Sample-based Quantitative Study of Correlation between Style Specifics and Plagiarism
+ AdileUka
+ MariaBerger
+ 103–108
+ 2024.propor-2.14
+ uka-berger-2024-style
+
+
+ Authorship attribution in translated texts: a stylometric approach to translator style
+ AnaPagano
+ CarlosPerini
+ EvandroCunha
+ AdrianaPagano
+ 109–117
+ 2024.propor-2.15
+ pagano-etal-2024-authorship
+
+
+ Support Verb Constructions in Medieval Portuguese: Evidence from the CTA Corpus
+ Maria InêsBico
+ EsperançaCardeira
+ JorgeBaptista
+ FernandoBaptista
+ 118–129
+ 2024.propor-2.16
+ bico-etal-2024-support
+
+
+ Semantic Exploration of Textual Analogies for Advanced Plagiarism Detection
+ Elyah FriscoAndriantsialo
+ Volatiana MarielleRatianantitra
+ ThomasMahatody
+ 130–133
+ 2024.propor-2.17
+ andriantsialo-etal-2024-semantic
+
+
+ Creating datasets for emergent contact languages preservation
+ DalmoBuzato
+ ÁtilaVital
+ 134–140
+ 2024.propor-2.18
+ buzato-vital-2024-creating
+
+
+ Psychoanalytic Studies in the Digital Humanities: Employing Topic Modeling with an LLM to Decode Dreams During the Brazilian Pandemic
+ João PedroCampos
+ NataliaResende
+ Ricardode Souza
+ GilsonIannini
+ 141–148
+ 2024.propor-2.19
+ campos-etal-2024-psychoanalytic
+
+
+ Decoding Sentiments about Migration in Portuguese Political Manifestos (2011, 2015, 2019)
+ Erik BranMarino
+ RenataVieira
+ Jesus Manuel BenitezBaleato
+ Ana SofiaRibeiro
+ KatarinaLaken
+ 149–159
+ 2024.propor-2.20
+ marino-etal-2024-decoding
+
+
+ Analysing entity distribution in an annotated 18th-century historical source
+ Daniel De LosReyes
+ RenataVieira
+ FernandaOlival
+ Helena FreireCameron
+ FátimaFarrica
+ 160–164
+ 2024.propor-2.21
+ reyes-etal-2024-analysing
+
+
+ Roda Viva boundaries: an overview of an audio-transcription corpus
+ Isaac Souzade Miranda Jr.
+ GabrielaWick-Pedro
+ Cláudia Diasde Barros
+ OtoVale
+ 165–169
+ 2024.propor-2.22
+ de-miranda-jr-etal-2024-roda
+
+
+ GiDi: A Virtual Assistant for Screening Protocols at Home
+ AndrésPiñeiro-Martín
+ CarmenGarcía-Mateo
+ LauraDocío-Fernández
+ Maríadel Carmen López-Pérez
+ IgnacioNovo-Veleiro
+ 170–173
+ 2024.propor-2.23
+ pineiro-martin-etal-2024-gidi
+
+
+ FazGame: A Game Based Platform that Uses Artificial Intelligence to Help Students to Improve Brazilian Portuguese Writing Skills
+ Jéssica Soares DosSantos
+ GabrielCoelho
+ SidneyMelo
+ OniramAtila
+ CarlaZeltzer
+ 174–177
+ 2024.propor-2.24
+ santos-etal-2024-fazgame
+
+
+ Indexing Portuguese NLP Resources with PT-Pump-Up
+ RúbenAlmeida
+ RicardoCampos
+ AlípioJorge
+ SérgioNunes
+ 178–181
+ 2024.propor-2.25
+ almeida-etal-2024-indexing
+
+
+ plain X – AI Supported Multilingual Video Workflow Platform
+ CarlosAmaral
+ CatarinaLagrifa
+ MirkoLorenz
+ Peggyvan der Kreeft
+ TiagoVeiga
+ 182–185
+ 2024.propor-2.26
+ amaral-etal-2024-plain
+
+
+ Perfil Público: Automatic Generation and Visualization of Author Profiles for Digital News Media
+ NunoGuimarães
+ RicardoCampos
+ AlípioJorge
+ 186–189
+ 2024.propor-2.27
+ guimaraes-etal-2024-perfil
+
+
+ Exploring Open Information Extraction for the Portuguese language: An integrated monolithic approach in Cloud environment
+ AugustoBarreto
+ DanielaClaro
+ 190–193
+ 2024.propor-2.28
+ barreto-claro-2024-exploring
+
+
+ Blip Copilot: a smart conversational assistant
+ EvandroFonseca
+ TayaneSoares
+ DyovanaBaptista
+ RogersDamas
+ LucasAvanco
+ 194–196
+ 2024.propor-2.29
+ fonseca-etal-2024-blip
+
+
+ Galician–Portuguese Neural Machine Translation System
+ Sofía GarcíaGonzález
+ 197–199
+ 2024.propor-2.30
+ gonzalez-2024-galician
+
+
+ Nós-TTS: a Web User Interface for Galician Text-to-Speech
+ CarmenMagariños
+ AlpÖktem
+ Antonio MoscosoSánchez
+ Marta VázquezAbuín
+ Noelia GarcíaDíaz
+ Adina IoanaVladu
+ Elisa FernándezRei
+ María BaqueiroVidal
+ 200–203
+ 2024.propor-2.31
+ magarinos-etal-2024-nos
+
+
+ Autopilot: a smart sales assistant
+ AmandaOliveira
+ JoãoAlvarenga
+ EvandroFonseca
+ WilliamColen
+ 204–205
+ 2024.propor-2.32
+ oliveira-etal-2024-autopilot
+
+
diff --git a/data/xml/2024.rail.xml b/data/xml/2024.rail.xml
new file mode 100644
index 0000000000..bd051d718f
--- /dev/null
+++ b/data/xml/2024.rail.xml
@@ -0,0 +1,194 @@
+
+
+
+
+ Proceedings of the Fifth Workshop on Resources for African Indigenous Languages @ LREC-COLING 2024
+ RooweitherMabuya
+ MuziMatfunjwa
+ MmasibidiSetaka
+ Mennovan Zaanen
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.rail-1
+ rail
+ ws
+
+
+ 2024.rail-1.0
+ rail-2024-resources
+
+
+ Doing Phonetics in the Rift Valley: Sound Systems of Maasai, Iraqw and Hadza
+ AlainGhio
+ DidierDemolin
+ MichaelKarani
+ YohannMeynadier
+ 1–9
+ This article discusses the contribution of experimental techniques to recording phonetic data in the field. Only a small part of the phonological systems of African languages is described with precision. This is why it is important to collect empirical data in the form of sound, video and physiological recordings. This allows research questions such as patterns of variation to be addressed. Analytical methods show how to interpret data from physical principles and integrate them into appropriate models. The question of linguistic contact between different language families is also addressed. To achieve these general objectives, we present the way we design corpora, and the different ways of recording data with crucial technical considerations during fieldwork. Finally, we focus on 3 languages spoken in the Great African Rift Zone, which includes several linguistic areas belonging to the four major linguistic families of the continent. (1) Hadza is a click language with a very complex consonant system. (2) Iraqw is a Cushitic language with ejective consonants. (3) Maasai is a Nilotic language with implosive consonants and a very elaborate set of interjections, ideophones and animal calls that include sounds not described in the International Phonetic Alphabet.
+ 2024.rail-1.1
+ ghio-etal-2024-phonetics
+
+
+ Kallaama: A Transcribed Speech Dataset about Agriculture in the Three Most Widely Spoken Languages in Senegal
+ ElodieGauthier
+ AminataNdiaye
+ AbdoulayeGuissé
+ 10–19
+ This work is part of the Kallaama project, whose objective is to produce and disseminate national language corpora for the development of speech technologies in the field of agriculture. Except for Wolof, which benefits from some language data for natural language processing, the national languages of Senegal are largely ignored by language technology providers. However, such technologies are key to the protection, promotion and teaching of these languages. Kallaama focuses on the 3 most widely spoken languages in Senegal: Wolof, Pulaar and Sereer. These languages are widely spoken by the population, with around 10 million native Senegalese speakers, not to mention those outside the country. However, they remain under-resourced in terms of machine-readable data that can be used for automatic processing and language technologies, all the more so in the agricultural sector. We release a transcribed speech dataset containing 125 hours of recordings about agriculture in each of the above-mentioned languages. These resources are specifically designed for Automatic Speech Recognition purposes, including traditional approaches. To build such technologies, we provide textual corpora in Wolof and Pulaar, and a pronunciation lexicon containing 49,132 entries from the Wolof dataset.
+ 2024.rail-1.2
+ gauthier-etal-2024-kallaama
+
+
+ Long-Form Recordings to Study Children’s Language Input and Output in Under-Resourced Contexts
+ Joseph R.Coffey
+ AlejandrinaCristia
+ 20–31
+ A growing body of research suggests that young children’s early speech and language exposure is associated with later language development (including delays and diagnoses), school readiness, and academic performance. The last decade has seen increasing use of child-worn devices to collect long-form audio recordings by educators, economists, and developmental psychologists. The most commonly used system for analyzing this data is LENA, which was trained on North American English child-centered data and generates estimates of children’s speech-like vocalization counts, adult word counts, and child-adult turn counts. Recently, cheaper and open-source non-LENA alternatives with multilingual training have been proposed. Both kinds of systems have been employed in under-resourced, sometimes multilingual contexts, including Africa where access to printed or digital linguistic resources may be limited. In this paper, we describe each kind of system (LENA, non-LENA), provide information on audio data collected with them that is available for reuse, review evidence of the accuracy of extant automated analyses, and note potential strengths and shortcomings of their use in African communities.
+ 2024.rail-1.3
+ coffey-cristia-2024-long
+
+
+ Developing Bilingual English-Setswana Datasets for Space Domain
+ Tebatso G.Moape
+ Sunday OlusegunOjo
+ Oludayo O.Olugbara
+ 32–36
+ In the current digital age, languages lacking digital presence face an imminent risk of extinction. In addition, the absence of digital resources poses a significant obstacle to the development of Natural Language Processing (NLP) applications for such languages. Therefore, the development of digital language resources contributes to the preservation of these languages and enables application development. This paper contributes to the ongoing efforts of developing language resources for South African languages with a specific focus on Setswana and presents a new English-Setswana bilingual dataset that focuses on the space domain. The dataset was constructed using the expansion method. A subset of space domain English synsets from Princeton WordNet was professionally translated to Setswana. The initial submission of translations demonstrated an accuracy rate of 99% before validation. After validation, continuous revisions and discussions between translators and validators resulted in a unanimous agreement, ultimately achieving a 100% accuracy rate. The final version of the resource was converted into an XML format due to its machine-readable framework, providing a structured hierarchy for the organization of linguistic data.
+ 2024.rail-1.4
+ moape-etal-2024-developing
+
+
+ Compiling a List of Frequently Used Setswana Words for Developing Readability Measures
+ JohannesSibeko
+ 37–44
+ This paper addresses the pressing need for improved readability assessment in Setswana through the creation of a list of frequently used words in Setswana. The end goal is to integrate this list into the adaptation of traditional readability measures in Setswana, such as the Dale-Chall index, which relies on frequently used words. Our initial list is developed using corpus-based methods utilising frequency lists obtained from five sets of corpora. It is then refined using manual methods. The analysis section delves into the challenges encountered during the development of the final list, encompassing issues like the inclusion of non-Setswana words, proper names, unexpected terms, and spelling variations. The decision-making process is clarified, highlighting crucial choices such as the retention of contemporary terms and the acceptance of diverse spelling variations. These decisions reflect a nuanced balance between linguistic authenticity and readability. This paper contributes to the discourse on text readability in indigenous Southern African languages. Moreover, it establishes a foundation for tailored literacy initiatives and serves as a starting point for adapting traditional frequency-list-based readability measures to Setswana.
+ 2024.rail-1.5
+ sibeko-2024-compiling
+
+
+ A Qualitative Inquiry into the South African Language Identifier’s Performance on YouTube Comments.
+ Nkazimlo N.Ngcungca
+ JohannesSibeko
+ SharonRudman
+ 45–54
+ The South African Language Identifier (SA-LID) has proven to be a valuable tool for data analysis in the multilingual context of South Africa, particularly in governmental texts. However, its suitability for broader projects has yet to be determined. This paper aims to assess the performance of the SA-LID in identifying isiXhosa in YouTube comments as part of the methodology for research on the expression of cultural identity through linguistic strategies. We curated a selection of 10 videos which focused on the isiXhosa culture in terms of theatre, poetry, language learning, culture, or music. The videos were predominantly in English as were most of the comments, but the latter were interspersed with elements of isiXhosa, identifying the commentators as speakers of isiXhosa. The SA-LID was used to identify all instances of the use of isiXhosa to facilitate the analysis of the relevant items. Following the application of the SA-LID to this data, a manual evaluation was conducted to gauge the effectiveness of this tool in selecting all isiXhosa items. Our findings reveal significant limitations in the use of the SA-LID, encompassing the oversight of unconventional spellings in indigenous languages and misclassification of closely related languages within the Nguni group. Although proficient in identifying the use of Nguni languages, differentiating within this language group proved challenging for the SA-LID. These results underscore the necessity for manual checks to complement the use of the SA-LID when other Nguni languages may be present in the comment texts.
+ 2024.rail-1.6
+ ngcungca-etal-2024-qualitative
+
+
+ The First Universal Dependency Treebank for Tswana: Tswana-Popapolelo
+ TanjaGaustad
+ AnsuBerg
+ RigardtPretorius
+ RoaldEiselen
+ 55–65
+ This paper presents the first publicly available UD treebank for Tswana, Tswana-Popapolelo. The data used consists of the 20 Cairo CICLing sentences translated to Tswana. After pre-processing these sentences with detailed POS (XPOS) and converting them to universal POS (UPOS), we proceeded to annotate the data with dependency relations, documenting decisions for the language specific constructions. Linguistic issues encountered are described in detail as this is the first application of the UD framework to produce a dependency treebank for the Bantu language family in general and for Tswana specifically.
+ 2024.rail-1.7
+ gaustad-etal-2024-first
+
+
+ Adapting Nine Traditional Text Readability Measures into Sesotho
+ JohannesSibeko
+ Mennovan Zaanen
+ 66–76
+ This article discusses the adaptation of traditional English readability measures into Sesotho, a Southern African indigenous low-resource language. We use a translated readability corpus to extract textual features from the Sesotho texts and readability levels from the English translations. We look at the correlation between the different features to ensure that non-competing features are used in the readability metrics. Next, through linear regression analyses, we examine the impact of the text features from the Sesotho texts on the overall readability levels (which are gauged from the English translations). Starting from the structure of the traditional English readability measures, linear regression models identify coefficients and intercepts for the different variables considered in the readability formulas for Sesotho. In the end, we propose ten readability formulas for Sesotho (one more than the initial nine; we provide two formulas based on the structure of the Gunning Fog index). We also introduce intercepts for the Gunning Fog index, the Läsbarhets index and the Readability index (which do not have intercepts in the English variants) in the Sesotho formulas.
+ 2024.rail-1.8
+ sibeko-van-zaanen-2024-adapting
+
+
+ Bootstrapping Syntactic Resources from isiZulu to Siswati
+ LauretteMarais
+ LaurettePretorius
+ Lionel ClivePosthumus
+ 77–85
+ IsiZulu and Siswati are mutually intelligible languages that are considered under-resourced despite their status as official languages. Even so, the available digital and computational language resources for isiZulu significantly outstrip those for Siswati, such that it is worth investigating to what degree bootstrapping approaches can be leveraged to develop resources for Siswati. In this paper, we present the development of a computational grammar and parallel treebank, based on parallel linguistic descriptions of the two languages.
+ 2024.rail-1.9
+ marais-etal-2024-bootstrapping
+
+
+ Early Child Language Resources and Corpora Developed in Nine African Languages by the SADiLaR Child Language Development Node
+ Michelle J.White
+ FrenetteSouthwood
+ Sefela LondiweYalala
+ 86–93
+ Prior to the initiation of the project reported on in this paper, there were no instruments available with which to measure the language skills of young speakers of nine official African languages of South Africa. This limited the kind of research that could be conducted, and the rate at which knowledge creation on child language development could progress. Not only does this result in a dearth of knowledge needed to inform child language interventions but it also hinders the development of child language theories that would have good predictive power across languages. This paper reports on (i) the development of a questionnaire that caregivers complete about their infant’s communicative gestures and vocabulary or about their toddler’s vocabulary and grammar skills, in isiNdebele, isiXhosa, isiZulu, Sesotho, Sesotho sa Leboa, Setswana, Siswati, Tshivenda, and Xitsonga; and (ii) the 24 child language corpora thus far developed with these instruments. The potential research avenues opened by the 18 instruments and 24 corpora are discussed.
+ 2024.rail-1.10
+ white-etal-2024-early
+
+
+ Morphological Synthesizer for Ge’ez Language: Addressing Morphological Complexity and Resource Limitations
+ Gebrearegawi GebremariamGidey
+ Hailay KiduTeklehaymanot
+ Gebregewergs MezgebeAtsbha
+ 94–106
+ Ge’ez is an ancient Semitic language renowned for its unique alphabet. It serves as the script for numerous languages, including Tigrinya and Amharic, and played a pivotal role in Ethiopia’s cultural and religious development during the Aksumite kingdom era. Ge’ez remains significant as a liturgical language in Ethiopia and Eritrea, with much of the national identity documentation recorded in Ge’ez. These written materials are invaluable primary sources for studying Ethiopian and Eritrean philosophy, creativity, knowledge, and civilization. Ge’ez has a complex morphological structure with rich inflectional and derivational morphology, and no usable NLP tools have been developed and published until now due to the scarcity of annotated linguistic data, corpora, labeled datasets, and lexicons. Therefore, we propose a rule-based Ge’ez morphological synthesizer that generates surface words from root words according to the morphological structures of the language. Consequently, we propose an automatic morphological synthesizer for Ge’ez using TLM. We used 1,102 sample verbs, representing all verb morphological structures, to test and evaluate the system. The system achieves a performance of 97.4%. This result outperforms the baseline model, and we suggest that other scholars build a comprehensive system considering the morphological variations of the language. Keywords: Ge’ez, NLP, morphology, morphological synthesizer, rule-based
+ 2024.rail-1.11
+ gidey-etal-2024-morphological
+
+
+ EthioMT: Parallel Corpus for Low-resource Ethiopian Languages
+ Atnafu LambeboTonja
+ OlgaKolesnikova
+ AlexanderGelbukh
+ JugalKalita
+ 107–114
+ Recent research in natural language processing (NLP) has achieved impressive performance in tasks such as machine translation (MT), news classification, and question-answering in high-resource languages. However, the performance of MT leaves much to be desired for low-resource languages. This is due to the smaller size of available parallel corpora in these languages, if such corpora are available at all. NLP in Ethiopian languages suffers from the same issues due to the unavailability of publicly accessible datasets for NLP tasks, including MT. To help the research community and foster research for Ethiopian languages, we introduce EthioMT – a new parallel corpus for 15 languages. We also create a new benchmark by collecting a dataset for better-researched languages in Ethiopia. We evaluate the newly collected corpus and the benchmark dataset for 23 Ethiopian languages using transformer and fine-tuning approaches.
+ 2024.rail-1.12
+ tonja-etal-2024-ethiomt
+
+
+ Resources for Annotating Hate Speech in Social Media Platforms Used in Ethiopia: A Novel Lexicon and Labelling Scheme
+ NuhuIbrahim
+ FelicityMulford
+ MattLawrence
+ RizaBatista-Navarro
+ 115–123
+ Hate speech on social media has proliferated in Ethiopia. To support studies aimed at investigating the targets and types of hate speech circulating in the Ethiopian context, we developed a new fine-grained annotation scheme that captures three elements of hate speech: the target (i.e., any groups with protected characteristics), type (i.e., the method of abuse) and nature (i.e., the style of the language used). We also developed a new lexicon of hate speech-related keywords in the four most prominent languages found on Ethiopian social media: Amharic, Afaan Oromo, English and Tigrigna. These keywords enabled us to retrieve social media posts (also in the same four languages) from three platforms (i.e., X, Telegram and Facebook), that are likely to contain hate speech. Experts in the Ethiopian context then manually annotated a sample of those retrieved posts, obtaining fair to moderate inter-annotator agreement. The resulting annotations formed the basis of a case study of which groups tend to be targeted by particular types of hate speech or by particular styles of hate speech language.
+ 2024.rail-1.13
+ ibrahim-etal-2024-resources
+
+
+ Low Resource Question Answering: An Amharic Benchmarking Dataset
+ Tilahun AbedissaTaffa
+ RicardoUsbeck
+ YaregalAssabie
+ 124–132
+ Question Answering (QA) systems return concise answers or answer lists from natural language text, given a context document. Many resources go into curating QA datasets to advance the development of robust QA models. There is a surge in QA datasets for languages such as English; this is different for low-resource languages like Amharic. Indeed, there is no published or publicly available Amharic QA dataset. Hence, to foster further research in low-resource QA, we present the first publicly available benchmarking Amharic Question Answering Dataset (Amh-QuAD). We crowdsource 2,628 question-answer pairs from over 378 Amharic Wikipedia articles. Using the training set, we fine-tune an XLM-R-based language model and introduce a new reader model. Leveraging our newly fine-tuned reader, we run a baseline model to spark interest in open-domain Amharic QA research. The best-performing baseline QA system achieves F-scores of 80.3 and 81.34 in the retriever-reader and reading comprehension settings, respectively.
+ 2024.rail-1.14
+ taffa-etal-2024-low
+
+
+ The Annotators Agree to Not Agree on the Fine-grained Annotation of Hate-speech against Women in Algerian Dialect Comments
+ ImaneGuellil
+ YousraHouichi
+ SaraChennoufi
+ MohamedBoubred
+ Anfal YousraBoucetta
+ FaicalAzouaou
+ 133–139
+ A significant number of research studies have been presented for detecting hate speech in social media during the last few years. However, the majority of these studies are in English. Only a few studies focus on Arabic and its dialects (especially the Algerian dialect), with a smaller number of them targeting sexism detection (or hate speech against women). Even the works that have been proposed on Arabic sexism detection consider two classes only (hateful and non-hateful), and three classes (adding the neutral class) in the best scenario. This paper aims to propose the first fine-grained corpus focusing on 13 classes. However, given the challenges related to hate speech and fine-grained annotation, the Kappa metric is relatively low among the annotators (i.e. 35%). This work in progress proposes three main contributions: 1) Annotation of different categories related to hate speech, such as insults, vulgar words or hate in general. 2) Annotation of 10,000 comments, in Arabic and Algerian dialects, automatically extracted from YouTube. 3) Highlighting the challenges related to manual annotation, such as subjectivity, risk of bias, lack of annotation guidelines, etc.
+ 2024.rail-1.15
+ guellil-etal-2024-annotators
+
+
+ Advancing Language Diversity and Inclusion: Towards a Neural Network-based Spell Checker and Correction for Wolof
+ Thierno IbrahimaCissé
+ FatihaSadat
+ 140–151
+ This paper introduces a novel approach to spell checking and correction for low-resource and under-represented languages, with a specific focus on an African language, Wolof. By leveraging the capabilities of transformer models and neural networks, we propose an efficient and practical system capable of correcting typos and improving text quality. Our proposed technique involves training a transformer model on a parallel corpus consisting of misspelled sentences and their correctly spelled counterparts, generated using a semi-automatic method. As we fine-tune the model to transform misspelled text into accurate sentences, we demonstrate the immense potential of this approach to overcome the challenges faced by resource-scarce and under-represented languages in the realm of spell checking and correction. Our experimental results and evaluations exhibit promising outcomes, offering valuable insights that contribute to the ongoing endeavors aimed at enriching linguistic diversity and inclusion and thus improving digital communication accessibility for languages grappling with scarcity of resources and under-representation in the digital landscape.
+ 2024.rail-1.16
+ cisse-sadat-2024-advancing
+
+
+ Lateral Inversions, Word Form/Order, Unnamed Grammatical Entities and Ambiguities in the Constituency Parsing and Annotation of the Igala Syntax through the English Language
+ Mahmud MohammedMomoh
+ 152–162
+ The aim of this paper is to expose the structural form of the Igala language and the inherent complexity of translating the language into a second language, i.e. English, through an inquiry into its word order, lateral inversions, and the unnamed grammatical entities inherent in the language. This study finds a preponderance of a linguistic typology with subject-verb-object word order and a total absence of prepositions in the speech composition of the Igala language. The implications of this trio of features (syntactic inversion, word ordering, unnamed entities) have remained in a dark corner of intellectual consideration, and worse still, so has the incorporation of these considerations in syntax parsing and annotation in computing. Arising from the ongoing abstruseness and incongruity in machine translation of Igala, a comprehension model for the automatic identification, application and/or conversion of these structural forms to the English language is the focus of this paper.
+ 2024.rail-1.17
+ 2024.rail-1.17.OptionalSupplementaryMaterial.pdf
+ momoh-2024-lateral
+
+
+
diff --git a/data/xml/2024.rapid.xml b/data/xml/2024.rapid.xml
new file mode 100644
index 0000000000..1b779fe5d3
--- /dev/null
+++ b/data/xml/2024.rapid.xml
@@ -0,0 +1,144 @@
+
+
+
+
+ Proceedings of the Fifth Workshop on Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric/developmental impairments @LREC-COLING 2024
+ DimitriosKokkinakis
+ Kathleen C.Fraser
+ Charalambos K.Themistocleous
+ Kristina LundholmFors
+ AthanasiosTsanas
+ FredrikOhman
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.rapid-1
+ rapid
+ ws
+
+
+ 2024.rapid-1.0
+ rapid-2024-resources
+
+
+ Semantic-based NLP techniques discriminate schizophrenia and Wernicke’s aphasia based on spontaneous speech
+ FrankTsiwah
+ AnasMayya
+ Andreasvan Cranenburgh
+ 1–8
+ People with schizophrenia spectrum disorder (SSD), a psychiatric disorder, and people with Wernicke’s aphasia, an acquired neurological disorder, are both known to display semantic deficits in their spontaneous speech outputs. Very few studies have directly compared the two groups on their spontaneous speech (Gerson et al., 1977; Faber et al., 1983), and no consistent results were found. Our study uses word (based on the word2vec model with moving windows across words) and sentence (transformer-based model) embeddings as features for a machine learning classification model to differentiate between the spontaneous speech of both groups. Additionally, this study uses these measures to differentiate between people with Wernicke’s aphasia and healthy controls. The model is able to classify patients with Wernicke’s aphasia and patients with SSD with a cross-validated accuracy of 81%. Additionally, it is also able to classify patients with Wernicke’s aphasia versus healthy controls and SSD versus healthy controls with cross-validated accuracies of 93.72% and 84.36%, respectively. For the SSD individuals, sentence and/or discourse level features are deemed more informative by the model, whereas for the Wernicke group, only intra-sentential features are more informative. Overall, we show that NLP-based semantic measures are sensitive to identifying Wernicke’s aphasic and schizophrenic speech.
+ 2024.rapid-1.1
+ tsiwah-etal-2024-semantic
+
+
+ Speech Rate and Salient Syllables Position in Spontaneous Speech of Children with Autism Spectrum Disorder
+ ValentinaSaccone
+ 9–15
+ The study employs a semi-automatic approach to analyze speech rate in spoken Italian, aiming to identify acoustic parameters associated with perceptual atypicality in the speech of children diagnosed with Autism Spectrum Disorder (ASD). The research focuses on a dataset comprising recordings of semi-spontaneous interactions, in comparison with interviews of Typically Developing (TD) children. A detailed examination of speech rate variability is conducted, progressing from assessing overall speech rate in conversation to the analysis of individual utterances. Furthermore, salient syllables within utterances are identified using an automatic procedure through the Salient Detector Praat script and analyzed for stress position. The study highlights specific speech styles, including rapid-telegraphic and reading-performed speech. Additionally, it reveals a higher speech rate with increasing utterance length for utterances of fewer than 10 syllables and, conversely, a diminishing speech rate in utterances of 20-25 syllables, suggesting potential difficulty in producing longer utterances associated with increased cognitive load.
+ 2024.rapid-1.2
+ saccone-2024-speech
+
+
+ Cross-Lingual Examination of Language Features and Cognitive Scores From Free Speech
+ HaliLindsay
+ GiorgiaAlbertin
+ LouisaSchwed
+ NicklasLinz
+ JohannesTröger
+ 16–25
+ Speech analysis is gaining significance for monitoring neurodegenerative disorders, but with a view to application in clinical practice, solid evidence of the association of language features with cognitive scores is still needed. A cross-linguistic investigation was pursued to examine whether language features show significant correlations with two cognitive scores, i.e. the Mini-Mental State Examination (MMSE) and ki:e SB-C scores, in Alzheimer’s Disease patients. We explore 23 language features, representative of syntactic complexity and semantic richness, extracted from a dataset of free speech recordings of 138 participants distributed across four languages (Spanish, Catalan, German, Dutch). Data was analyzed using the speech library SIGMA; Pearson’s correlation was computed with Bonferroni correction, and a mixed effects linear regression analysis was done on the significantly correlated results. The MMSE and the SB-C were found to be correlated, with no significant differences across languages. Three features were found to be significantly correlated with the SB-C scores. Among these, two features of lexical richness show consistent patterns across languages, while determiner rate showed language-specific patterns.
+ 2024.rapid-1.3
+ lindsay-etal-2024-cross
+
+
+ Speech and Language Biomarkers of Neurodegenerative Conditions: Developing Cross-Linguistically Valid Tools for Automatic Analysis
+ Iris E.Nowenstein
+ MarijaStanojevic
+ GunnarÖrnólfsson
+ María KristínJónsdóttir
+ BillSimpson
+ JenniferSorinas Nerin
+ BryndísBergþórsdóttir
+ KristínHannesdóttir
+ JekaterinaNovikova
+ JelenaCurcic
+ 26–33
+ In the last decade, a rapidly growing body of studies has shown promising results for the automatic detection and extraction of speech and language features as biomarkers of neurodegenerative conditions such as Alzheimer’s disease. This has sparked great optimism and the development of various digital health tools, but also warnings regarding the predominance of English in the field and calls for linguistically diverse research as well as global, equitable access to novel clinical instruments. To automatically extract clinically relevant features from transcripts in low-resource languages, two approaches are possible: 1) utilizing a limited range of language-specific tools or 2) translating text to English and then extracting the features. We evaluate these approaches for part-of-speech (POS) rates in transcripts of recorded picture descriptions from a cross-sectional study of Icelandic speakers at different stages of Alzheimer’s disease and healthy controls. While the translation method merits further exploration, only a subset of the POS categories show a promising correspondence to the direct extraction from the Icelandic transcripts in our results, indicating that the translation method has to be linguistically validated at the individual POS category level.
+ 2024.rapid-1.4
+ nowenstein-etal-2024-speech
+
+
+ Automatic Detection of Rhythmic Features in Pathological Speech of MCI and Dementia Patients
+ MaricaBelmonte
+ GloriaGagliardi
+ DimitriosKokkinakis
+ FabioTamburini
+ 34–44
+ Linguistic alterations represent one of the prodromal signs of cognitive decline associated with Dementia. In recent years, a growing body of work has been devoted to the development of algorithms for the automatic linguistic analysis of both oral and written texts, for diagnostic purposes. The extraction of Digital Linguistic Biomarkers from patients’ verbal productions can indeed provide a rapid, ecological, and cost-effective system for large-scale screening of the pathology. This article contributes to the ongoing research in the field by exploring a traditionally less studied aspect of language in Dementia, namely the rhythmic characteristics of speech. In particular, the paper focuses on the automatic detection of rhythmic features in Italian-connected speech. A landmark-based system was developed and evaluated to segment the speech flow into vocalic and consonantal intervals and to calculate several rhythmic metrics. Additionally, the reliability of these metrics in identifying Mild Cognitive Impairment and Dementia patients was tested.
+ 2024.rapid-1.5
+ belmonte-etal-2024-automatic
+
+
+ Open Brain AI. Automatic Language Assessment
+ CharalambosThemistocleous
+ 45–53
+ Language assessment plays a crucial role in diagnosing and treating individuals with speech, language, and communication disorders caused by neurogenic conditions, whether developmental or acquired. To support clinical assessment and research, we developed Open Brain AI (https://openbrainai.com). This computational platform employs AI techniques, namely machine learning, natural language processing, large language models, and automatic speech-to-text transcription, to automatically analyze multilingual spoken and written productions. This paper discusses the development of Open Brain AI, the AI language processing modules, and the linguistic measurements of discourse macro-structure and micro-structure. The fast and automatic analysis of language alleviates the burden on clinicians, enabling them to streamline their workflow and allocate more time and resources to direct patient care. Open Brain AI is freely accessible, empowering clinicians to conduct critical data analyses and give more attention and resources to other critical aspects of therapy and treatment.
+ 2024.rapid-1.6
+ themistocleous-2024-open
+
+
+ Exploring the Relationship Between Intrinsic Stigma in Masked Language Models and Training Data Using the Stereotype Content Model
+ MarioMina
+ JúliaFalcão
+ AitorGonzalez-Agirre
+ 54–67
+ Much work has gone into developing language models of increasing size, but only recently have we begun to examine them for pernicious behaviour that could lead to harming marginalised groups. Following Lin et al. (2022) in rooting our work in psychological research, we prompt two masked language models (MLMs) of different specialisations in English and Spanish with statements from a questionnaire developed to measure stigma to determine if they treat physical and mental illnesses equally. In both models we find a statistically significant difference in the treatment of physical and mental illnesses across most if not all latent constructs as measured by the questionnaire, and thus they are more likely to associate mental illnesses with stigma. We then examine their training data or data retrieved from the same domain using a computational implementation of the Stereotype Content Model (SCM) (Fiske et al., 2002; Fraser et al., 2021) to interpret the questionnaire results based on the SCM values as reflected in the data. We observe that model behaviour can largely be explained by the distribution of the mentions of illnesses according to their SCM values.
+ 2024.rapid-1.7
+ mina-etal-2024-exploring
+
+
+ Establishing Control Corpora for Depression Detection in Modern Greek: Methodological Insights
+ VivianStamou
+ GeorgeMikros
+ GeorgeMarkopoulos
+ SpyridoulaVarlokosta
+ 68–76
+ This paper presents a methodological approach for establishing control corpora in the context of depression detection in the Modern Greek language. We discuss various methods used to create control corpora, focusing on the challenge of selecting representative samples from the general population when the target reference is the depressed population. Our approach includes traditional random selection among Twitter users, as well as an innovative method for creating topic-oriented control corpora. Through this study, we provide insights into the development of control corpora, offering valuable considerations for researchers working on similar projects in linguistic analysis and mental health studies. In addition, we identify several dominant topics in the depressed population, such as religion, sentiments, health and digestion, which seem to align with findings consistently reported in the literature.
+ 2024.rapid-1.8
+ stamou-etal-2024-establishing
+
+
+ A Preliminary Evaluation of Semantic Coherence and Cohesion in Aphasic and Non-Aphasic Discourse Across Test and Retest
+ SnigdhaKhanna
+ Brielle C.Stark
+ 77–86
+ This paper evaluates global and local semantic coherence in aphasic and non-aphasic discourse tasks using the Tool for the Automatic Analysis of Cohesion (TAACO). The motivation for this paper stems from a lack of automatic methods to evaluate discourse-level phenomena, such as semantic cohesion, in transcripts derived from persons with aphasia. It leverages existing test-retest data to evaluate two main objectives: (1) Test-Retest Reliability, to identify if variables significantly differ across test and retest time points for either group (aphasia, control), and (2) Inter-Group Discourse Cohesion, where aphasic discourse is expected to be less cohesive than control discourse, resulting in lower cohesion scores for the aphasia group. Exploratory analysis examines correlations between variables for both groups, identifying any relationships between word-level and sentence-level semantic variables. Results verify that semantic cohesion and coherence are generally preserved in both groups, except for word-level and a few sentence-level semantic measures, which are higher for the control group. Overall, variables tend to be reliable across time points for both groups. Notably, the aphasia group demonstrates more variability in cohesion than the control group, which is to be expected after brain injury. A close relationship between word-level indices and other indices is observed, suggesting a disconnection between word-level factors and sentence-level metrics.
+ 2024.rapid-1.9
+ 2024.rapid-1.9.OptionalSupplementaryMaterial.py
+ khanna-stark-2024-preliminary
+
+
+ Harnessing Linguistic Analysis for ADHD Diagnosis Support: A Stylometric Approach to Self-Defining Memories
+ Florian RaphaëlCafiero
+ JuanBarrios Rudloff
+ SimonGabay
+ 87–94
+ This study explores the potential of stylometric analysis in identifying Self-Defining Memories (SDMs) authored by individuals with Attention-Deficit/Hyperactivity Disorder (ADHD) versus a control group. A sample of 198 SDMs, written by 66 adolescents, was analysed using Support Vector Classifiers (SVC). The analysis included a variety of linguistic features such as character 3-grams, function words, sentence length, and lexical richness, among others. It also included metadata about the participants (gender, age) and their SDMs (self-reported sentiment after recalling their memories). The results reveal a promising ability of linguistic analysis to accurately classify SDMs, with perfect prediction (F1=1.0) in the contextually simpler setup of text-by-text prediction, and satisfactory levels of precision (F1 = 0.77) when predicting individual by individual. Such results highlight the significant role that linguistic characteristics play in reflecting the distinctive cognitive patterns associated with ADHD. While not a substitute for professional diagnosis, textual analysis offers a supportive avenue for early detection and a deeper understanding of ADHD.
+ 2024.rapid-1.10
+ cafiero-etal-2024-harnessing
+
+
+ Crosslinguistic Acoustic Feature-based Dementia Classification Using Advanced Learning Architectures
+ Anna Seo GyeongChoi
+ Jin-seoKim
+ Seo-heeKim
+ Min SeokBack
+ SunghyeCho
+ 95–100
+ In this study, we rigorously evaluated eight machine learning and deep learning classifiers for identifying Alzheimer’s Disease (AD) patients using crosslinguistic acoustic features automatically extracted from one-minute oral picture descriptions produced by speakers of American English, Korean, and Mandarin Chinese. We employed eGeMAPSv2 and ComParE feature sets on segmented and non-segmented audio data. The Multilayer Perceptron model showed the highest performance, achieving an accuracy of 83.54% and an AUC of 0.8 on the ComParE features extracted from non-segmented picture description data. Our findings suggest that classifiers trained with acoustic features extracted from one-minute picture description data in multiple languages are highly promising as a quick, language-universal, large-scale, remote screening tool for AD. However, the dataset included predominantly English-speaking participants, indicating the need for more balanced multilingual datasets in future research.
+ 2024.rapid-1.11
+ choi-etal-2024-crosslinguistic
+
+
+
diff --git a/data/xml/2024.readi.xml b/data/xml/2024.readi.xml
new file mode 100644
index 0000000000..34bc5f7136
--- /dev/null
+++ b/data/xml/2024.readi.xml
@@ -0,0 +1,128 @@
+
+
+
+
+ Proceedings of the 3rd Workshop on Tools and Resources for People with REAding DIfficulties (READI) @ LREC-COLING 2024
+ RodrigoWilkens
+ RémiCardon
+ AmaliaTodirascu
+ NúriaGala
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.readi-1
+ readi
+ ws
+
+
+ 2024.readi-1.0
+ readi-2024-tools
+
+
+ Evaluating Document Simplification: On the Importance of Separately Assessing Simplicity and Meaning Preservation
+ LiamCripwell
+ JoëlLegrand
+ ClaireGardent
+ 1–14
+ Text simplification intends to make a text easier to read while preserving its core meaning. Intuitively and as shown in previous works, these two dimensions (simplification and meaning preservation) are oftentimes inversely correlated. An overly conservative text will fail to simplify sufficiently, whereas extreme simplification will degrade meaning preservation. Yet, popular evaluation metrics either aggregate meaning preservation and simplification into a single score (SARI, LENS), or target meaning preservation alone (BERTScore, QuestEval). Moreover, these metrics usually require a set of references and most previous work has only focused on sentence-level simplification. In this paper, we focus on the evaluation of document-level text simplification and compare existing models using distinct metrics for meaning preservation and simplification. We leverage existing metrics from similar tasks and introduce a reference-less metric variant for simplicity, showing that models are mostly biased towards either simplification or meaning preservation, seldom performing well on both dimensions. Making use of the fact that the metrics we use are all reference-less, we also investigate the performance of existing models when applied to unseen data (where reference simplifications are unavailable).
+ 2024.readi-1.1
+ cripwell-etal-2024-evaluating
+
+
+ Malmon: A Crowd-Sourcing Platform for Simple Language
+ Helgi BjörnHjartarson
+ Steinunn RutFriðriksdóttir
+ 15–21
+ This paper presents a crowd-sourcing platform designed to address the need for parallel corpora in the field of Automatic Text Simplification (ATS). ATS aims to automatically reduce the linguistic complexity of text to aid individuals with reading difficulties, such as those with cognitive disorders, dyslexia, children, and non-native speakers. ATS not only facilitates improved reading comprehension among these groups but can also enhance the preprocessing stage for various NLP tasks through summarization, contextual simplification, and paraphrasing. Our work introduces a language-independent, openly accessible platform that crowdsources training data for ATS models, potentially benefiting low-resource languages where parallel data is scarce. The platform can efficiently aid in the collection of parallel corpora by providing a user-friendly data-collection environment. Furthermore, using human crowdworkers for the data collection process offers a potential resource for linguistic research on text simplification practices. The paper discusses the platform’s architecture, built with modern web technologies, and its user-friendly interface designed to encourage widespread participation. Through gamification and a robust admin panel, the platform incentivizes high-quality data collection and engagement from crowdworkers.
+ 2024.readi-1.2
+ hjartarson-fridriksdottir-2024-malmon
+
+
+ Automatic Generation and Evaluation of Reading Comprehension Test Items with Large Language Models
+ AndreasSäuberli
+ SimonClematide
+ 22–37
+ Reading comprehension tests are used in a variety of applications, reaching from education to assessing the comprehensibility of simplified texts. However, creating such tests manually and ensuring their quality is difficult and time-consuming. In this paper, we explore how large language models (LLMs) can be used to generate and evaluate multiple-choice reading comprehension items. To this end, we compiled a dataset of German reading comprehension items and developed a new protocol for human and automatic evaluation, including a metric we call text informativity, which is based on guessability and answerability. We then used this protocol and the dataset to evaluate the quality of items generated by Llama 2 and GPT-4. Our results suggest that both models are capable of generating items of acceptable quality in a zero-shot setting, but GPT-4 clearly outperforms Llama 2. We also show that LLMs can be used for automatic evaluation by eliciting item responses from them. In this scenario, evaluation results with GPT-4 were the most similar to human annotators. Overall, zero-shot generation with LLMs is a promising approach for generating and evaluating reading comprehension test items, in particular for languages without large amounts of available data.
+ 2024.readi-1.3
+ sauberli-clematide-2024-automatic
+
+
+ An Extensible Massively Multilingual Lexical Simplification Pipeline Dataset using the MultiLS Framework
+ MatthewShardlow
+ FernandoAlva-Manchego
+ RizaBatista-Navarro
+ StefanBott
+ SaulCalderon Ramirez
+ RémiCardon
+ ThomasFrançois
+ AkioHayakawa
+ AndreaHorbach
+ AnnaHuelsing
+ YusukeIde
+ Joseph MarvinImperial
+ AdamNohejl
+ KaiNorth
+ LauraOcchipinti
+ NelsonPeréz Rojas
+ NishatRaihan
+ TharinduRanasinghe
+ MartinSolis Salazar
+ MarcosZampieri
+ HoracioSaggion
+ 38–46
+ We present preliminary findings on the MultiLS dataset, developed in support of the 2024 Multilingual Lexical Simplification Pipeline (MLSP) Shared Task. This dataset currently comprises 300 instances of lexical complexity prediction and lexical simplification across 10 languages. In this paper, we (1) describe the annotation protocol in support of the contribution of future datasets and (2) present summary statistics on the existing data that we have gathered. Multilingual lexical simplification can be used to support low-ability readers to engage with otherwise difficult texts in their native, often low-resourced, languages.
+ 2024.readi-1.4
+ shardlow-etal-2024-extensible
+
+
+ SIERA: An Evaluation Metric for Text Simplification using the Ranking Model and Data Augmentation by Edit Operations
+ HikaruYamanaka
+ TakenobuTokunaga
+ 47–58
+ Automatic evaluation metrics are indispensable for text simplification (TS) research. Past TS research adopts three evaluation aspects: fluency, meaning preservation and simplicity. However, there is little consensus on a metric to measure simplicity, a unique aspect of TS compared with other text generation tasks. In addition, many of the existing metrics require reference simplified texts for evaluation. Thus, the cost of collecting reference texts is also an issue. This study proposes a new automatic evaluation metric, SIERA, for sentence simplification. SIERA employs a ranking model for the order relation of simplicity, which is trained by pairs of the original and simplified sentences. It does not require reference sentences for either training or evaluation. The sentence pairs for training are further augmented by the proposed method that utilizes edit operations to generate intermediate sentences with simplicity levels between the original and simplified sentences. Using three evaluation datasets for text simplification, we compare SIERA with other metrics by calculating the correlations between metric values and human ratings. The results showed SIERA’s superiority over other metrics, with the reservation that the quality of evaluation sentences is consistent with that of the training data.
+ 2024.readi-1.5
+ yamanaka-tokunaga-2024-siera
+
+
+ Transfer Learning for Russian Legal Text Simplification
+ MarkAthugodage
+ OlgaMitrofanove
+ VadimGudkov
+ 59–69
+ We present novel results in legal text simplification for Russian. We introduce the first dataset for such a task in Russian, a parallel corpus based on the data extracted from “Rossiyskaya Gazeta Legal Papers”. In this study we discuss three approaches for text simplification which involve T5 and GPT model architectures. We evaluate the proposed models on a set of metrics: ROUGE, SARI and BERTScore. We also analysed the models’ results on such readability indices as Flesch-Kincaid Grade Level and Gunning Fog Index. Finally, we performed human evaluation of simplified texts generated by the T5 and GPT models; the evaluation was carried out by native speakers of Russian and Russian lawyers. In this research we compared the T5 and GPT architectures for the text simplification task and found that GPT performs better when it is fine-tuned on a dataset of coped texts. Our research makes a significant step toward improving the readability and accessibility of Russian legal texts for ordinary people.
+ 2024.readi-1.6
+ athugodage-etal-2024-transfer
+
+
+ Accessible Communication: a systematic review and comparative analysis of official English Easy-to-Understand (E2U) language guidelines
+ Andreea MariaDeleanu
+ ConstantinOrasan
+ SabineBraun
+ 70–92
+ Easy-to-Understand (E2U) language varieties have been recognized by the United Nations’ Convention on the Rights of Persons with Disabilities (2006) as a means to guarantee the fundamental right to Accessible Communication. Increased awareness has driven changes in European (European Commission, 2015, 2021; European Parliament, 2016) and International legislation (ODI, 2010), prompting public-sector and other institutions to offer domain-specific content in E2U language to prevent communicative exclusion of those facing cognitive barriers (COGA, 2017; Maaß, 2020; Perego, 2020). However, guidance on what it is that makes language actually ‘easier to understand’ is still fragmented and vague. For this reason, we carried out a systematic review of official guidelines for English Plain Language and Easy Language to identify the most effective lexical, syntactic and adaptation strategies that can reduce complexity in verbal discourse according to official bodies. This article will present the methods and preliminary results of the guidelines analysis.
+ 2024.readi-1.7
+ deleanu-etal-2024-accessible
+
+
+ LanguageTool as a CAT tool for Easy-to-Read in Spanish
+ MargotMadina
+ ItziarGonzalez-Dios
+ MelanieSiegel
+ 93–101
+ Easy-to-Read (E2R) is an approach to content creation that emphasizes simplicity and clarity in language to make texts more accessible to readers with cognitive challenges or learning disabilities. The Spanish version of E2R is called Lectura Fácil (LF). E2R and its variants, such as LF, focus on straightforward language and structure to enhance readability. The manual production of such texts is both time and resource expensive. In this work, we have developed LFWriteAssist, an authoring support tool that aligns with the guidelines of LF. It is underpinned by the functionalities of LanguageTool, a free and open source grammar, style and spelling checker. Our tool assists in ensuring compliance with the LF standard, provides definitions for complex, polysemic, or infrequently used terms, and expands acronyms. The tool is primarily targeted at LF creators, as it serves as an authoring aid, identifying any rule infringements and assisting with language simplifications. However, it can be used by anyone who seeks to enhance text readability and inclusivity. The tool’s code is made available as open source, thereby contributing to the wider effort of creating inclusive and comprehensible content.
+ 2024.readi-1.8
+ madina-etal-2024-languagetool
+
+
+ Paying attention to the words: explaining readability prediction for French as a foreign language
+ RodrigoWilkens
+ PatrickWatrin
+ ThomasFrançois
+ 102–115
+ Automatic text Readability Assessment (ARA) has been seen as a way of helping people with reading difficulties. Recent advancements in Natural Language Processing have shifted ARA from linguistic-based models to more precise black-box models. However, this shift has weakened the alignment between ARA models and the reading literature, potentially leading to inaccurate predictions based on unintended factors. In this paper, we investigate the explainability of ARA models, inspecting the relationship between attention mechanism scores, ARA features, and CEFR level predictions made by the model. We propose a method for identifying features associated with the predictions made by a model through the use of the attention mechanism. Exploring three feature families (i.e., psycho-linguistic, word frequency and graded lexicon), we associated features with the model’s attention heads. Finally, while not fully explanatory of the model’s performance, the correlations of these associations surpass those between features and text readability levels.
+ 2024.readi-1.9
+ wilkens-etal-2024-paying
+
+
+
diff --git a/data/xml/2024.rfp.xml b/data/xml/2024.rfp.xml
new file mode 100644
index 0000000000..1f3f9709be
--- /dev/null
+++ b/data/xml/2024.rfp.xml
@@ -0,0 +1,77 @@
+
+
+
+
+ Proceedings of the First Workshop on Reference, Framing, and Perspective @ LREC-COLING 2024
+ PiaSommerauer
+ TommasoCaselli
+ MalvinaNissim
+ LeviRemijnse
+ PiekVossen
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.rfp-1
+ rfp
+ ws
+
+
+ 2024.rfp-1.0
+ rfp-2024-reference
+
+
+ Tracking Perspectives on Event Participants: a Structural Analysis of the Framing of Real-World Events in Co-Referential Corpora
+ LeviRemijnse
+ PiaSommerauer
+ AntskeFokkens
+ Piek T.J.M.Vossen
+ 1–12
+ In this paper, we present the outcome of a structural linguistic analysis performed on a referentially grounded FrameNet dataset. In this dataset, multiple Dutch events are referenced by multiple co-referential Dutch news texts. Mentions in those documents are annotated with respect to their referential grounding (i.e., links to structured Wikidata), and their conceptual representation (i.e., frames). Provided with each document’s temporal reporting distance, we selected documents for two events - the Utrecht shooting and MH17 - and performed an analysis in which we tracked the events’ participants over time in both their focalization (number of mentions) and their framing (distribution of frame element labels). This way, we use the carefully collected and annotated data to schematize shifts in focalization and perspectivization of the participants as a result of the constantly developing narrative surrounding the events. This novel type of linguistic research involves reference to the real-world referents and takes into account storytelling in news streams.
+ 2024.rfp-1.1
+ remijnse-etal-2024-tracking
+
+
+ TimeFrame: Querying and Visualizing Event Semantic Frames in Time
+ DavideLamorte
+ MarcoRovera
+ AlfioFerrara
+ SaraTonelli
+ 13–17
+ In this work we introduce TimeFrame, an online platform to easily query and visualize events and participants extracted from document collections in Italian following a frame-based approach. The system allows users to select one or more events (frames) or event categories and to display their occurrences on a timeline. Different query types, from coarse to fine-grained, are available through the interface, enabling a time-bound analysis of large historical corpora. We present three use cases based on the full archive of news published in 1948 by the newspaper “Corriere della Sera”. We show that different crucial events can be explored, providing interesting insights into the narratives around such events, the main participants and their points of view.
+ 2024.rfp-1.2
+ lamorte-etal-2024-timeframe
+
+
+ Comparing News Framing of Migration Crises using Zero-Shot Classification
+ NikolaIvačič
+ MatthewPurver
+ FabienneLind
+ SenjaPollak
+ HajoBoomgaarden
+ VeronikaBajt
+ 18–27
+ We present an experiment on classifying news frames in a language unseen by the learner, using zero-shot cross-lingual transfer learning. We used two pre-trained multilingual Transformer Encoder neural network models and tested with four specific news frames, investigating two approaches to the resulting multi-label task: Binary Relevance (treating each frame independently) and Label Power-set (predicting each possible combination of frames). We train our classifiers on an available annotated multilingual migration news dataset and test on an unseen Slovene language migration news corpus, first evaluating performance and then using the classifiers to analyse how media framed the news during the periods of Syria and Ukraine conflict-related migrations.
+ 2024.rfp-1.3
+ ivacic-etal-2024-comparing
+
+
+ Manosphrames: exploring an Italian incel community through the lens of NLP and Frame Semantics
+ SaraGemelli
+ GosseMinnema
+ 28–39
+ We introduce a large corpus of comments extracted from an Italian online incel (‘involuntary celibate’) forum, a community of men who build a collective identity and anti-feminist ideology centered around their inability to find a sexual or romantic partner and who frequently use explicitly misogynistic language. Our corpus consists of 2.4K comments that have been manually collected, analyzed and annotated with topic labels, and a further 32K threads (300K comments) that have been automatically scraped and automatically annotated with FrameNet annotations. We show how large-scale frame semantic analysis can shed light on what is discussed in the community, and introduce incel topic classification as a new NLP task and benchmark.
+ 2024.rfp-1.4
+ gemelli-minnema-2024-manosphrames
+
+
+ Broadening the coverage of computational representations of metaphor through Dynamic Metaphor Theory
+ XiaojuanTan
+ JelkeBloem
+ 40–50
+ Current approaches to computational metaphor processing typically incorporate static representations of metaphor. We aim to show that this limits the coverage of such systems. We take insights from dynamic metaphor theory and discuss how existing computational models of metaphor might benefit from representing the dynamics of metaphor when applied to the analysis of conflicting discourse. We propose that a frame-based approach to metaphor representation based on the model of YinYang Dynamics of Metaphoricity (YYDM) would pave the way to more comprehensive modeling of metaphor. In particular, the metaphoricity cues of the YYDM model could be used to address the task of dynamic metaphor identification. Frame-based modeling of dynamic metaphor would facilitate the computational analysis of perspectives in conflicting discourse, with potential applications in analyzing political discourse.
+ 2024.rfp-1.5
+ tan-bloem-2024-broadening
+
+
+
diff --git a/data/xml/2024.safety4convai.xml b/data/xml/2024.safety4convai.xml
new file mode 100644
index 0000000000..71769df7a8
--- /dev/null
+++ b/data/xml/2024.safety4convai.xml
@@ -0,0 +1,78 @@
+
+
+
+
+ Proceedings of Safety4ConvAI: The Third Workshop on Safety for Conversational AI @ LREC-COLING 2024
+ TanviDinkar
+ GiuseppeAttanasio
+ Amanda CercasCurry
+ IoannisKonstas
+ DirkHovy
+ VerenaRieser
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.safety4convai-1
+ safety4convai
+ ws
+
+
+ 2024.safety4convai-1.0
+ safety4convai-2024-safety4convai
+
+
+ Grounding LLMs to In-prompt Instructions: Reducing Hallucinations Caused by Static Pre-training Knowledge
+ AngusAddlesee
+ 1–7
+ When deploying LLMs in certain commercial or research settings, domain specific knowledge must be explicitly provided within the prompt. This in-prompt knowledge can conflict with an LLM’s static world knowledge learned at pre-training, causing model hallucination (see examples in Table 1). In safety-critical settings, like healthcare and finance, these hallucinations can harm vulnerable users. We have curated a QA corpus containing information that LLMs could not have seen at pre-training. Using our corpus, we have probed various LLMs, manipulating both the prompt and the knowledge representation. We have found that our ‘Jodie’ prompt consistently improves the model’s textual grounding to the given knowledge, and in turn the overall answer accuracy. This is true in both the healthcare and finance domains, improving accuracy by up to 28% (mean: 12%). We have also identified that hierarchical and direct node-property graph structures could lead to more interpretable and controllable systems that provide a natural language interface with real-time in-domain knowledge. Our corpus will enable further work on this critical challenge.
+ 2024.safety4convai-1.1
+ addlesee-2024-grounding
+
+
+ Diversity-Aware Annotation for Conversational AI Safety
+ AliciaParrish
+ VinodkumarPrabhakaran
+ LoraAroyo
+ MarkDíaz
+ Christopher M.Homan
+ GregSerapio-García
+ Alex S.Taylor
+ DingWang
+ 8–15
 + How people interpret content is deeply influenced by their socio-cultural backgrounds and lived experiences. This is especially crucial when evaluating AI systems for safety, where accounting for such diversity in interpretations and potential impacts on human users will make them both more successful and inclusive. While recent work has demonstrated the importance of diversity in human ratings that underlie AI pipelines, effective and efficient ways to incorporate diverse perspectives in human data annotation pipelines remain largely elusive. In this paper, we discuss the primary challenges faced in incorporating diversity into model evaluations, and propose a practical diversity-aware annotation approach. Using an existing dataset with highly parallel safety annotations, we take as a test case a policy that prioritizes recall of safety issues, and demonstrate that our diversity-aware approach can efficiently obtain a higher recall of safety issues flagged by minoritized rater groups without hurting overall precision.
+ 2024.safety4convai-1.2
+ parrish-etal-2024-diversity
+
+
+ Using Information Retrieval Techniques to Automatically Repurpose Existing Dialogue Datasets for Safe Chatbot Development
+ Tunde OluwaseyiAjayi
+ GauravNegi
+ MihaelArcan
+ PaulBuitelaar
+ 16–27
 + There has been notable progress in the development of open-domain dialogue systems (chatbots), especially with the rapid advancement of the capabilities of Large Language Models. Chatbots excel at holding conversations in a manner that keeps a user interested and engaged. However, their responses can be unsafe, as they can respond in an offensive manner or offer harmful professional advice. As a way to mitigate this issue, recent work crowdsources datasets with exemplary responses or annotates dialogue safety datasets, which are relatively scarce compared to casual dialogues. Despite the quality of data obtained from crowdsourcing, it can be expensive and time-consuming. This work proposes an effective pipeline, using information retrieval, to automatically repurpose existing dialogue datasets for safe chatbot development, as a way to address the aforementioned challenges. We select an existing dialogue dataset and revise its unsafe responses to obtain a dataset with safer responses to unsafe user inputs. We then fine-tune dialogue models on the original and revised datasets and generate responses to evaluate the safeness of the models.
+ 2024.safety4convai-1.3
+ ajayi-etal-2024-using
+
+
+ FairPair: A Robust Evaluation of Biases in Language Models through Paired Perturbations
+ JaneDwivedi-Yu
+ 28–39
 + The accurate evaluation of differential treatment of specific groups by language models is critical to ensuring a positive and safe user experience. An ideal evaluation should have the properties of being robust, extendable to new groups or attributes, and being able to capture biases that appear in typical usage (rather than just extreme, rare cases). Relatedly, bias evaluation should surface not only egregious biases but also ones that are subtle and commonplace, such as a likelihood for talking about appearances with regard to women. We present FairPair, an evaluation framework for assessing differential treatment that occurs during ordinary usage. FairPair operates through counterfactual pairs, but crucially, the paired continuations are grounded in the same demographic group, which ensures equivalent comparison. Additionally, unlike prior work, our method factors in the inherent variability that comes from the generation process itself by measuring the sampling variability. We present an evaluation of several commonly used generative models and a qualitative analysis that indicates a preference for discussing family and hobbies with regard to women.
+ 2024.safety4convai-1.4
+ dwivedi-yu-2024-fairpair
+
+
+ Learning To See But Forgetting To Follow: Visual Instruction Tuning Makes LLMs More Prone To Jailbreak Attacks
+ GeorgiosPantazopoulos
+ AmitParekh
+ MalvinaNikandrou
+ AlessandroSuglia
+ 40–51
+ Augmenting Large Language Models (LLMs) with image-understanding capabilities has resulted in a boom of high-performing Vision-Language models (VLMs). While studying the alignment of LLMs to human values has received widespread attention, the safety of VLMs has not received the same attention. In this paper, we explore the impact of jailbreaking on three state-of-the-art VLMs, each using a distinct modeling approach. By comparing each VLM to their respective LLM backbone, we find that each VLM is more susceptible to jailbreaking. We consider this as an undesirable outcome from visual instruction-tuning, which imposes a forgetting effect on an LLM’s safety guardrails. Therefore, we provide recommendations for future work based on evaluation strategies that aim to highlight the weaknesses of a VLM, as well as take safety measures into account during visual instruction tuning.
+ 2024.safety4convai-1.5
+ pantazopoulos-etal-2024-learning
+
+
+
diff --git a/data/xml/2024.scalellm.xml b/data/xml/2024.scalellm.xml
index 5a5879cd71..8ff7892319 100644
--- a/data/xml/2024.scalellm.xml
+++ b/data/xml/2024.scalellm.xml
@@ -51,6 +51,7 @@
Most adults can complete a sequence of steps to achieve a certain goal, such as making a sandwich or repairing a bicycle tire. In completing these goal-oriented tasks, or simply tasks in this paper, one must use sequential reasoning to understand the relationship between the sequence of steps and the goal. LLMs have shown impressive capabilities across various natural language understanding tasks. However, prior work has mainly focused on logical reasoning tasks (e.g. arithmetic, commonsense QA); how well LLMs can perform on more complex reasoning tasks like sequential reasoning is not clear. In this paper, we address this gap and conduct a comprehensive evaluation of how well LLMs are able to conduct this reasoning for tasks and how they scale w.r.t. multiple dimensions (e.g. adaptive prompting strategies, number of in-context examples, varying complexity of the sequential task). Our findings reveal that while Chain of Thought (CoT) prompting can significantly enhance LLMs’ sequential reasoning in certain scenarios, it can also be detrimental in others, whereas Tree of Thoughts (ToT) reasoning is less effective for this type of task. Additionally, we discover that an increase in model size or in-context examples does not consistently lead to improved performance.
2024.scalellm-1.3
bellos-etal-2024-large
+
InstructEval: Towards Holistic Evaluation of Instruction-Tuned Large Language Models
@@ -62,6 +63,7 @@
Instruction-tuned large language models have revolutionized natural language processing and have shown great potential in applications such as conversational agents. These models, such as GPT-4, can not only master language but also solve complex tasks in areas like mathematics, coding, medicine, and law. However, there is still a lack of comprehensive understanding regarding their full potential, primarily due to the black-box nature of many models and lack of holistic evaluation. To address these challenges, we present InstructEval, a more comprehensive evaluation suite designed specifically for instruction-tuned large language models. Unlike previous works, our evaluation involves a rigorous assessment of models based on problem-solving, writing ability, and alignment to human values. We take a holistic approach to analyze various factors affecting model performance, including the pretraining foundation, instruction-tuning data, and training methods. Our findings reveal that the quality of instruction data is a crucial factor in scaling model performance. While open-source models demonstrate impressive writing abilities, there is substantial room for improvement in problem-solving and alignment.
2024.scalellm-1.4
chia-etal-2024-instructeval
+
Detecting Mode Collapse in Language Models via Narration
@@ -70,6 +72,7 @@
No two authors write alike. Personal flourishes invoked in written narratives, from lexicon to rhetorical devices, imply a particular author—what literary theorists label the implied or virtual author; distinct from the real author or narrator of a text. Early large language models trained on unfiltered training sets drawn from a variety of discordant sources yielded incoherent personalities, problematic for conversational tasks but proving useful for sampling literature from multiple perspectives. Successes in alignment research in recent years have allowed researchers to impose subjectively consistent personae on language models via instruction tuning and reinforcement learning from human feedback (RLHF), but whether aligned models retain the ability to model an arbitrary virtual author has received little scrutiny. By studying 4,374 stories sampled from three OpenAI language models, we show successive versions of GPT-3 suffer from increasing degrees of “mode collapse” whereby overfitting the model during alignment constrains it from generalizing over authorship: models suffering from mode collapse become unable to assume a multiplicity of perspectives. Our method and results are significant for researchers seeking to employ language models in sociological simulations.
2024.scalellm-1.5
hamilton-2024-detecting
+
diff --git a/data/xml/2024.scichat.xml b/data/xml/2024.scichat.xml
index c9999e54f1..f7a19b81d1 100644
--- a/data/xml/2024.scichat.xml
+++ b/data/xml/2024.scichat.xml
@@ -43,6 +43,7 @@
State-of-the-art conversational AI systems raise concerns due to their potential risks of generating unsafe, toxic, unethical, or dangerous content. Previous works have developed datasets to teach conversational agents the appropriate social paradigms to respond effectively to specifically designed hazardous content. However, models trained on these adversarial datasets still struggle to recognize subtle unsafe situations that appear naturally in conversations or introduce an inappropriate response in a casual context. To understand the extent of this problem, we study prosociality in both adversarial and casual dialog contexts and audit the response quality of general-purpose language models in terms of propensity to produce unsafe content. We propose a dual-step fine-tuning process to address these issues using a socially aware n-pair contrastive loss. Subsequently, we train a base model that integrates prosocial behavior by leveraging datasets like Moral Integrity Corpus (MIC) and ProsocialDialog. Experimental results on several dialog datasets demonstrate the effectiveness of our approach in generating socially appropriate responses.
2024.scichat-1.2
das-srihari-2024-improving
+
Reliable LLM-based User Simulator for Task-Oriented Dialogue Systems
@@ -58,6 +59,7 @@
In the realm of dialogue systems, user simulation techniques have emerged as a game-changer, redefining the evaluation and enhancement of task-oriented dialogue (TOD) systems. These methods are crucial for replicating real user interactions, enabling applications like synthetic data augmentation, error detection, and robust evaluation. However, existing approaches often rely on rigid rule-based methods or on annotated data. This paper introduces DAUS, a Domain-Aware User Simulator. Leveraging large language models, we fine-tune DAUS on real examples of task-oriented dialogues. Results on two relevant benchmarks showcase significant improvements in terms of user goal fulfillment. Notably, we have observed that fine-tuning enhances the simulator’s coherence with user goals, effectively mitigating hallucinations—a major source of inconsistencies in simulator responses.
2024.scichat-1.3
sekulic-etal-2024-reliable
+
Evaluating Modular Dialogue System for Form Filling Using Large Language Models
@@ -68,6 +70,7 @@
This paper introduces a novel approach to form-filling and dialogue system evaluation by leveraging Large Language Models (LLMs). The proposed method establishes a setup wherein multiple modules collaborate on addressing the form-filling task. The dialogue system is constructed on top of LLMs, focusing on defining specific roles for individual modules. We show that using multiple independent sub-modules working cooperatively on this task can improve performance and handle the typical constraints of using LLMs, such as context limitations. The study involves testing the modular setup on four selected forms of varying topics and lengths, employing commercial and open-access LLMs. The experimental results demonstrate that the modular setup consistently outperforms the baseline, showcasing the effectiveness of this approach. Furthermore, our findings reveal that open-access models perform comparably to commercial models for the specified task.
2024.scichat-1.4
hakimov-etal-2024-evaluating
+
KAUCUS - Knowledgeable User Simulators for Training Large Language Models
@@ -76,6 +79,7 @@
An effective multi-turn instruction-following assistant can be developed by creating a simulator that can generate useful interaction data. Apart from relying on its intrinsic weights, an ideal user simulator should also be able to bootstrap external knowledge rapidly in its raw form to simulate the multifarious diversity of text available over the internet. Previous user simulators generally lacked diversity, were mostly closed domain, and necessitated rigid schema making them inefficient to rapidly scale to incorporate external knowledge. In this regard, we introduce Kaucus, a Knowledge-Augmented User Simulator framework, to outline a process of creating diverse user simulators, that can seamlessly exploit external knowledge as well as benefit downstream assistant model training. Through two GPT-J based simulators viz., a Retrieval Augmented Simulator and a Summary Controlled Simulator we generate diverse simulator-assistant interactions. Through reward and preference model-based evaluations, we find that these interactions serve as useful training data and create more helpful downstream assistants. We also find that incorporating knowledge through retrieval augmentation or summary control helps create better assistants.
2024.scichat-1.5
dhole-2024-kaucus
+
SarcEmp - Fine-tuning DialoGPT for Sarcasm and Empathy
diff --git a/data/xml/2024.signlang.xml b/data/xml/2024.signlang.xml
new file mode 100644
index 0000000000..44dbcd5910
--- /dev/null
+++ b/data/xml/2024.signlang.xml
@@ -0,0 +1,496 @@
+
+
+
+
+ Proceedings of the LREC-COLING 2024 11th Workshop on the Representation and Processing of Sign Languages: Evaluation of Sign Language Resources
+ EleniEfthimiou
+ Stavroula-EvitaFotinea
+ ThomasHanke
+ Julie A.Hochgesang
+ JohannaMesch
+ MarcSchulder
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.signlang-1
+ signlang
+
+
+ 2024.signlang-1.0
+ signlang-2024-lrec
+
+
+ Advancing Annotation for Continuous Data in Swiss German Sign Language
+ AlessiaBattisti
+ KatjaTissi
+ SandraSidler-Miserez
+ SarahEbling
+ 1–12
+ 2024.signlang-1.1
+ battisti-etal-2024-advancing
+
+
+ Person Identification from Pose Estimates in Sign Language
+ AlessiaBattisti
+ Emmavan den Bold
+ AnneGöhring
+ FranzHolzknecht
+ SarahEbling
+ 13–25
+ 2024.signlang-1.2
+ battisti-etal-2024-person
+
+
+ Data Integration, Annotation, and Transcription Methods for Sign Language Dialogue with Latency in Videoconferencing
+ MayumiBono
+ TomohiroOkada
+ VictorSkobov
+ RobertAdam
+ 26–35
+ 2024.signlang-1.3
+ bono-etal-2024-data
+
+
+ Evaluating the Alignment of Utterances in the Swedish Sign Language Corpus
+ CarlBörstell
+ 36–45
+ 2024.signlang-1.4
+ borstell-2024-evaluating
+
+
+ How to Approach Lexical Variation in Sign Language Corpora
+ CarlBörstell
+ 46–53
+ 2024.signlang-1.5
+ borstell-2024-approach
+
+
+ Systemic Biases in Sign Language AI Research: A Deaf-Led Call to Reevaluate Research Agendas
+ AashakaDesai
+ MaartjeDe Meulder
+ Julie A.Hochgesang
+ AnnemarieKocab
+ Alex X.Lu
+ 54–65
+ 2024.signlang-1.6
+ desai-etal-2024-systemic
+
+
+ Evaluating Inter-Annotator Agreement for Non-Manual Markers in Sign Languages
+ Lyke D.Esselink
+ MarloesOomen
+ FlorisRoelofsen
+ 66–76
+ 2024.signlang-1.7
+ esselink-etal-2024-evaluating
+
+
+ A software editor for the AZVD graphical Sign Language representation system
+ MichaelFilhol
+ Thomasvon Ascheberg
+ 77–85
+ 2024.signlang-1.8
+ filhol-von-ascheberg-2024-software
+
+
+ Content Questions in Sign Language – From theory to language description via corpus, experiments, and fieldwork
+ RobertGavrilescu
+ CarloGeraci
+ JohannaMesch
+ 86–94
+ 2024.signlang-1.9
+ gavrilescu-etal-2024-content
+
+
+ Matignon-LSF: a Large Corpus of Interpreted French Sign Language
+ JulieHalbout
+ DiandraFabre
+ YanisOuakrim
+ JulieLascar
+ AnneliesBraffort
+ MichèleGouiffès
+ DenisBeautemps
+ 95–101
+ 2024.signlang-1.10
+ halbout-etal-2024-matignon
+
+
+ Phonological Transcription of the Canadian Dictionary of ASL as a Language Resource
+ Kathleen CurrieHall
+ AnushkaAsthana
+ MaggieReid
+ YiranGao
+ GraceHobby
+ OksanaTkachman
+ KailiVesik
+ 102–110
+ 2024.signlang-1.11
+ hall-etal-2024-phonological
+
+
+ Retrospective of Kazakh-Russian Sign Language Corpus Formation
+ AlfarabiImashev
+ AigerimKydyrbekova
+ MedetMukushev
+ AnaraSandygulova
+ ShynggysIslam
+ KhassanIsrailov
+ AibekMakazhanov
+ ZhandosYessenbayev
+ 111–122
+ 2024.signlang-1.12
+ imashev-etal-2024-retrospective
+
+
+ Enhancing Syllabic Component Classification in Japanese Sign Language by Pre-training on Non-Japanese Sign Language Data
+ JundaiInoue
+ MakotoMiwa
+ YutakaSasaki
+ DaisukeHara
+ 123–130
+ 2024.signlang-1.13
+ inoue-etal-2024-enhancing
+
+
+ Building Your Query Step by Step: A Query Wizard for the MY DGS – ANNIS Portal of the DGS Corpus
+ AmyIsard
+ 131–139
+ 2024.signlang-1.14
+ isard-2024-building
+
+
+ Investigating Motion History Images and Convolutional Neural Networks for Isolated Irish Sign Language Fingerspelling Recognition
+ Hafiz Muhammad SarmadKhan
+ IreneMurtagh
+ Simon D.McLoughlin
+ 140–146
+ 2024.signlang-1.15
+ khan-etal-2024-investigating
+
+
+ Shedding Light on the Underexplored: Tackling the Minor Sign Language Research Topics
+ Jung-HoKim
+ ChangyongKo
+ MathewHuerta-Enochian
+ Seung YongKo
+ 147–158
+ 2024.signlang-1.16
+ kim-etal-2024-shedding
+
+
+ Headshakes in NGT: Relation between Phonetic Properties & Linguistic Functions
+ VadimKimmelman
+ MarloesOomen
+ RolandPfau
+ 159–167
+ 2024.signlang-1.17
+ kimmelman-etal-2024-headshakes
+
+
+ Nonmanual Marking of Questions in Balinese Homesign Interactions: a Computer-Vision Assisted Analysis
+ VadimKimmelman
+ AriPrice
+ JosefinaSafar
+ Conniede Vos
+ JanBulla
+ 168–177
+ 2024.signlang-1.18
+ kimmelman-etal-2024-nonmanual
+
+
+ An Extension of the NGT Dataset in Global Signbank
+ UlrikaKlomp
+ LisaGierman
+ PieterManders
+ EllenNauta
+ GomèrOtterspeer
+ RayPelupessy
+ GalyaStern
+ DaleneVenter
+ CasperWubbolts
+ MarloesOomen
+ FlorisRoelofsen
+ 178–183
+ 2024.signlang-1.19
+ klomp-etal-2024-extension
+
+
+ Corpus à la carte – Improving Access to the Public DGS Corpus
+ ReinerKonrad
+ ThomasHanke
+ AmyIsard
+ MarcSchulder
+ LutzKönig
+ JulianBleicken
+ OliverBöse
+ 184–193
+ 2024.signlang-1.20
+ konrad-etal-2024-corpus
+
+
+ Introducing the DW-DGS – The Digital Dictionary of DGS
+ GabrieleLanger
+ AnkeMüller
+ SabrinaWähl
+ FelicitasOtte
+ LeaSepke
+ ThomasHanke
+ 194–203
+ 2024.signlang-1.21
+ langer-etal-2024-introducing
+
+
+ Annotation of LSF subtitled videos without a pre-existing dictionary
+ JulieLascar
+ MichèleGouiffès
+ AnneliesBraffort
+ ClaireDanet
+ 204–212
+ 2024.signlang-1.22
+ lascar-etal-2024-annotation
+
+
+ Capturing Motion: Using Radar to Build Better Sign Language Corpora
+ EvieMalaia
+ JoshuaBorneman
+ SevgiGurbuz
+ 213–218
+ 2024.signlang-1.23
+ malaia-etal-2024-capturing
+
+
+ Exploring Latent Sign Language Representations with Isolated Signs, Sentences and In-the-Wild Data
+ FredrikMalmberg
+ AnnaKlezovich
+ JohannaMesch
+ JonasBeskow
+ 219–224
+ 2024.signlang-1.24
+ malmberg-etal-2024-exploring
+
+
+ Quantitative Analysis of Hand Locations in both Sign Language and Non-linguistic Gesture Videos
+ NielsMartínez-Guevara
+ ArturoCuriel
+ 225–234
+ 2024.signlang-1.25
+ martinez-guevara-curiel-2024-quantitative
+
+
+ Formal Representation of Interrogation in French Sign Language
+ EmmanuellaMartinod
+ MichaelFilhol
+ 235–243
+ 2024.signlang-1.26
+ martinod-filhol-2024-formal
+
+
+ Multilingual Synthesis of Depictions through Structured Descriptions of Sign: An Initial Case Study
+ JohnMcDonald
+ EleniEfthimiou
+ Stavroula-EvitaFotinea
+ RosaleeWolfe
+ 244–253
+ 2024.signlang-1.27
+ mcdonald-etal-2024-multilingual
+
+
+ Swedish Sign Language Resources from a User’s Perspective
+ JohannaMesch
+ ThomasBjörkstrand
+ EiraBalkstam
+ PatrickHansson
+ NikolausRiemer Kankkonen
+ 254–261
+ 2024.signlang-1.28
+ mesch-etal-2024-swedish
+
+
+ Sign Language Translation with Gloss Pair Encoding
+ TaroMiyazaki
+ SihanTan
+ TsubasaUchida
+ HiroyukiKaneko
+ 262–268
+ 2024.signlang-1.29
+ miyazaki-etal-2024-sign
+
+
+ SignCollect: A ‘Touchless’ Pipeline for Constructing Large-scale Sign Language Repositories
+ GomèrOtterspeer
+ UlrikaKlomp
+ FlorisRoelofsen
+ 269–275
+ 2024.signlang-1.30
+ otterspeer-etal-2024-signcollect
+
+
+ The EASIER Mobile Application and Avatar End-User Evaluation Methodology
+ FrankiePicron
+ DavyVan Landuyt
+ RehanaOmardeen
+ EleniEfthimiou
+ RosaleeWolfe
+ Stavroula-EvitaFotinea
+ TheodoreGoulas
+ ChristianTismer
+ MariaKopf
+ ThomasHanke
+ 276–281
+ 2024.signlang-1.31
+ picron-etal-2024-easier
+
+
+ VisuoLab: Building a sign language multilingual, multimodal and multifunctional platform
+ ChristianRathmann
+ Ronice Mullerde Quadros
+ ThomasGeißler
+ ChristianPeters
+ FranciscoFernandes
+ Milene PeixerLoio
+ DiegoFrança
+ 282–289
+ 2024.signlang-1.32
+ rathmann-etal-2024-visuolab
+
+
+ 3D-LEX v1.0 – 3D Lexicons for American Sign Language and Sign Language of the Netherlands
+ OlineRanum
+ GomèrOtterspeer
+ Jari I.Andersen
+ Robert G.Belleman
+ FlorisRoelofsen
+ 290–301
+ 2024.signlang-1.33
+ ranum-etal-2024-3d
+
+
+ Signbank 2.0 of Sign Languages: Easy to Administer, Easy to Use, Easy to Share
+ Ronice Mullerde Quadros
+ ChristianRathmann
+ Peter ZalánRomanek
+ FranciscoFernandes
+ StherCondé
+ 302–314
+ 2024.signlang-1.34
+ de-quadros-etal-2024-signbank
+
+
+ STK LSF: A Motion Capture Dataset in LSF for SignToKids
+ ClémentReverdy
+ SylvieGibet
+ ThibautLe Naour
+ 315–322
+ 2024.signlang-1.35
+ reverdy-etal-2024-stk
+
+
+ Preprocessing Mediapipe Keypoints with Keypoint Reconstruction and Anchors for Isolated Sign Language Recognition
+ KyunggeunRoh
+ HuijeLee
+ Eui JunHwang
+ SukminCho
+ Jong C.Park
+ 323–334
+ 2024.signlang-1.36
+ roh-etal-2024-preprocessing
+
+
+ Decoding Sign Languages: The SL-FE Framework for Phonological Analysis and Automated Annotation
+ KarahanŞahin
+ KadirGökgöz
+ 335–342
+ 2024.signlang-1.37
+ sahin-gokgoz-2024-decoding
+
+
+ Signs and Synonymity: Continuing Development of the Multilingual Sign Language Wordnet
+ MarcSchulder
+ SamBigeard
+ MariaKopf
+ ThomasHanke
+ AnnaKuder
+ JoannaWójcicka
+ JohannaMesch
+ ThomasBjörkstrand
+ AnnaVacalopoulou
+ KyriakiVasilaki
+ TheodoreGoulas
+ Stavroula-EvitaFotinea
+ EleniEfthimiou
+ 343–353
+ 2024.signlang-1.38
+ schulder-etal-2024-signs
+
+
+ Facial Expressions for Sign Language Synthesis using FACSHuman and AZee
+ ParitoshSharma
+ CamilleChallant
+ MichaelFilhol
+ 354–360
+ 2024.signlang-1.39
+ sharma-etal-2024-facial
+
+
+ Eye Blink Detection in Sign Language Data Using CNNs and Rule-Based Methods
+ MargauxSusman
+ VadimKimmelman
+ 361–369
+ 2024.signlang-1.40
+ susman-kimmelman-2024-eye
+
+
+ SEDA: Simple and Effective Data Augmentation for Sign Language Understanding
+ SihanTan
+ TaroMiyazaki
+ KatsutoshiItoyama
+ KazuhiroNakadai
+ 370–375
+ 2024.signlang-1.41
+ tan-etal-2024-seda
+
+
+ HamNoSys-based Motion Editing Method for Sign Language
+ TsubasaUchida
+ TaroMiyazaki
+ HiroyukiKaneko
+ 376–385
+ 2024.signlang-1.42
+ uchida-etal-2024-hamnosys
+
+
+ SignaMed: a Cooperative Bilingual LSE-Spanish Dictionary in the Healthcare Domain
+ ManuelVázquez-Enríquez
+ José LuisAlba-Castro
+ AniaPérez-Pérez
+ CarmenCabeza-Pereiro
+ LauraDocío-Fernández
+ 386–394
+ 2024.signlang-1.43
+ vazquez-enriquez-etal-2024-signamed
+
+
+ Diffusion Models for Sign Language Video Anonymization
+ ZhaoyangXia
+ YangZhou
+ LigongHan
+ CarolNeidle
+ Dimitris N.Metaxas
+ 395–407
+ 2024.signlang-1.44
+ xia-etal-2024-diffusion
+
+
+ A Multimodal Spatio-Temporal GCN Model with Enhancements for Isolated Sign Recognition
+ YangZhou
+ ZhaoyangXia
+ YuxiaoChen
+ CarolNeidle
+ Dimitris N.Metaxas
+ 408–419
+ 2024.signlang-1.45
+ zhou-etal-2024-multimodal
+
+
+
diff --git a/data/xml/2024.sigtyp.xml b/data/xml/2024.sigtyp.xml
index bf94e98352..f9de00f525 100644
--- a/data/xml/2024.sigtyp.xml
+++ b/data/xml/2024.sigtyp.xml
@@ -97,6 +97,7 @@
LAJaR (Language Atlas of Japanese and Ryukyuan) is a linguistic typology database focusing on micro-variation of the Japonic (Japanese and Ryukyuan) languages. This paper aims to report the design and progress of this ongoing database project. Finally, we also show a case study utilizing its database on zero copulas among the Japonic languages.
2024.sigtyp-1.7
kato-etal-2024-language
+
GTNC: A Many-To-One Dataset of Google Translations from NewsCrawl
@@ -117,6 +118,7 @@
Emotion classification is a challenging task in NLP due to the inherent idiosyncratic and subjective nature of linguistic expression, especially with code-mixed data. Pre-trained language models (PLMs) have achieved high performance for many tasks and languages, but it remains to be seen whether these models learn and are robust to the differences in emotional expression across languages. Sociolinguistic studies have shown that Hinglish speakers switch to Hindi when expressing negative emotions and to English when expressing positive emotions. To understand if language models can learn these associations, we study the effect of language on emotion prediction across 3 PLMs on a Hinglish emotion classification dataset. Using LIME and token level language ID, we find that models do learn these associations between language choice and emotional expression. Moreover, having code-mixed data present in the pre-training can augment that learning when task-specific data is scarce. We also conclude from the misclassifications that the models may overgeneralise this heuristic to other infrequent examples where this sociolinguistic phenomenon does not apply.
2024.sigtyp-1.9
tatariya-etal-2024-sociolinguistically
+
A Call for Consistency in Reporting Typological Diversity
@@ -140,6 +142,7 @@
In traditional studies on language evolution, scholars often emphasize the importance of sound laws and sound correspondences for phylogenetic inference of language family trees. However, to date, computational approaches have typically not taken this potential into account. Most computational studies still rely on lexical cognates as major data source for phylogenetic reconstruction in linguistics, although there do exist a few studies in which authors praise the benefits of comparing words at the level of sound sequences. Building on (a) ten diverse datasets from different language families, and (b) state-of-the-art methods for automated cognate and sound correspondence detection, we test, for the first time, the performance of sound-based versus cognate-based approaches to phylogenetic reconstruction. Our results show that phylogenies reconstructed from lexical cognates are topologically closer, by approximately one third with respect to the generalized quartet distance on average, to the gold standard phylogenies than phylogenies reconstructed from sound correspondences.
2024.sigtyp-1.11
hauser-etal-2024-sounds
+
Compounds in Universal Dependencies: A Survey in Five European Languages
@@ -149,6 +152,7 @@
In Universal Dependencies, compounds, which we understand as words containing two or more roots, are represented according to tokenization, which reflects the orthographic conventions of the language. A closed compound (e.g. <i>waterfall</i>) corresponds to a single word in Universal Dependencies while a hyphenated compound (<i>father-in-law</i>) and an open compound (<i>apple pie</i>) to multiple words. The aim of this paper is to open a discussion on how to move towards a more consistent annotation of compounds. The solution we argue for is to represent the internal structure of all compound types analogously to syntactic phrases, which would not only increase the comparability of compounding within and across languages, but also allow comparisons of compounds and syntactic phrases.
2024.sigtyp-1.12
svoboda-sevcikova-2024-compounds
+
Predicting positive transfer for improved low-resource speech recognition using acoustic pseudo-tokens
@@ -212,6 +216,7 @@
In this paper, we describe Allen AI’s submission to the constrained track of the SIGTYP 2024 Shared Task. Using only the data provided by the organizers, we pretrained a transformer-based multilingual model, then finetuned it on the Universal Dependencies (UD) annotations of a given language for a downstream task. Our systems achieved decent performance on the test set, beating the baseline in most language-task pairs, yet struggle with subtoken tags in multiword expressions as seen in Coptic and Ancient Hebrew. On the validation set, we obtained ≥70% F1-score on most language-task pairs. In addition, we also explored the cross-lingual capability of our trained models. This paper highlights our pretraining and finetuning process, and our findings from our internal evaluations.
2024.sigtyp-1.18
miranda-2024-allen
+
Findings of the SIGTYP 2024 Shared Task on Word Embedding Evaluation for Ancient and Historical Languages
@@ -225,6 +230,7 @@
This paper discusses the organisation and findings of the SIGTYP 2024 Shared Task on Word Embedding Evaluation for Ancient and Historical Languages. The shared task was split into the constrained and unconstrained tracks and involved solving either 3 or 5 problems for either 13 or 16 ancient and historical languages belonging to 4 language families, and making use of 6 different scripts. There were 14 registrations in total, of which 3 teams submitted to each track. Out of these 6 submissions, 2 systems were successful in the constrained setting and another 2 in the unconstrained setting, and 4 system description papers were submitted by different teams. The best average result for morphological feature prediction was about 96%, while the best average results for POS-tagging and lemmatisation were 96% and 94% respectively. At the word level, the winning team could not achieve a higher average accuracy across all 16 languages than 5.95%, which demonstrates the difficulty of this problem. At the character level, the best average result over 16 languages was 55.62%.
2024.sigtyp-1.19
dereza-etal-2024-findings
+
diff --git a/data/xml/2024.sigul.xml b/data/xml/2024.sigul.xml
new file mode 100644
index 0000000000..2f4d28e973
--- /dev/null
+++ b/data/xml/2024.sigul.xml
@@ -0,0 +1,572 @@
+
+
+
+
+ Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024
+ MaiteMelero
+ SakrianiSakti
+ ClaudiaSoria
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.sigul-1
+ sigul
+ ws
+
+
+ 2024.sigul-1.0
+ sigul-2024-special
+
+
+ A Bit of a Problem: Measurement Disparities in Dataset Sizes across Languages
+ CatherineArnett
+ Tyler A.Chang
+ BenjaminBergen
+ 1–9
+ How should text dataset sizes be compared across languages? Even for content-matched (parallel) corpora, UTF-8 encoded text can require a dramatically different number of bytes for different languages. In our work, we define the byte premium between two languages as the ratio of bytes used to encode content-matched text in those languages. We compute byte premiums for 1155 languages, and we use linear regressions to estimate byte premiums for other languages. We release a tool to obtain byte premiums for any two languages, enabling comparisons of dataset sizes across languages for more equitable multilingual model development and data practices.
+ 2024.sigul-1.1
+ arnett-etal-2024-bit
+
+
+ A Novel Corpus for Automated Sexism Identification on Social Media
+ Lutfiye SedaMut Altin
+ HoracioSaggion
+ 10–15
+ In this paper, we present a novel dataset for the study of automated sexism identification and categorization on social media in Turkish. For this purpose, we have collected, following a well established methodology, a set of Tweets and YouTube comments. Relying on expert organizations in the area of gender equality, each text has been annotated based on a two-level labelling schema derived from previous research. Our resulting dataset consists of around 7,000 annotated instances useful for the study of expressions of sexism and misogyny on the Web. To the best of our knowledge, this is the first two-level manually annotated comprehensive Turkish dataset for sexism identification. In order to fuel research in this relevant area, we also present the result of our benchmarking experiments in the area of sexism identification in Turkish.
+ 2024.sigul-1.2
+ mut-altin-saggion-2024-novel
+
+
+ Advancing Generative AI for Portuguese with Open Decoder Gervásio PT*
+ RodrigoSantos
+ João RicardoSilva
+ LuísGomes
+ JoãoRodrigues
+ AntónioBranco
+ 16–26
+ To advance the neural decoding of Portuguese, in this paper we present a fully open Transformer-based, instruction-tuned decoder model that sets a new state of the art in this respect. To develop this decoder, which we named Gervásio PT*, a strong LLaMA 2 7B model was used as a starting point, and its further improvement through additional training was done over language resources that include new instruction data sets of Portuguese prepared for this purpose, which are also contributed in this paper. All versions of Gervásio are open source and distributed for free under an open license, including for either research or commercial usage, and can be run on consumer-grade hardware, thus seeking to contribute to the advancement of research and innovation in language technology for Portuguese.
+ 2024.sigul-1.3
+ santos-etal-2024-advancing
+
+
+ Assessing Pre-Built Speaker Recognition Models for Endangered Language Data
+ Gina-AnneLevow
+ 27–32
+ Significant research has focused on speaker recognition, determining which speaker is speaking in a segment of audio. However, few experiments have investigated speaker recognition for very low-resource or endangered languages. Furthermore, speaker recognition has the potential to support language documentation and revitalization efforts, making recordings more accessible to researchers and communities. Since endangered language datasets are too small to build competitive speaker representations from scratch, we investigate the application of large-scale pre-built speaker recognition models to bridge this gap. This paper compares four speaker recognition models on six diverse endangered language data sets. Comparisons contrast three recent neural network-based x-vector models and an earlier baseline i-vector model. Experiments demonstrate significantly stronger performance for some of the studied models. Further analysis highlights differences in effectiveness tied to the lengths of test audio segments and amount of data used for speaker modeling.
+ 2024.sigul-1.4
+ levow-2024-assessing
+
+
+ BERTbek: A Pretrained Language Model for Uzbek
+ ElmurodKuriyozov
+ DavidVilares
+ CarlosGómez-Rodríguez
+ 33–44
+ Recent advances in neural network-based language representation have made it possible for pretrained language models to outperform previous models in many downstream natural language processing (NLP) tasks. These pretrained language models have also shown that, if large enough, they exhibit good few-shot abilities, which is especially beneficial for low-resource scenarios. In this respect, although there are some large-scale multilingual pretrained language models available, language-specific pretrained models have been demonstrated to be more accurate for monolingual evaluation setups. In this work, we present BERTbek - pretrained language models based on the BERT (Bidirectional Encoder Representations from Transformers) architecture for the low-resource Uzbek language. We also provide a comprehensive evaluation of the models on a number of NLP tasks: sentiment analysis, multi-label topic classification, and named entity recognition, comparing the models with various machine learning methods as well as multilingual BERT (mBERT). Experimental results indicate that our models outperform mBERT and other task-specific baseline models in all three tasks. Additionally, we also show the impact of training data size and quality on the downstream performance of BERT models, by training three different models with different text sources and corpus sizes.
+ 2024.sigul-1.5
+ kuriyozov-etal-2024-bertbek
+
+
+ Beyond Error Categories: A Contextual Approach of Evaluating Emerging Spell and Grammar Checkers
+ ÞórunnArnardóttir
+ Svanhvít LiljaIngólfsdóttir
+ Haukur BarriSímonarson
+ HafsteinnEinarsson
+ Anton KarlIngason
+ VilhjálmurÞorsteinsson
+ 45–52
+ Automatic spell and grammar checking can be done using various system architectures, and large language models have recently been used to solve the task with promising results. Here we describe a new method of creating test data to measure the performance of spell and grammar checkers, including large language models. Three types of test data represent different approaches to evaluation, from basic error detection to error correction with natural language explanations of the corrections made and error severity scores, which is the main novelty of this approach. These additions are especially useful when evaluating large language models. We present a spell and grammar checking test set for Icelandic in which the described approach is applied. The data consists of whole texts instead of discrete sentences, which facilitates evaluating context awareness of models. The resulting test set can be used to compare different spell and grammar checkers and is published under permissive licenses.
+ 2024.sigul-1.6
+ arnardottir-etal-2024-beyond
+
+
+ Bidirectional English-Nepali Machine Translation (MT) System for Legal Domain
+ ShabdapurushPoudel
+ Bal KrishnaBal
+ PraveenAcharya
+ 53–58
+ Nepali, a low-resource language belonging to the Indo-Aryan language family and spoken in Nepal, India, Sikkim, and Burma, has comparatively very little digital content and resources, more particularly in the legal domain. However, the need to translate legal documents is ever-increasing in the context of growing volumes of legal cases and a large population seeking to go abroad for higher education or employment. This underscores the need for developing an English-Nepali Machine Translation system for the legal domain. We attempt to address this problem by utilizing a Neural Machine Translation (NMT) System with an encoder-decoder architecture, specifically designed for legal Nepali-English translation. Leveraging a custom-built legal corpus of 125,000 parallel sentences, our system achieves encouraging BLEU scores of 7.98 in the Nepali → English direction and 6.63 in the English → Nepali direction.
+ 2024.sigul-1.7
+ poudel-etal-2024-bidirectional
+
+
+ BK3AT: Bangsamoro K-3 Children’s Speech Corpus for Developing Assessment Tools in the Bangsamoro Languages
+ Kiel D.Gonzales
+ Jazzmin R.Maranan
+ Francis Paolo D.Santelices
+ Edsel Jedd M.Renovalles
+ Nissan D.Macale
+ Nicole Anne A.Palafox
+ Jose Marie A.Mendoza
+ 59–65
+ Bangsamoro languages are among the under-resourced languages in the Mindanao region in the Philippines. Moreover, there is currently no publicly available children’s speech data for most of these languages. The BK3AT children’s speech corpus is a corpus designed for creating speech technologies that could help facilitators and teachers in K-3 education. The corpus consists of 122 hours of children’s speech data across 10 languages: Bahasa Sug, Chavacano, English, Filipino, Iranun, Maguindanaon, Meranaw, Sinama, Teduray, and Yakan. Preliminary experiments using the Wav2Vec-XLSR architecture have been done in fine-tuning on the Tagalog and L2 English corpus subsets to develop an automatic speech recognition backend for literacy assessment. Results from the experiments show low word error rates (WERs) for small-vocabulary and targeted domains.
+ 2024.sigul-1.8
+ gonzales-etal-2024-bk3at
+
+
+ CorpusArièja: Building an Annotated Corpus with Variation in Occitan
+ ClamencaPoujade
+ MyriamBras
+ AssafUrieli
+ 66–71
+ The Occitan language is a less resourced language and is classified as ‘in danger’ by UNESCO. It is therefore important to build resources and tools that can help to safeguard and develop the digitisation of the language. CorpusArièja is a collection of 72 texts (just over 41,000 tokens) in the Occitan language of the French department of Ariège. The majority of the texts needed to be digitised and processed with Optical Character Recognition. This corpus contains dialectal and spelling variation, but is limited to prose, without diachronic variation or genre variation. It is an annotated corpus with two levels of lemmatisation, POS tags and verbal inflection. One of the main aims of the corpus is to enable the conception of tools that can automatically annotate all Occitan texts, regardless of the dialect or spelling used. The Ariège territory is interesting because it includes the two kinds of variation we focus on, dialectal and spelling, and has many authors who write in their native variety of Occitan.
+ 2024.sigul-1.9
+ poujade-etal-2024-corpusarieja
+
+
+ Developing Infrastructure for Low-Resource Language Corpus Building
+ Hedwig G.Sekeres
+ WilbertHeeringa
+ Wietsede Vries
+ Oscar YdeZwagers
+ MartijnWieling
+ Goffe Th.Jensma
+ 72–78
+ For many of the world’s small languages, few resources are available. In this project, a written online accessible corpus was created for the minority language variant Gronings, which serves both researchers interested in language change and variation and a general audience of (new) speakers interested in finding real-life examples of language use. The corpus was created using a combination of volunteer work and automation, which together formed an efficient pipeline for converting printed text to Key Words in Context (KWICs), annotated with lemmas and part-of-speech tags. In the creation of the corpus, we have taken into account several of the challenges that can occur when creating resources for minority languages, such as a lack of standardisation and limited (financial) resources. As the solutions we offer are applicable to other small languages as well, each step of the corpus creation process is discussed and resources will be made available benefiting future projects on other low-resource languages.
+ 2024.sigul-1.10
+ sekeres-etal-2024-developing
+
+
+ Evaluating Icelandic Sentiment Analysis Models Trained on Translated Data
+ Ólafur A.Jóhannsson
+ Birkir H.Arndal
+ Eysteinn Ö.Jónsson
+ StefanOlafsson
+ HrafnLoftsson
+ 79–89
+ We experiment with sentiment classification models for Icelandic that leverage machine-translated data for training. Since no large sentiment dataset exists for Icelandic, we translate 50,000 English IMDb reviews, classified either as positive or negative, into Icelandic using two services: Google Translate and GreynirTranslate. After machine translation, we assess whether the sentiment of the source language text is retained in the target language. Moreover, we evaluate the accuracy of the sentiment classifiers on non-translated Icelandic text. The performance of three types of baseline classifiers is compared, i.e., Support Vector Machines, Logistic Regression and Naive Bayes, when trained on translated data generated by either translation service. Furthermore, we fine-tune and evaluate three pre-trained transformer-based models, RoBERTa, IceBERT and ELECTRA, on both the original English texts and the translated texts. Our results indicate that the transformer models perform better than the baseline classifiers on all datasets. Moreover, our evaluation shows that the transformer models trained on data translated from English reviews can be used to effectively classify sentiment on non-translated Icelandic movie reviews.
+ 2024.sigul-1.11
+ johannsson-etal-2024-evaluating
+
+
+ Exploring Text Classification for Enhancing Digital Game-Based Language Learning for Irish
+ LeonaMc Cahill
+ ThomasBaltazar
+ SallyBruen
+ LiangXu
+ MonicaWard
+ ElaineUí Dhonnchadha
+ JenniferFoster
+ 90–96
+ Digital game-based language learning (DGBLL) can help with the language learning process. DGBLL applications can make learning more enjoyable and engaging, but they are difficult to develop. A DGBLL app that relies on target language texts obviously needs to be able to use texts of the appropriate level for the individual learners. This implies that text classification tools should be available to DGBLL developers, who may not be familiar with the target language, in order to incorporate suitable texts into their games. While text difficulty classifiers exist for many of the most commonly spoken languages, this is not the case for under-resourced languages, such as Irish. In this paper, we explore approaches to the development of text classifiers for Irish. In the first approach to text analysis and grading, we apply linguistic analysis to assess text complexity. Features from this approach are then used in machine learning-based text classification, which explores the application of a number of machine learning algorithms to the problem. Although the development of these text classifiers is at an early stage, they show promise, particularly in a low-resourced scenario.
+ 2024.sigul-1.12
+ mc-cahill-etal-2024-exploring
+
+
+ Forget NLI, Use a Dictionary: Zero-Shot Topic Classification for Low-Resource Languages with Application to Luxembourgish
+ FredPhilippy
+ ShohrehHaddadan
+ SiwenGuo
+ 97–104
+ In NLP, zero-shot classification (ZSC) is the task of assigning labels to textual data without any labeled examples for the target classes. A common method for ZSC is to fine-tune a language model on a Natural Language Inference (NLI) dataset and then use it to infer the entailment between the input document and the target labels. However, this approach faces certain challenges, particularly for languages with limited resources. In this paper, we propose an alternative solution that leverages dictionaries as a source of data for ZSC. We focus on Luxembourgish, a low-resource language spoken in Luxembourg, and construct two new topic relevance classification datasets based on a dictionary that provides various synonyms, word translations and example sentences. We evaluate the usability of our dataset and compare it with the NLI-based approach on two topic classification tasks in a zero-shot manner. Our results show that by using the dictionary-based dataset, the trained models outperform the ones following the NLI-based approach for ZSC. While we focus on a single low-resource language in this study, we believe that the efficacy of our approach can also transfer to other languages where such a dictionary is available.
+ 2024.sigul-1.13
+ philippy-etal-2024-forget
+
+
+ Fostering the Ecosystem of Open Neural Encoders for Portuguese with Albertina PT* Family
+ RodrigoSantos
+ JoãoRodrigues
+ LuísGomes
+ João RicardoSilva
+ AntónioBranco
+ HenriqueLopes Cardoso
+ Tomás FreitasOsório
+ BernardoLeite
+ 105–114
+ To foster the neural encoding of Portuguese, this paper contributes foundation encoder models that represent an expansion of the still very scarce ecosystem of large language models specifically developed for this language that are fully open, in the sense that they are open source and openly distributed for free under an open license for any purpose, thus including research and commercial usages. Like most languages other than English, Portuguese is low-resourced in terms of these foundational language resources, the only existing models being the inaugural 900 million parameter Albertina and the 335 million parameter Bertimbau. Taking this pair of models as an inaugural set, we present the extension of the ecosystem of state-of-the-art open encoders for Portuguese with a larger, top performance-driven model with 1.5 billion parameters, and a smaller, efficiency-driven model with 100 million parameters. While achieving this primary goal, further results that are relevant for this ecosystem were obtained as well, namely new datasets for Portuguese based on the SuperGLUE benchmark, which we also distribute openly.
+ 2024.sigul-1.14
+ santos-etal-2024-fostering
+
+
+ Improving Language Coverage on HeLI-OTS
+ TommiJauhiainen
+ KristerLindén
+ 115–125
+ In this paper, we add under-resourced languages into the language repertoire of an existing off-the-shelf language identifier, HeLI-OTS. Adding more languages to a language identifier often comes with the drawback of lessened accuracy for the languages already part of the repertoire. We aim to minimize this effect. As sources for training and development data in the new languages, we use the OpenLID and FLORES-200 datasets. They are openly available high-quality datasets that are especially well-suited for language identifier development. By carefully inspecting the effect of each added language and the quality of their training and development data, we managed to add support for 20 new under-resourced languages to HeLI-OTS without affecting the performance of any existing languages to a noticeable extent.
+ 2024.sigul-1.15
+ jauhiainen-linden-2024-improving
+
+
+ Improving Legal Judgement Prediction in Romanian with Long Text Encoders
+ MihaiMasala
+ TraianRebedea
+ HoriaVelicu
+ 126–132
+ In recent years, the entire field of Natural Language Processing (NLP) has enjoyed amazing novel results, achieving almost human-like performance on a variety of tasks. The Legal NLP domain has also been part of this process, as it has seen impressive growth. However, general-purpose models are not readily applicable to the legal domain. Due to the nature of the domain (e.g. specialized vocabulary, long documents), specific models and methods are often needed for Legal NLP. In this work we investigate both specialized and general models for predicting the final ruling of a legal case, a task known as Legal Judgment Prediction (LJP). We particularly focus on methods to extend the sequence length of Transformer-based models to better understand the long documents present in legal corpora. Extensive experiments on 4 LJP datasets in Romanian, originating from 2 sources with significantly different sizes and document lengths, show that specialized models and handling long texts are critical for good performance.
+ 2024.sigul-1.16
+ masala-etal-2024-improving
+
+
+ Improving Noisy Student Training for Low-resource Languages in End-to-End ASR Using CycleGAN and Inter-domain Losses
+ Chia-YuLi
+ Ngoc ThangVu
+ 133–142
+ Training a semi-supervised end-to-end speech recognition system using noisy student training has significantly improved performance. However, this approach requires a substantial amount of paired speech-text and unlabeled speech, which is costly for low-resource languages. Therefore, this paper considers a more extreme case of semi-supervised end-to-end automatic speech recognition where there is limited paired speech-text data, limited unlabeled speech (less than five hours), and abundant external text. Firstly, we observe improved performance by training the model using our previous work on semi-supervised learning “CycleGAN and inter-domain losses” solely with external text. Secondly, we enhance “CycleGAN and inter-domain losses” by incorporating automatic hyperparameter tuning, which we call “enhanced CycleGAN inter-domain losses.” Thirdly, we integrate it into the noisy student training approach pipeline for low-resource scenarios. Our experimental results, conducted on six non-English languages from Voxforge and Common Voice, show a 20% word error rate reduction compared to the baseline teacher model and a 10% word error rate reduction compared to the baseline best student model, highlighting the significant improvements achieved through our proposed method.
+ 2024.sigul-1.17
+ li-vu-2024-improving
+
+
+ Indonesian-English Code-Switching Speech Recognition Using the Machine Speech Chain Based Semi-Supervised Learning
+ Rais Vaza ManTazakka
+ DessiLestari
+ AyuPurwarianti
+ DiptaTanaya
+ KurniawatiAzizah
+ SakrianiSakti
+ 143–148
+ Indonesia is home to a diverse linguistic landscape, where individuals seamlessly transition between Indonesian, English, and local dialects in their everyday conversations—a phenomenon known as code-switching. Understanding and accommodating this linguistic fluidity is essential, particularly in the development of accurate speech recognition systems. However, tackling code-switching in Indonesian poses a challenge due to the scarcity of paired code-switching data. Thus, this study endeavors to address Indonesian-English code-switching in speech recognition, leveraging unlabeled data and employing a semi-supervised technique known as the machine speech chain. Our findings demonstrate that the machine speech chain method effectively enhances Automatic Speech Recognition (ASR) performance in recognizing code-switching between Indonesian and English, utilizing previously untapped resources of unlabeled data.
+ 2024.sigul-1.18
+ tazakka-etal-2024-indonesian
+
+
+ Inter-language Transfer Learning for Visual Speech Recognition toward Under-resourced Environments
+ FumiyaKondo
+ SatoshiTamura
+ 149–154
+ In this study, we introduce a method of inter-language transfer learning for under-resourced visual speech recognition. Deploying speech-related technology to all languages is a quite important activity. However, applying state-of-the-art deep-learning techniques requires huge-size labeled corpora, which makes it hard for under-resourced languages. Our approach leverages a small amount of labeled video data of the target language, and employs inter-language transfer learning using a pre-trained English lip-reading model. By applying the proposed scheme, we build a Japanese lip-reading model, using the ROHAN corpus, the size of which is about one 450th of the size of English datasets. The front-end encoder part of the pre-trained model is fine-tuned to improve the acquisition of pronunciation and lip movement patterns unique to Japanese. On the other hand, the back-end encoder and the decoder are built using the Japanese dataset. Although English and Japanese have different language structures, evaluation experiments show that it is possible to build the Japanese lip-reading model efficiently. Comparison with competitive schemes demonstrates the effectiveness of our method.
+ 2024.sigul-1.19
+ kondo-tamura-2024-inter
+
+
+ Investigating Neural Machine Translation for Low-Resource Languages: Using Bavarian as a Case Study
+ Wan-huaHer
+ UdoKruschwitz
+ 155–167
+ Machine Translation has made impressive progress in recent years, offering close to human-level performance on many languages, but studies have primarily focused on high-resource languages with broad online presence and resources. With the help of growing Large Language Models, more and more low-resource languages achieve better results through the presence of other languages. However, studies have shown that not all low-resource languages can benefit from multilingual systems, especially those with insufficient training and evaluation data. In this paper, we revisit state-of-the-art Neural Machine Translation techniques to develop automatic translation systems between German and Bavarian. We investigate conditions of low-resource languages such as data scarcity and parameter sensitivity and focus on refined solutions that combat low-resource difficulties and creative solutions such as harnessing language similarity. Our experiment entails applying Back-translation and Transfer Learning to automatically generate more training data and achieve higher translation performance. We demonstrate noisiness in the data and present our approach to carry out text preprocessing extensively. Evaluation was conducted using combined metrics: BLEU, chrF and TER. Statistical significance results with Bonferroni correction show that baseline systems are surprisingly strong and that Back-translation leads to significant improvement. Furthermore, we present a qualitative analysis of translation errors and system limitations.
+ 2024.sigul-1.20
+ her-kruschwitz-2024-investigating
+
+
+ Italian-Ligurian Machine Translation in Its Cultural Context
+ Christopher R.Haberland
+ JeanMaillard
+ StefanoLusito
+ 168–176
+ Large multilingual machine translation efforts are driving improved access and performance for under-resourced languages, but often fail to translate culturally specific and local concepts. Additionally, translation from practically relevant input languages may lag behind those that are comparatively over-represented in the training dataset. In this work, we release a new corpus, ZenaMT, containing 7,561 parallel Ligurian-Italian sentences, nearly a fifth of which are also translated in English. This corpus spans five domains: local and international news, Ligurian literature, Genoese Ligurian linguistics concepts, traditional card game rules, and Ligurian geographic expressions. We find that a translation model augmented with ZenaMT improves a baseline by 20%, and by over 25% (BLEU) compared to NLLB-3.3B, which is over 50 times the size. Our results demonstrate the utility of creating data sets for MT that are specifically tailored for the cultural context of Ligurian speakers. We freely release ZenaMT and expect to periodically update the corpus to improve MT performance and domain coverage.
+ 2024.sigul-1.21
+ haberland-etal-2024-italian
+
+
+ Labadain-30k+: A Monolingual Tetun Document-Level Audited Dataset
+ Gabrielde Jesus
+ SérgioNunes
+ 177–188
+ This paper introduces Labadain-30k+, a monolingual dataset comprising 33.6k documents in Tetun, a low-resource language spoken in Timor-Leste. The dataset was acquired through web crawling and augmented with Wikipedia documents released by Wikimedia. Both sets of documents underwent thorough manual audits at the document level by native Tetun speakers, resulting in the construction of a Tetun text dataset well-suited for a variety of natural language processing and information retrieval tasks. This dataset was employed to conduct a comprehensive content analysis aimed at providing a nuanced understanding of document composition and the evolution of Tetun documents on the web. The analysis revealed that news articles constitute the predominant documents within the dataset, accounting for 89.87% of the total, followed by Wikipedia documents at 4.34%, and legal and governmental documents at 3.65%, among others. Notably, there was a substantial increase in the number of documents in 2020, indicating an 11.75 percentage point rise in document quantity, compared to an average of 4.76 percentage points per year from 2001 to 2023. Moreover, the year 2017, marked by the increased popularity of online news in Tetun, served as a threshold for analyzing the evolution of document writing on the web pre- and post-2017, specifically regarding vocabulary usage. Surprisingly, this analysis showed a significant increase of 6.12 percentage points in Tetun text adhering to the official Tetun standard. Additionally, the persistence of Portuguese loanwords in that trajectory remained evident, reflecting an increase of 5.09 percentage points.
+ 2024.sigul-1.22
+ de-jesus-nunes-2024-labadain
+
+
+ Language Models on a Diet: Cost-Efficient Development of Encoders for Closely-Related Languages via Additional Pretraining
+ NikolaLjubešić
+ VítSuchomel
+ PeterRupnik
+ TajaKuzman
+ Rikvan Noord
+ 189–203
+ The world of language models is going through turbulent times: better and ever larger models are coming out at an unprecedented speed. However, we argue that, especially for the scientific community, encoder models of up to 1 billion parameters are still very much needed, their primary usage being in enriching large collections of data with metadata necessary for downstream research. We investigate the best way to ensure the existence of such encoder models for the set of very closely related languages - Croatian, Serbian, Bosnian and Montenegrin - by setting up a diverse benchmark for these languages, and comparing the trained-from-scratch models with the new models constructed via additional pretraining of existing multilingual models. We show that comparable performance to dedicated from-scratch models can be obtained by additionally pretraining available multilingual models even with a limited amount of computation. We also show that neighboring languages, in our case Slovenian, can be included in the additional pretraining with little to no loss in the performance of the final model.
+ 2024.sigul-1.23
+ ljubesic-etal-2024-language
+
+
+ Man or Machine: Evaluating Spelling Error Detection in Danish Newspaper Corpora
+ EckhardBick
+ Jonas NygaardBlom
+ MarianneRathje
+ JørgenSchack
+ 204–211
+ This paper evaluates frequency and detection performance for both spelling and grammatical errors in a corpus of published Danish newspaper texts, comparing the results of three human proofreaders with those of an automatic system, DanProof. Adopting the error categorization scheme of the latter, we look at the accuracy of individual error types and their relative distribution over time, as well as the adequacy of suggested corrections. Finally, we discuss so-called artefact errors introduced by corpus processing, and the potential of DanProof as a corpus cleaning tool for identifying and correcting format conversion, OCR or other compilation errors. In the evaluation, with balanced F1-scores of 77.6 and 67.6 for 1999 texts and 2019 texts, respectively, DanProof achieved a higher recall and accuracy than the individual human annotators, and contributed the largest share of errors not detected by others (16.4% for 1999 and 23.6% for 2019). However, the human annotators had a significantly higher precision. Not counting artefacts, the overall error frequency in the corpus was low (< 0.5%), and less than half in the newer texts compared to the older ones, a change that mostly concerned orthographical errors, with a correspondingly higher relative share of grammatical errors.
+ 2024.sigul-1.24
+ bick-etal-2024-man
+
+
+ Managing Fine-grained Metadata for Text Bases in Extremely Low Resource Languages: The Cases of Two Regional Languages of France
+ MarianneVergez-Couret
+ DelphineBernhard
+ MichaelNauge
+ MyriamBras
+ PabloRuiz Fabo
+ CaroleWerner
+ 212–221
+ Metadata are key components of language resources and facilitate their exploitation and re-use. Their creation is a labour-intensive process and requires a modeling step, which identifies resource-specific information as well as standards and controlled vocabularies that can be reused. In this article, we focus on metadata for documenting text bases for regional languages of France characterised by several levels of variation (space, time, usage, social status), based on a survey of existing metadata schemas. Moreover, we implement our metadata model as a database structure for the Heurist data management system, which combines both the ease of use of spreadsheets and the ability to model complex relationships between entities of relational databases. The Heurist template is made freely available and was used to describe metadata for text bases in Alsatian and Poitevin-Saintongeais. We also propose tools to automatically generate XML metadata header files from the database.
+ 2024.sigul-1.25
+ vergez-couret-etal-2024-managing
+
+
+ Mixat: A Data Set of Bilingual Emirati-English Speech
+ Maryam KhalifaAl Ali
+ HananAldarmaki
+ 222–226
+ This paper introduces Mixat: a dataset of Emirati speech code-mixed with English. Mixat was developed to address the shortcomings of current speech recognition resources when applied to Emirati speech, and in particular, to bilingual Emirati speakers who often mix and switch between their local dialect and English. The data set consists of 15 hours of speech derived from two public podcasts featuring native Emirati speakers, one of which is in the form of conversations between the host and a guest. Therefore, the collection contains examples of Emirati-English code-switching in both formal and natural conversational contexts. In this paper, we describe the process of data collection and annotation, and describe some of the features and statistics of the resulting data set. In addition, we evaluate the performance of pre-trained Arabic and multi-lingual ASR systems on our dataset, demonstrating the shortcomings of existing models on this low-resource dialectal Arabic, and the additional challenge of recognizing code-switching in ASR. The dataset will be made publicly available for research use.
+ 2024.sigul-1.26
+ al-ali-aldarmaki-2024-mixat
+
+
+ Multi-dialectal ASR of Armenian from Naturalistic and Read Speech
+ MalajyanArthur
+ VictoriaKhurshudyan
+ KarenAvetisyan
+ HossepDolatian
+ DamienNouvel
+ 227–236
+ The paper explores the development of Automatic Speech Recognition (ASR) models for Armenian, by using data from two standard dialects (Eastern Armenian and Western Armenian). The goal is to develop a joint bi-variational model. We achieve state-of-the-art results. Results from our ASR experiments demonstrate the impact of dataset selection and data volume on model performance. The study reveals limited transferability between dialects, although integrating datasets from both dialects enhances overall performance. The paper underscores the importance of dataset diversity and volume in ASR model training for under-resourced languages like Armenian.
+ 2024.sigul-1.27
+ arthur-etal-2024-multi
+
+
+ Multilingual Self-supervised Visually Grounded Speech Models
+ Huynh Phuong ThanhNguyen
+ SakrianiSakti
+ 237–243
+ Developing a multilingual speech-to-speech translation system poses challenges due to the scarcity of paired speech data in various languages, particularly when dealing with unknown and untranscribed languages. However, the shared semantic representation across multiple languages presents an opportunity to build a translation system based on images. Recently, researchers have explored methods for aligning bilingual speech as a novel approach to discovering speech pairs using semantic images from unknown and untranscribed speech. These aligned speech pairs can then be utilized to train speech-to-speech translation systems. Our research builds upon these approaches by expanding into multiple languages and focusing on achieving multimodal multilingual pairs alignment, with a key component being multilingual visually grounded speech models. The objectives of our research are twofold: (1) to create visually grounded speech datasets for English, Japanese, Indonesian, and Vietnamese, and (2) to develop self-supervised visually grounded speech models for these languages. Our experiments have demonstrated the feasibility of this approach, showcasing the ability to retrieve associations between speeches and images. The results indicate that our multilingual visually grounded speech models yield promising outcomes in representing speeches using semantic images across multiple languages.
+ 2024.sigul-1.28
+ nguyen-sakti-2024-multilingual
+
+
+ Nepal Script Text Recognition Using CRNN CTC Architecture
+ SwornimNakarmi
+ SarinSthapit
+ AryaShakya
+ RajaniChulyadyo
+ Bal KrishnaBal
+ 244–251
+ Nepal Script (also known as Prachalit Script) is the widely used script of Nepal Bhasa, the native language of the Kathmandu Valley in Nepal. Derived from the Brahmi Script, the Nepal Script was developed in the 9th century and was extensively used till the 20th century, before being replaced by the Devanagari script. Numerous ancient manuscripts, inscriptions, and documents written in the Nepal Script are still available containing immense knowledge on architecture, arts, astrology, ayurveda, literature, music, tantrism, etc. To preserve and revive Nepal Bhasa, digitizing such documents plays a crucial role. This paper presents our work on text recognition for the Nepal Script. The implementation includes the Nepal Script text recognizer based on CRNN CTC architecture aided by line and word segmentations. Leveraging a carefully curated dataset that encompasses handwritten and printed texts in the Nepal Script, our work has achieved CER of 6.65% and WER of 13.11%. The dataset used for this work is available as Nepal Script Text Dataset on Kaggle. The paper further explores the associated challenges due to the complex nature of the script such as conjuncts, modifiers and variations; and the current state of the script.
+ 2024.sigul-1.29
+ nakarmi-etal-2024-nepal
+
+
+ NLP for Arbëresh: How an Endangered Language Learns to Write in the 21st Century
+ GiulioCusenza
+ ÇağrıÇöltekin
+ 252–256
+ Societies are becoming more and more connected, and minority languages often find themselves helpless against the advent of the digital age, with their speakers having to regularly turn to other languages for written communication. This work introduces the case of Arbëresh, a southern Italian language related to Albanian. It presents the very first machine-readable Arbëresh data, collected through a web campaign, and describes a set of tools developed to enable the Arbëresh people to learn how to write their language, including a spellchecker, a conjugator, a numeral generator, and an interactive platform to learn Arbëresh spelling. A comprehensive web application was set up to make these tools available to the public, as well as to collect further data through them. This method can be replicated to help revive other minority languages in a situation similar to Arbëresh’s. The main challenges of the process were the extremely low-resource setting and the variability of Arbëresh dialects.
+ 2024.sigul-1.30
+ cusenza-coltekin-2024-nlp
+
+
+ PersianEmo: Enhancing Farsi-Dari Emotion Analysis with a Hybrid Transformer and Recurrent Neural Network Model
+ Mohammad AliHussiny
+ Mohammad ArifPayenda
+ LiljaØvrelid
+ 257–263
+ Emotion analysis is a critical research domain within the field of natural language processing (NLP). While substantial progress has been made in this area for the Persian language, there is still a need for more precise models and larger datasets specifically focusing on the Farsi and Dari dialects. In this research, we introduce “LearnArmanEmo” as a new dataset and a superior ensemble approach for Persian text emotion classification. Our proposed model, which combines XLM-RoBERTa-large and BiGRU, undergoes evaluation on LetHerLearn for the Dari dialect, ARMANEMO for the Farsi dialect, and LearnArmanEmo for both Dari and Farsi dialects. The empirical results substantiate the efficacy of our approach with the combined model demonstrating superior performance. Specifically, our model achieves an F1 score of 72.9% on LetHerLearn, an F1 score of 77.1% on ARMANEMO, and an F1 score of 78.8% on the LearnArmanEmo dataset, establishing it as a better ensemble model for these datasets. These findings underscore the potential of this hybrid model as a useful tool for enhancing the performance of emotion analysis in Persian language processing.
+ 2024.sigul-1.31
+ hussiny-etal-2024-persianemo
+
+
+ Philippine Languages Database: A Multilingual Speech Corpora for Developing Systems for Low-Resource Languages
+ Rowena Cristina L.Guevara
+ Rhandley D.Cajote
+ Michael Gringo Angelo R.Bayona
+ Crisron Rudolf G.Lucas
+ 264–271
+ Previous efforts to collect Filipino speech were done in the development of the Filipino-Speech Corpus, TAGCO, and the Filipino-Bisaya speech corpus. These corpora, however, are either domain-specific, non-parallel, non-multilingual or relatively insufficient for the development of state-of-the-art Automatic Speech Recognizers (ASR) and Text-To-Speech Systems (TTS), which usually require hundreds of hours of speech data. This paper presents the Philippine Languages Database (PLD), a multilingual corpus of Philippine languages, namely: Filipino, English, Cebuano, Kapampangan, Hiligaynon, Ilokano, Bikolano, Waray, and Tausug. PLD includes over 454 hours of recordings from speakers of the ten languages, covering multiple domains in news, medical, education, tourism and spontaneous speech. The applicability of the corpus has also been demonstrated in adult and children ASR, phoneme transcriber, voice conversion, and TTS applications.
+ 2024.sigul-1.32
+ guevara-etal-2024-philippine
+
+
+ Prompting towards Alleviating Code-Switched Data Scarcity in Under-Resourced Languages with GPT as a Pivot
+ MichelleTerblanche
+ KayodeOlaleye
+ VukosiMarivate
+ 272–282
+ Many multilingual communities, including numerous in Africa, frequently engage in code-switching during conversations. This behaviour stresses the need for natural language processing technologies adept at processing code-switched text. However, data scarcity, particularly in African languages, poses a significant challenge, as many are low-resourced and under-represented. In this study, we prompted GPT 3.5 to generate Afrikaans–English and Yoruba–English code-switched sentences, enhancing diversity using topic-keyword pairs, linguistic guidelines, and few-shot examples. Our findings indicate that the quality of generated sentences for languages using non-Latin scripts, like Yoruba, is considerably lower when compared with the high Afrikaans–English success rate. There is therefore a notable opportunity to refine prompting guidelines to yield sentences suitable for the fine-tuning of language models. We propose a framework for augmenting the diversity of synthetically generated code-switched data using GPT and propose leveraging this technology to mitigate data scarcity in low-resourced languages, underscoring the essential role of native speakers in this process.
+ 2024.sigul-1.33
+ terblanche-etal-2024-prompting
+
+
+ Quantifying the Ethical Dilemma of Using Culturally Toxic Training Data in AI Tools for Indigenous Languages
+ Pedro HenriqueDomingues
+ Claudio SantosPinhanez
+ PauloCavalin
+ JulioNogima
+ 283–293
+ This paper tries to quantify the ethical dilemma of using culturally toxic training data to improve the performance of AI tools for ultra low-resource languages such as Indigenous languages. Our case study explores the use of Bible data, which is both a commonly available source of training pairs for translators of Indigenous languages and a text which has a trail of physical and cultural violence for many Indigenous communities. In the context of fine-tuning a WMT19 German-to-English model into a Guarani Mbya-to-English translator, we first show, with two commonly-used Machine Translation metrics, that using only Bible data is not enough to create successful translators for everyday sentences gathered from a dictionary. Indeed, even fine-tuning with only 3,000 pairs of data from the dictionary produces significant increases in accuracy compared to Bible-only models. We then show that simultaneously fine-tuning with dictionary and Bible data achieves a substantial increase over the accuracy of a dictionary-only trained translator, and that the same happens when using two-step methods of fine-tuning. However, we also observed some measurable contamination from the Bible text in the outputs of the best translator, raising concerns about its release to an Indigenous community. We end by discussing mechanisms to mitigate the negative impacts of this contamination.
+ 2024.sigul-1.34
+ domingues-etal-2024-quantifying
+
+
+ Residual Dropout: A Simple Approach to Improve Transformer’s Data Efficiency
+ CarlosEscolano
+ FrancescaDe Luca Fornaciari
+ MaiteMelero
+ 294–299
+ Transformer models often demand a vast amount of training data to achieve the desired level of performance. However, this data requirement poses a major challenge for low-resource languages seeking access to high-quality systems, particularly in tasks like Machine Translation. To address this issue, we propose adding Dropout to Transformer’s Residual Connections. Our experimental results demonstrate that this modification effectively mitigates overfitting during training, resulting in substantial performance gains of over 4 BLEU points on a dataset consisting of merely 10 thousand examples.
+ 2024.sigul-1.35
+ escolano-etal-2024-residual
+
+
+ Resource Acquisition for Understudied Languages: Extracting Wordlists from Dictionaries for Computer-assisted Language Comparison
+ FredericBlum
+ JohannesEnglisch
+ AlbaHermida Rodriguez
+ Rikvan Gijn
+ Johann-MattisList
+ 300–306
+ Comparative wordlists play a crucial role in historical language comparison. They are regularly used for the identification of related words and languages, or for the reconstruction of language phylogenies and proto-languages. While automated solutions exist for the majority of methods used for this purpose, no standardized computational or computer-assisted approaches for the compilation of comparative wordlists have been proposed so far. To date, scholars compile wordlists by sifting manually through dictionaries or similar language resources and typing them into spreadsheets. In this study we present a semi-automatic approach to extract wordlists from machine-readable dictionaries. The transparent workflow allows users to build user-defined wordlists for individual languages in a standardized format. By automating the search for translation equivalents in dictionaries, our approach greatly facilitates the aggregation of individual resources into multilingual comparative wordlists that can be used for a variety of purposes.
+ 2024.sigul-1.36
+ blum-etal-2024-resource
+
+
+ Robust Guidance for Unsupervised Data Selection: Capturing Perplexing Named Entities for Domain-Specific Machine Translation
+ SeunghyunJi
+ Hagai RajaSinulingga
+ DarongsaeKwon
+ 307–317
+ Low-resourced data presents a significant challenge for neural machine translation. In most cases, the low-resourced environment is caused by high costs due to the need for domain experts or the lack of language experts. Therefore, identifying the most training-efficient data within an unsupervised setting emerges as a practical strategy. Recent research suggests that such effective data can be identified by selecting ‘appropriately complex data’ based on its volume, providing strong intuition for unsupervised data selection. However, we have discovered that establishing criteria for unsupervised data selection remains a challenge, as the ‘appropriate level of difficulty’ may vary depending on the data domain. We introduce a novel unsupervised data selection method named ‘Capturing Perplexing Named Entities,’ which leverages the maximum inference entropy in translated named entities as a metric for selection. When tested with the ‘Korean-English Parallel Corpus of Specialized Domains,’ our method served as robust guidance for identifying training-efficient data across different domains, in contrast to existing methods.
+ 2024.sigul-1.37
+ ji-etal-2024-robust
+
+
+ Seeding Alignment between Language Technology and Indigenous Methodologies: A Decolonizing Framework for Endangered Language Revitalization
+ Craig JohnCarpenter
+ JohnLyon
+ MilesThorogood
+ Jeannette C.Armstrong
+ 318–324
+ The integration of a speech technology into a digital edition to support the acquisition of a critically endangered Indigenous language is a complex task. More than simply consisting of technical challenges of working with an under-resourced language, researchers face the potential of re-enacting causes of language endangerment without rigorous adherence to qualitative methodologies. Based on reflections throughout the development process of a speech technology, this paper proposes a cross-disciplinary decolonizing framework for researchers working in the field of computational linguistics for Indigenous Language Revitalization (ILR). The authors propose a series of qualitative methodologies to ensure alignment with the language community which the technology is intended to benefit. The proposed relational framework is designed to sustain the integrity of the Four Rs: a series of principles first presented by Verna J. Kirkness and Ray Barnhardt in their 1991 article, “First Nations and Higher Education: The Four R’s - Respect, Relevance, Reciprocity, Responsibility”.
+ 2024.sigul-1.38
+ carpenter-etal-2024-seeding
+
+
+ Solving Failure Modes in the Creation of Trustworthy Language Technologies
+ GiannaLeoni
+ LeeSteven
+ TūreitiKeith
+ KeoniMahelona
+ Peter-LucasJones
+ SuzanneDuncan
+ 325–330
+ To produce high-quality Natural Language Processing (NLP) technologies for low-resource languages, authentic leadership and participation from the low-resource language community is crucial. This reduces chances of bias, surveillance and the inclusion of inaccurate data that can negatively impact output in language technologies. It also ensures that decision-making throughout the pipeline of work centres on the language community rather than only prioritising metrics. The NLP building process involves a range of steps and decisions to ensure the production of successful models and outputs. Rarely does a model perform as expected or desired the first time it is deployed for testing, resulting in the need for re-assessment and re-deployment. This paper discusses the process involved in solving failure modes for a Māori language automatic speech recognition (ASR) model. It explains how the data is curated and how language and data specialists offer unparalleled insight into the debugging process because of their knowledge of the data. This expertise has a significant influence on decision-making to ensure the entire pipeline is embedded in ethical practice and the work is culturally appropriate for the Māori language community thus creating trustworthy language technology.
+ 2024.sigul-1.39
+ leoni-etal-2024-solving
+
+
+ Tandem Long-Short Duration-based Modeling for Automatic Speech Recognition
+ DalaiMengke
+ YanMeng
+ PeterMihajlik
+ 331–336
+ This study outlines our duration-dependent modeling experiments on limited-resource Hungarian speech recognition tasks. As is well known, very short utterances pose significant challenges in automatic speech recognition due to the lack of context and other phenomena. In particular, we found that the exclusion of shorter speech samples from fine-tuning for longer duration test data significantly improves the recognition rate measured on public Hungarian datasets, BEA-Base and CommonVoice (CV). Therefore, we apply a tandem modeling approach in which separate models are used for short and long duration test data. Our strategy improved the ability to recognize short utterances while maintaining recognition of long utterances efficiently, which led to a significant increase in overall recognition accuracy.
+ 2024.sigul-1.40
+ mengke-etal-2024-tandem
+
+
+ TELP – Text Extraction with Linguistic Patterns
+ JoãoCordeiro
+ Purificação MouraSilvano
+ AntónioLeal
+ SebastiãoPais
+ 337–344
+ Linguistic studies in under-resourced languages pose additional challenges at various levels, including the automatic collection of examples, cases, and corpora construction. Several sophisticated applications, such as GATE (Cunningham, 2002), can be configured/adjusted/programmed by experts to automatically collect examples from the Web in any language. However, these applications are too complex and intricate to be operated, requiring, in some cases, skills in computer science. In this work, we present TELP, a tool that allows for the simplified expression of linguistic patterns to extract case studies automatically from World Wide Web sites. It is a straightforward application with an intuitive GUI and a quick learning curve, facilitating its broad use by researchers from different domains. In this paper, we describe the operational and technical aspects of TELP and some relatively recent and relevant use cases in the field of linguistic studies.
+ 2024.sigul-1.41
+ cordeiro-etal-2024-telp
+
+
+ The First Parallel Corpus and Neural Machine Translation Model of Western Armenian and English
+ Ari NubarBoyacıoğlu
+ JanNiehues
+ 345–356
+ Western Armenian is a low-resource language spoken by the Armenian Diaspora residing in various places of the world. Although having content on the internet as well as a relatively rich literary heritage for a minority language, there is no data for the machine translation task and only a very limited amount of labeled data for other NLP tasks. In this work, we build the first machine translation system between Western Armenian and English. We explore different techniques for data collection and evaluate their impact in this very low-resource scenario. Then, we build the machine translation system while focusing on the possibilities of performing knowledge transfer from Eastern Armenian. The system is finetuned with the data collected for the first Western Armenian-English parallel corpus, which contains a total of approximately 147k sentence pairs, whose shareable part of 52k examples was made open-source. The best system through the experiments performs with a BLEU score of 29.8 while translating into English and 17 into Western Armenian.
+ 2024.sigul-1.42
+ boyacioglu-niehues-2024-first
+
+
+ Tracing Linguistic Heritage: Constructing a Somali-Italian Terminological Resource through Explorers’ Notebooks and Contemporary Corpus Analysis
+ SilviaPiccini
+ Giuliana ElizabethVilela Ruiz
+ AndreaBellandi
+ EnricoCarniani
+ 357–362
+ The aim of this contribution is to introduce the initial phases of constructing a Somali-Italian terminological resource that dates back to Italy’s colonial expansion into Africa. Specifically, the terminological data was extracted from the notebooks authored by the Italian explorer Ugo Ferrandi (1852 - 1928) and published by the Società Geografica in 1903 under the title “Lugh. Emporio Commerciale sul Giuba”. In order to develop Ferrandi’s terminological resource, we have employed Semantic Web technologies (RDF, OWL, and SPARQL) and embraced the Linked Open Data paradigm. This ensures the FAIRness of the data and enables the publication and sharing of our terminological resource within an open interconnected Web of Data, thus contributing to addressing the absence of Somali in the Linguistic Linked Data cloud. Whenever feasible, Ferrandi’s lexicon entries have been linked and enriched with information derived from a Somali lexicon included in a contemporary Somali Corpus. This approach allows the synchronic corpus-related Somali lexicon to acquire historical depth, thereby illuminating the linguistic dynamics that have transpired over time and would otherwise have remained obscure.
+ 2024.sigul-1.43
+ piccini-etal-2024-tracing
+
+
+ Uncovering Social Changes of the Basque Speaking Twitter Community During COVID-19 Pandemic
+ JosebaFernandez de Landa
+ IkerGarcía-Ferrero
+ AnderSalaberria
+ Jon AnderCampos
+ 363–371
+ The aim of this work is to study the impact of the COVID-19 pandemic on the Basque speaking Twitter community by applying Natural Language Processing unsupervised techniques. In order to carry out this study, we collected and publicly released the biggest dataset of Basque tweets containing up to 8M tweets from September 2019 to February 2021. To analyze the impact of the pandemic, the variability of the content over time was studied through quantitative and qualitative analysis of words and emojis. For the quantitative analysis, the shift at the frequency of the terms was calculated using linear regression over frequencies. On the other hand, for the qualitative analysis, word embeddings were used to study the changes in the meaning of the most significant words and emojis at different periods of the pandemic. Through this multifaceted approach, we discovered noteworthy alterations in the political inclinations exhibited by Basque users throughout the course of the pandemic.
+ 2024.sigul-1.44
+ fernandez-de-landa-etal-2024-uncovering
+
+
+ UniDive: A COST Action on Universality, Diversity and Idiosyncrasy in Language Technology
+ AgataSavary
+ DanielZeman
+ VerginicaBarbu Mititelu
+ AnabelaBarreiro
+ OleseaCaftanatov
+ Marie-Catherinede Marneffe
+ KajaDobrovoljc
+ GülşenEryiğit
+ VoulaGiouli
+ BrunoGuillaume
+ StellaMarkantonatou
+ NuritMelnik
+ JoakimNivre
+ Atul Kr.Ojha
+ CarlosRamisch
+ AbigailWalsh
+ BeataWójtowicz
+ AlinaWróblewska
+ 372–382
+ This paper presents the objectives, organization and activities of the UniDive COST Action, a scientific network dedicated to universality, diversity and idiosyncrasy in language technology. We describe the objectives and organization of this initiative, the people involved, the working groups and the ongoing tasks and activities. This paper is also an open call for participation addressed to new members and countries.
+ 2024.sigul-1.45
+ savary-etal-2024-unidive
+
+
+ Unsupervised Outlier Detection for Language-Independent Text Quality Filtering
+ JónDaðason
+ HrafnLoftsson
+ 383–393
+ Web-crawled corpora offer an abundant source of training data for language models. However, they are generally noisy and are typically filtered using heuristic rules or classifiers. These methods require careful tuning or labeling by fluent speakers. In this paper, we assess the effectiveness of commonly applied rules on TQ-IS, a manually labeled text quality dataset for Icelandic. Additionally, we advocate for the utilization of unsupervised clustering and outlier detection algorithms for filtering. These algorithms are language-independent, computationally efficient and do not require language expertise. Using grid search, we find the optimal configuration for every combination of rules, optimizing for F1 score on TQ-IS. For a rule-based approach, we discover that optimal results can be achieved with only a small subset of the full ruleset. Using five rules, we obtain an F1 score of 98.2%. We then evaluate three unsupervised algorithms, i.e., Gaussian Mixture Models (GMMs), Isolation Forests and One-Class SVMs. Our findings reveal that unsupervised algorithms perform well on the TQ-IS dataset, with GMMs obtaining the best results, comparable to those obtained with the rule-based approach. Finally, we show that unsupervised methods appear to be equally suitable for languages other than Icelandic, including Estonian and Basque.
+ 2024.sigul-1.46
+ dadason-loftsson-2024-unsupervised
+
+
+ UzABSA: Aspect-Based Sentiment Analysis for the Uzbek Language
+ Sanatbek GayratovichMatlatipov
+ JaloliddinRajabov
+ ElmurodKuriyozov
+ MersaidAripov
+ 394–403
+ The objective of enhancing the availability of natural language processing technologies for low-resource languages has significant importance in facilitating technological accessibility within the populations of speakers of these languages. Our current understanding is that no established open-source linguistic resources are available to develop aspect-based sentiment analysis (ABSA) tools tailored to the Uzbek language. This work aims to address the aforementioned gap by presenting the first high-quality annotated ABSA dataset - UzABSA. The data used in this study was obtained from a compilation of online reviews of Uzbek restaurants. Consequently, the constructed dataset contains 3,500 reviews at the document level and over 6,100 sentences at the sentence level. The popular approach to language resources of this kind explores four distinctive characteristics, namely Aspect Terms, Aspect Term Polarities, Aspect Category Terms, as well as Aspect Category Polarities. To the best of our knowledge, it is the first and the largest ABSA dataset for the Uzbek language. To evaluate the annotation process of our dataset, we used established statistical techniques such as Cohen’s kappa coefficient and Krippendorff’s \alpha to assess agreement between annotators. Subsequently, a classification model, namely K-Nearest Neighbour (KNN), was used to evaluate the performance of the created dataset. Both sets of evaluation techniques demonstrate comparable levels of accuracy. The first findings across the various tasks showed promising outcomes, with accuracy rates ranging from 72% to 88%. This study not only highlights the significance of our acquired dataset but also serves as a valuable tool for scholars interested in furthering sentiment analysis in the Uzbek language.
+ 2024.sigul-1.47
+ matlatipov-etal-2024-uzabsa
+
+
+ ViHealthNLI: A Dataset for Vietnamese Natural Language Inference in Healthcare
+ HuyenNguyen
+ Quyen TheNgo
+ Thanh-HaDo
+ Tuan-AnhHoang
+ 404–409
+ This paper introduces ViHealthNLI, a large dataset for the natural language inference problem for Vietnamese. Unlike the similar Vietnamese datasets, ours is specific to the healthcare domain. We conducted an exploratory analysis to characterize the dataset and evaluated the state-of-the-art methods on the dataset. Our findings indicate that the dataset poses significant challenges while also holding promise for further advanced research and the creation of practical applications.
+ 2024.sigul-1.48
+ nguyen-etal-2024-vihealthnli
+
+
+ Why the Unexpected? Dissecting the Political and Economic Bias in Persian Small and Large Language Models
+ EhsanBarkhordar
+ SurendrabikramThapa
+ AshwaryaMaratha
+ UsmanNaseem
+ 410–420
+ Recently, language models (LMs) like BERT and large language models (LLMs) like GPT-4 have demonstrated potential in various linguistic tasks such as text generation, translation, and sentiment analysis. However, these abilities come with the risk of perpetuating biases from their training data. Political and economic inclinations play a significant role in shaping these biases. Thus, this research aims to understand political and economic biases in Persian LMs and LLMs, addressing a significant gap in AI ethics and fairness research. Focusing on the Persian language, our research employs a two-step methodology. First, we utilize the political compass test adapted to Persian. Second, we analyze biases present in these models. Our findings indicate the presence of nuanced biases, underscoring the importance of ethical considerations in AI deployments within Persian-speaking contexts.
+ 2024.sigul-1.49
+ barkhordar-etal-2024-unexpected
+
+
+ Work in Progress: Text-to-speech on Edge Devices for Te Reo Māori and ‘Ōlelo Hawaiʻi
+ TūreitiKeith
+ 421–426
+ Existing popular text-to-speech technologies focus on large models requiring a large corpus of recorded speech to train. The resulting models are typically run on high-resource servers where users synthesise speech from a client device requiring constant connectivity. For speakers of low-resource languages living in remote areas, this approach does not work. Corpora are typically small and synthesis needs to run on an unconnected, battery- or solar-powered edge device. In this paper, we demonstrate how knowledge transfer and adversarial training can be used to create efficient models capable of running on edge devices using a corpus of only several hours. We apply these concepts to create a voice synthesiser for te reo Māori (the indigenous language of Aotearoa New Zealand) for a non-speaking user and ‘ōlelo Hawaiʻi (the indigenous language of Hawaiʻi) for a legally blind user, thus creating the first high-quality text-to-speech tools for these endangered, central-eastern Polynesian languages capable of running on a low-powered edge device.
+ 2024.sigul-1.50
+ keith-2024-work
+
+
+
diff --git a/data/xml/2024.tacl.xml b/data/xml/2024.tacl.xml
new file mode 100644
index 0000000000..be3b2e48d4
--- /dev/null
+++ b/data/xml/2024.tacl.xml
@@ -0,0 +1,497 @@
+
+
+
+
+ Transactions of the Association for Computational Linguistics, Volume 12
+ MIT Press
+ Cambridge, MA
+ 2024
+ tacl
+ 11
+
+
+ AmbiFC: Fact-Checking Ambiguous Claims with Evidence
+ MaxGlockner
+ IevaStaliūnaitė
+ JamesThorne
+ GiselaVallejo
+ AndreasVlachos
+ IrynaGurevych
+ 10.1162/tacl_a_00629
+ Automated fact-checking systems verify claims against evidence to predict their veracity. In real-world scenarios, the retrieved evidence may not unambiguously support or refute the claim and yield conflicting but valid interpretations. Existing fact-checking datasets assume that the models developed with them predict a single veracity label for each claim, thus discouraging the handling of such ambiguity. To address this issue we present AmbiFC, a fact-checking dataset with 10k claims derived from real-world information needs. It contains fine-grained evidence annotations of 50k passages from 5k Wikipedia pages. We analyze the disagreements arising from ambiguity when comparing claims against evidence in AmbiFC, observing a strong correlation of annotator disagreement with linguistic phenomena such as underspecification and probabilistic reasoning. We develop models for predicting veracity handling this ambiguity via soft labels, and find that a pipeline that learns the label distribution for sentence-level evidence selection and veracity prediction yields the best performance. We compare models trained on different subsets of AmbiFC and show that models trained on the ambiguous instances perform better when faced with the identified linguistic phenomena.
+ 1–18
+ 2024.tacl-1.1
+ glockner-etal-2024-ambifc
+
+
+ Language Varieties of Italy: Technology Challenges and Opportunities
+ AlanRamponi
+ 10.1162/tacl_a_00631
+ Italy is characterized by a one-of-a-kind linguistic diversity landscape in Europe, which implicitly encodes local knowledge, cultural traditions, artistic expressions, and history of its speakers. However, most local languages and dialects in Italy are at risk of disappearing within a few generations. The NLP community has recently begun to engage with endangered languages, including those of Italy. Yet, most efforts assume that these varieties are under-resourced language monoliths with an established written form and homogeneous functions and needs, and thus highly interchangeable with each other and with high-resource, standardized languages. In this paper, we introduce the linguistic context of Italy and challenge the default machine-centric assumptions of NLP for Italy’s language varieties. We advocate for a shift in the paradigm from machine-centric to speaker-centric NLP, and provide recommendations and opportunities for work that prioritizes languages and their speakers over technological advances. To facilitate the process, we finally propose building a local community towards responsible, participatory efforts aimed at supporting vitality of languages and dialects of Italy.
+ 19–38
+ 2024.tacl-1.2
+ ramponi-2024-language
+
+
+ Benchmarking Large Language Models for News Summarization
+ TianyiZhang
+ FaisalLadhak
+ EsinDurmus
+ PercyLiang
+ KathleenMcKeown
+ Tatsunori B.Hashimoto
+ 10.1162/tacl_a_00632
+ Large language models (LLMs) have shown promise for automatic summarization but the reasons behind their successes are poorly understood. By conducting a human evaluation on ten LLMs across different pretraining methods, prompts, and model scales, we make two important observations. First, we find instruction tuning, not model size, is the key to the LLM’s zero-shot summarization capability. Second, existing studies have been limited by low-quality references, leading to underestimates of human performance and lower few-shot and finetuning performance. To better evaluate LLMs, we perform human evaluation over high-quality summaries we collect from freelance writers. Despite major stylistic differences such as the amount of paraphrasing, we find that LLM summaries are judged to be on par with human written summaries.
+ 39–57
+ 2024.tacl-1.3
+ zhang-etal-2024-benchmarking
+
+
+ mGPT: Few-Shot Learners Go Multilingual
+ OlehShliazhko
+ AlenaFenogenova
+ MariaTikhonova
+ AnastasiaKozlova
+ VladislavMikhailov
+ TatianaShavrina
+ 10.1162/tacl_a_00633
+ This paper introduces mGPT, a multilingual variant of GPT-3, pretrained on 61 languages from 25 linguistically diverse language families using Wikipedia and the C4 Corpus. We detail the design and pretraining procedure. The models undergo an intrinsic and extrinsic evaluation: language modeling in all languages, downstream evaluation on cross-lingual NLU datasets and benchmarks in 33 languages, and world knowledge probing in 23 languages. The in-context learning abilities are on par with the contemporaneous language models while covering a larger number of languages, including underrepresented and low-resource languages of the Commonwealth of Independent States and the indigenous peoples in Russia. The source code and the language models are publicly available under the MIT license.
+ 58–79
+ 2024.tacl-1.4
+ shliazhko-etal-2024-mgpt
+
+
+ Cultural Adaptation of Recipes
+ YongCao
+ YovaKementchedjhieva
+ RuixiangCui
+ AntoniaKaramolegkou
+ LiZhou
+ MeganDare
+ LuciaDonatelli
+ DanielHershcovich
+ 10.1162/tacl_a_00634
+ Building upon the considerable advances in Large Language Models (LLMs), we are now equipped to address more sophisticated tasks demanding a nuanced understanding of cross-cultural contexts. A key example is recipe adaptation, which goes beyond simple translation to include a grasp of ingredients, culinary techniques, and dietary preferences specific to a given culture. We introduce a new task involving the translation and cultural adaptation of recipes between Chinese- and English-speaking cuisines. To support this investigation, we present CulturalRecipes, a unique dataset composed of automatically paired recipes written in Mandarin Chinese and English. This dataset is further enriched with a human-written and curated test set. In this intricate task of cross-cultural recipe adaptation, we evaluate the performance of various methods, including GPT-4 and other LLMs, traditional machine translation, and information retrieval techniques. Our comprehensive analysis includes both automatic and human evaluation metrics. While GPT-4 exhibits impressive abilities in adapting Chinese recipes into English, it still lags behind human expertise when translating English recipes into Chinese. This underscores the multifaceted nature of cultural adaptations. We anticipate that these insights will significantly contribute to future research on culturally aware language models and their practical application in culturally diverse contexts.
+ 80–99
+ 2024.tacl-1.5
+ cao-etal-2024-cultural
+
+
+ Metric-Free Learning Network with Dual Relations Propagation for Few-Shot Aspect Category Sentiment Analysis
+ ShimanZhao
+ YutaoXie
+ WeiChen
+ TengjiaoWang
+ JiahuiYao
+ JiabinZheng
+ 10.1162/tacl_a_00635
+ Few-shot Aspect Category Sentiment Analysis (ACSA) is a crucial task for aspect-based sentiment analysis, which aims to detect sentiment polarity for a given aspect category in a sentence with limited data. However, few-shot learning methods focus on distance metrics between the query and support sets to classify queries, heavily relying on aspect distributions in the embedding space. Thus, they suffer from overlapping distributions of aspect embeddings caused by irrelevant sentiment noise among sentences with multiple sentiment aspects, leading to misclassifications. To solve the above issues, we propose a metric-free method for few-shot ACSA, which models the associated relations among the aspects of support and query sentences by Dual Relations Propagation (DRP), addressing the passive effect of overlapping distributions. Specifically, DRP uses the dual relations (similarity and diversity) among the aspects of support and query sentences to explore intra-cluster commonality and inter-cluster uniqueness for alleviating sentiment noise and enhancing aspect features. Additionally, the dual relations are transformed from support-query to class-query to promote query inference by learning class knowledge. Experiments show that we achieve convincing performance on few-shot ACSA, especially an average improvement of 2.93% accuracy and 2.10% F1 score in the 3-way 1-shot setting.
+ 100–119
+ 2024.tacl-1.6
+ zhao-etal-2024-metric
+
+
+ Addressing the Binning Problem in Calibration Assessment through Scalar Annotations
+ ZhengpingJiang
+ AnqiLiu
+ Benjamin VanDurme
+ 10.1162/tacl_a_00636
+ Computational linguistics models commonly target the prediction of discrete—categorical—labels. When assessing how well-calibrated these model predictions are, popular evaluation schemes require practitioners to manually determine a binning scheme: grouping labels into bins to approximate true label posterior. The problem is that these metrics are sensitive to binning decisions. We consider two solutions to the binning problem that apply at the stage of data annotation: collecting either distributed (redundant) labels or direct scalar value assignment. In this paper, we show that although both approaches address the binning problem by evaluating instance-level calibration, direct scalar assignment is significantly more cost-effective. We provide theoretical analysis and empirical evidence to support our proposal for dataset creators to adopt scalar annotation protocols to enable a higher-quality assessment of model calibration.
+ 120–136
+ 2024.tacl-1.7
+ jiang-etal-2024-addressing
+
+
+ An Energy-based Model for Word-level AutoCompletion in Computer-aided Translation
+ ChengYang
+ GuopingHuang
+ MoYu
+ ZhiruiZhang
+ SihengLi
+ MingmingYang
+ ShumingShi
+ YujiuYang
+ LemaoLiu
+ 10.1162/tacl_a_00637
+ Word-level AutoCompletion (WLAC) is a rewarding yet challenging task in Computer-aided Translation. Existing work addresses this task through a classification model based on a neural network that maps the hidden vector of the input context into its corresponding label (i.e., the candidate target word is treated as a label). Since the context hidden vector itself does not take the label into account and it is projected to the label through a linear classifier, the model cannot sufficiently leverage valuable information from the source sentence as verified in our experiments, which eventually hinders its overall performance. To alleviate this issue, this work proposes an energy-based model for WLAC, which enables the context hidden vector to capture crucial information from the source sentence. Unfortunately, training and inference suffer from efficiency and effectiveness challenges, therefore we employ three simple yet effective strategies to put our model into practice. Experiments on four standard benchmarks demonstrate that our reranking-based approach achieves substantial improvements (about 6.07%) over the previous state-of-the-art model. Further analyses show that each strategy of our approach contributes to the final performance.
+ 137–156
+ 2024.tacl-1.8
+ yang-etal-2024-energy
+
+
+ Lost in the Middle: How Language Models Use Long Contexts
+ Nelson F.Liu
+ KevinLin
+ JohnHewitt
+ AshwinParanjape
+ MicheleBevilacqua
+ FabioPetroni
+ PercyLiang
+ 10.1162/tacl_a_00638
+ While recent language models have the ability to take long contexts as input, relatively little is known about how well they use longer context. We analyze the performance of language models on two tasks that require identifying relevant information in their input contexts: multi-document question answering and key-value retrieval. We find that performance can degrade significantly when changing the position of relevant information, indicating that current language models do not robustly make use of information in long input contexts. In particular, we observe that performance is often highest when relevant information occurs at the beginning or end of the input context, and significantly degrades when models must access relevant information in the middle of long contexts, even for explicitly long-context models. Our analysis provides a better understanding of how language models use their input context and provides new evaluation protocols for future long-context language models.
+ 157–173
+ 2024.tacl-1.9
+ liu-etal-2024-lost
+
+
+ Red Teaming Language Model Detectors with Language Models
+ ZhouxingShi
+ YihanWang
+ FanYin
+ XiangningChen
+ Kai-WeiChang
+ Cho-JuiHsieh
+ 10.1162/tacl_a_00639
+ The prevalence and strong capability of large language models (LLMs) present significant safety and ethical risks if exploited by malicious users. To prevent the potentially deceptive usage of LLMs, recent work has proposed algorithms to detect LLM-generated text and protect LLMs. In this paper, we investigate the robustness and reliability of these LLM detectors under adversarial attacks. We study two types of attack strategies: 1) replacing certain words in an LLM’s output with their synonyms given the context; 2) automatically searching for an instructional prompt to alter the writing style of the generation. In both strategies, we leverage an auxiliary LLM to generate the word replacements or the instructional prompt. Different from previous works, we consider a challenging setting where the auxiliary LLM can also be protected by a detector. Experiments reveal that our attacks effectively compromise the performance of all detectors in the study with plausible generations, underscoring the urgent need to improve the robustness of LLM-generated text detection systems. Code is available at https://github.com/shizhouxing/LLM-Detector-Robustness.
+ 174–189
+ 2024.tacl-1.10
+ shi-etal-2024-red
+
+
+ Text Attribute Control via Closed-Loop Disentanglement
+ LeiSha
+ ThomasLukasiewicz
+ 10.1162/tacl_a_00640
+ Changing an attribute of a text without changing the content usually requires first disentangling the text into irrelevant attributes and content representations. After that, in the inference phase, the representation of one attribute is tuned to a different value, expecting that the corresponding attribute of the text can also be changed accordingly. The usual way of disentanglement is to add some constraints on the latent space of an encoder-decoder architecture, including adversarial-based constraints and mutual-information-based constraints. However, previous semi-supervised processes of attribute change are usually not enough to guarantee the success of attribute change and content preservation. In this paper, we propose a novel approach to achieve a robust control of attributes while enhancing content preservation. In this approach, we use a semi-supervised contrastive learning method to encourage the disentanglement of attributes in latent spaces. Differently from previous works, we re-disentangle the reconstructed sentence and compare the re-disentangled latent space with the original latent space, which makes a closed-loop disentanglement process. This also helps content preservation. In addition, the contrastive learning method is also able to replace the role of minimizing mutual information and adversarial training in the disentanglement process, which alleviates the computation cost. We conducted experiments on three text datasets, including the Yelp Service review dataset, the Amazon Product review dataset, and the GoEmotions dataset. The experimental results show the effectiveness of our model.
+ 190–209
+ 2024.tacl-1.11
+ sha-lukasiewicz-2024-text
+
+
+ Unifying Structured Data as Graph for Data-to-Text Pre-Training
+ ShujieLi
+ LiangLi
+ RuiyingGeng
+ MinYang
+ BinhuaLi
+ GuanghuYuan
+ WanweiHe
+ ShaoYuan
+ CanMa
+ FeiHuang
+ YongbinLi
+ 10.1162/tacl_a_00641
+ Data-to-text (D2T) generation aims to transform structured data into natural language text. Data-to-text pre-training has proved to be powerful in enhancing D2T generation and yields impressive performance. However, previous pre-training methods either oversimplified structured data into a sequence without considering input structures or designed training objectives tailored for a specific data structure (e.g., table or knowledge graph). In this paper, we unify different types of structured data (i.e., table, key-value data, knowledge graph) into the graph format and cast different D2T generation tasks as graph-to-text generation. To effectively exploit the structural information of the input graph, we propose a structure-enhanced pre-training method for D2T generation by designing a structure-enhanced Transformer. Concretely, we devise a position matrix for the Transformer, encoding relative positional information of connected nodes in the input graph. In addition, we propose a new attention matrix to incorporate graph structures into the original Transformer by taking the available explicit connectivity structure into account. Extensive experiments on six benchmark datasets show the effectiveness of our model. Our source codes are available at https://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/unid2t.
+ 210–228
+ 2024.tacl-1.12
+ li-etal-2024-unifying
+
+
+ Exploring Human-Like Translation Strategy with Large Language Models
+ ZhiweiHe
+ TianLiang
+ WenxiangJiao
+ ZhuoshengZhang
+ YujiuYang
+ RuiWang
+ ZhaopengTu
+ ShumingShi
+ XingWang
+ 10.1162/tacl_a_00642
+ Large language models (LLMs) have demonstrated impressive capabilities in general scenarios, exhibiting a level of aptitude that approaches, in some aspects even surpasses, human-level intelligence. Among their numerous skills, the translation abilities of LLMs have received considerable attention. Compared to typical machine translation that focuses solely on source-to-target mapping, LLM-based translation can potentially mimic the human translation process, which might take preparatory steps to ensure high-quality translation. This work explores this possibility by proposing the MAPS framework, which stands for Multi-Aspect Prompting and Selection. Specifically, we enable LLMs first to analyze the given source sentence and induce three aspects of translation-related knowledge (keywords, topics, and relevant demonstrations) to guide the final translation process. Moreover, we employ a selection mechanism based on quality estimation to filter out noisy and unhelpful knowledge. Both automatic (3 LLMs × 11 directions × 2 automatic metrics) and human evaluation (preference study and MQM) demonstrate the effectiveness of MAPS. Further analysis shows that by mimicking the human translation process, MAPS reduces various translation errors such as hallucination, ambiguity, mistranslation, awkward style, untranslated text, and omission. Source code is available at https://github.com/zwhe99/MAPS-mt.
+ 229–246
+ 2024.tacl-1.13
+ he-etal-2024-exploring
+
+
+ Retrieve What You Need: A Mutual Learning Framework for Open-domain Question Answering
+ DingminWang
+ QiuyuanHuang
+ MatthewJackson
+ JianfengGao
+ 10.1162/tacl_a_00646
+ An open-domain question answering (QA) system usually follows a retrieve-then-read paradigm, in which a retriever is used to retrieve relevant passages from a large corpus, and then a reader generates answers based on the retrieved passages and the original question. In this paper, we propose a simple and novel mutual learning framework to improve the performance of retrieve-then-read-style models via an intermediate module named the knowledge selector, which we train with reinforcement learning. The key benefits of our proposed intermediate module are: 1) no requirement for additional annotated question-passage pairs; 2) improvements in both retrieval and QA performance, as well as computational efficiency, compared to prior competitive retrieve-then-read models; 3) with no finetuning, improvement in the zero-shot performance of large-scale pre-trained language models, e.g., ChatGPT, by encapsulating the input with relevant knowledge without violating the input length constraint.
+ 247–263
+ 2024.tacl-1.14
+ wang-etal-2024-retrieve
+
+
+ Explicitly Representing Syntax Improves Sentence-to-Layout Prediction of Unexpected Situations
+ WolfNuyts
+ RubenCartuyvels
+ Marie-FrancineMoens
+ 10.1162/tacl_a_00643
+ Recognizing visual entities in a natural language sentence and arranging them in a 2D spatial layout require a compositional understanding of language and space. This task of layout prediction is valuable in text-to-image synthesis as it allows localized and controlled in-painting of the image. In this comparative study it is shown that we can predict layouts from language representations that implicitly or explicitly encode sentence syntax, if the sentences mention similar entity-relationships to the ones seen during training. To test compositional understanding, we collect a test set of grammatically correct sentences and layouts describing compositions of entities and relations that unlikely have been seen during training. Performance on this test set substantially drops, showing that current models rely on correlations in the training data and have difficulties in understanding the structure of the input sentences. We propose a novel structural loss function that better enforces the syntactic structure of the input sentence and show large performance gains in the task of 2D spatial layout prediction conditioned on text. The loss has the potential to be used in other generation tasks where a tree-like structure underlies the conditioning modality. Code, trained models, and the USCOCO evaluation set are available via Github.
+ 264–282
+ 2024.tacl-1.15
+ nuyts-etal-2024-explicitly
+
+
+ Evaluating the Ripple Effects of Knowledge Editing in Language Models
+ RoiCohen
+ EdenBiran
+ OriYoran
+ AmirGloberson
+ MorGeva
+ 10.1162/tacl_a_00644
+ Modern language models capture a large body of factual knowledge. However, some facts can be incorrectly induced or become obsolete over time, resulting in factually incorrect generations. This has led to the development of various editing methods that allow updating facts encoded by the model. Evaluation of these methods has primarily focused on testing whether an individual fact has been successfully injected, and if similar predictions for other subjects have not changed. Here we argue that such evaluation is limited, since injecting one fact (e.g., “Jack Depp is the son of Johnny Depp”) introduces a “ripple effect” in the form of additional facts that the model needs to update (e.g., “Jack Depp is the sibling of Lily-Rose Depp”). To address this, we propose novel evaluation criteria that consider the implications of an edit on related facts. Using these criteria, we then construct RippleEdits, a diagnostic benchmark of 5K factual edits, capturing various types of ripple effects. We evaluate prominent editing methods on RippleEdits, showing that they fail to introduce consistent changes in the model’s knowledge. In addition, we find that a simple in-context editing baseline obtains the best scores on our benchmark, suggesting a promising research direction for model editing.
+ 283–298
+ 2024.tacl-1.16
+ cohen-etal-2024-evaluating
+
+
+ The Impact of Word Splitting on the Semantic Content of Contextualized Word Representations
+ Aina GaríSoler
+ MatthieuLabeau
+ ChloéClavel
+ 10.1162/tacl_a_00647
+ When deriving contextualized word representations from language models, a decision needs to be made on how to obtain one for out-of-vocabulary (OOV) words that are segmented into subwords. What is the best way to represent these words with a single vector, and are these representations of worse quality than those of in-vocabulary words? We carry out an intrinsic evaluation of embeddings from different models on semantic similarity tasks involving OOV words. Our analysis reveals, among other interesting findings, that the quality of representations of words that are split is often, but not always, worse than that of the embeddings of known words. Their similarity values, however, must be interpreted with caution.
+ 299–320
+ 2024.tacl-1.17
+ soler-etal-2024-impact
+
+
+ Large Language Models Enable Few-Shot Clustering
+ VijayViswanathan
+ KirilGashteovski
+ CarolinLawrence
+ TongshuangWu
+ GrahamNeubig
+ 10.1162/tacl_a_00648
+ Unlike traditional unsupervised clustering, semi-supervised clustering allows users to provide meaningful structure to the data, which helps the clustering algorithm to match the user’s intent. Existing approaches to semi-supervised clustering require a significant amount of feedback from an expert to improve the clusters. In this paper, we ask whether a large language model (LLM) can amplify an expert’s guidance to enable query-efficient, few-shot semi-supervised text clustering. We show that LLMs are surprisingly effective at improving clustering. We explore three stages where LLMs can be incorporated into clustering: before clustering (improving input features), during clustering (by providing constraints to the clusterer), and after clustering (using LLMs post-correction). We find that incorporating LLMs in the first two stages routinely provides significant improvements in cluster quality, and that LLMs enable a user to make trade-offs between cost and accuracy to produce desired clusters. We release our code and LLM prompts for the public to use.
+ 321–333
+ 2024.tacl-1.18
+ viswanathan-etal-2024-large
+
+
+ JustiLM: Few-shot Justification Generation for Explainable Fact-Checking of Real-world Claims
+ FengzhuZeng
+ WeiGao
+ 10.1162/tacl_a_00649
+ Justification is an explanation that supports the veracity assigned to a claim in fact-checking. However, the task of justification generation has been previously oversimplified as summarization of a fact-check article authored by fact-checkers. Therefore, we propose a realistic approach to generate justification based on retrieved evidence. We present a new benchmark dataset called ExClaim (for Explainable fact-checking of real-world Claims), and introduce JustiLM, a novel few-shot Justification generation based on retrieval-augmented Language Model by using fact-check articles as an auxiliary resource during training only. Experiments show that JustiLM achieves promising performance in justification generation compared to strong baselines, and can also enhance veracity classification with a straightforward extension. Code and dataset are released at https://github.com/znhy1024/JustiLM.
+ 334–354
+ 2024.tacl-1.19
+ zeng-gao-2024-justilm
+
+
+ To Diverge or Not to Diverge: A Morphosyntactic Perspective on Machine Translation vs Human Translation
+ JiamingLuo
+ ColinCherry
+ GeorgeFoster
+ 10.1162/tacl_a_00645
+ We conduct a large-scale fine-grained comparative analysis of machine translations (MTs) against human translations (HTs) through the lens of morphosyntactic divergence. Across three language pairs and two types of divergence defined as the structural difference between the source and the target, MT is consistently more conservative than HT, with less morphosyntactic diversity, more convergent patterns, and more one-to-one alignments. Through analysis on different decoding algorithms, we attribute this discrepancy to the use of beam search that biases MT towards more convergent patterns. This bias is most amplified when the convergent pattern appears around 50% of the time in training data. Lastly, we show that for a majority of morphosyntactic divergences, their presence in HT is correlated with decreased MT performance, presenting a greater challenge for MT systems.
+ 355–371
+ 2024.tacl-1.20
+ luo-etal-2024-diverge
+
+
+ What Do Self-Supervised Speech Models Know About Words?
+ AnkitaPasad
+ Chung-MingChien
+ ShaneSettle
+ KarenLivescu
+ 10.1162/tacl_a_00656
+ Many self-supervised speech models (S3Ms) have been introduced over the last few years, improving performance and data efficiency on various speech tasks. However, these empirical successes alone do not give a complete picture of what is learned during pre-training. Recent work has begun analyzing how S3Ms encode certain properties, such as phonetic and speaker information, but we still lack a proper understanding of knowledge encoded at the word level and beyond. In this work, we use lightweight analysis methods to study segment-level linguistic properties—word identity, boundaries, pronunciation, syntactic features, and semantic features—encoded in S3Ms. We present a comparative study of layer-wise representations from ten S3Ms and find that (i) the frame-level representations within each word segment are not all equally informative, and (ii) the pre-training objective and model size heavily influence the accessibility and distribution of linguistic information across layers. We also find that on several tasks—word discrimination, word segmentation, and semantic sentence similarity—S3Ms trained with visual grounding outperform their speech-only counterparts. Finally, our task-based analyses demonstrate improved performance on word segmentation and acoustic word discrimination while using simpler methods than prior work.
+ 372–391
+ 2024.tacl-1.21
+ pasad-etal-2024-self
+
+
+ Are Character-level Translations Worth the Wait? Comparing ByT5 and mT5 for Machine Translation
+ LukasEdman
+ GabrieleSarti
+ AntonioToral
+ Gertjan vanNoord
+ AriannaBisazza
+ 10.1162/tacl_a_00651
+ Pretrained character-level and byte-level language models have been shown to be competitive with popular subword models across a range of Natural Language Processing tasks. However, there has been little research on their effectiveness for neural machine translation (NMT), particularly within the popular pretrain-then-finetune paradigm. This work performs an extensive comparison across multiple languages and experimental conditions of character- and subword-level pretrained models (ByT5 and mT5, respectively) on NMT. We show the effectiveness of character-level modeling in translation, particularly in cases where fine-tuning data is limited. In our analysis, we show how character models’ gains in translation quality are reflected in better translations of orthographically similar words and rare words. While evaluating the importance of source texts in driving model predictions, we highlight word-level patterns within ByT5, suggesting an ability to modulate word-level and character-level information during generation. We conclude by assessing the efficiency tradeoff of byte models, suggesting their usage in non-time-critical scenarios to boost translation quality.
+ 392–410
+ 2024.tacl-1.22
+ edman-etal-2024-character
+
+
+ Geographic Adaptation of Pretrained Language Models
+ ValentinHofmann
+ GoranGlavaš
+ NikolaLjubešić
+ Janet B.Pierrehumbert
+ HinrichSchütze
+ 10.1162/tacl_a_00652
+ While pretrained language models (PLMs) have been shown to possess a plethora of linguistic knowledge, the existing body of research has largely neglected extralinguistic knowledge, which is generally difficult to obtain by pretraining on text alone. Here, we contribute to closing this gap by examining geolinguistic knowledge, i.e., knowledge about geographic variation in language. We introduce geoadaptation, an intermediate training step that couples language modeling with geolocation prediction in a multi-task learning setup. We geoadapt four PLMs, covering language groups from three geographic areas, and evaluate them on five different tasks: fine-tuned (i.e., supervised) geolocation prediction, zero-shot (i.e., unsupervised) geolocation prediction, fine-tuned language identification, zero-shot language identification, and zero-shot prediction of dialect features. Geoadaptation is very successful at injecting geolinguistic knowledge into the PLMs: The geoadapted PLMs consistently outperform PLMs adapted using only language modeling (by especially wide margins on zero-shot prediction tasks), and we obtain new state-of-the-art results on two benchmarks for geolocation prediction and language identification. Furthermore, we show that the effectiveness of geoadaptation stems from its ability to geographically retrofit the representation space of the PLMs.
+ 411–431
+ 2024.tacl-1.23
+ hofmann-etal-2024-geographic
+
+
+ Do Text Simplification Systems Preserve Meaning? A Human Evaluation via Reading Comprehension
+ SwetaAgrawal
+ MarineCarpuat
+ 10.1162/tacl_a_00653
+ Automatic text simplification (TS) aims to automate the process of rewriting text to make it easier for people to read. A pre-requisite for TS to be useful is that it should convey information that is consistent with the meaning of the original text. However, current TS evaluation protocols assess system outputs for simplicity and meaning preservation without regard for the document context in which output sentences occur and for how people understand them. In this work, we introduce a human evaluation framework to assess whether simplified texts preserve meaning using reading comprehension questions. With this framework, we conduct a thorough human evaluation of texts by humans and by nine automatic systems. Supervised systems that leverage pre-training knowledge achieve the highest scores on the reading comprehension tasks among the automatic controllable TS systems. However, even the best-performing supervised system struggles with at least 14% of the questions, marking them as “unanswerable” based on simplified content. We further investigate how existing TS evaluation metrics and automatic question-answering systems approximate the human judgments we obtained.
+ 432–448
+ 2024.tacl-1.24
+ agrawal-carpuat-2024-text
+
+
+ Simultaneous Selection and Adaptation of Source Data via Four-Level Optimization
+ PengtaoXie
+ XingchenZhao
+ XuehaiHe
+ 10.1162/tacl_a_00658
+ In many NLP applications, to mitigate data deficiency in a target task, source data is collected to help with target model training. Existing transfer learning methods either select a subset of source examples that are close to the target domain or try to adapt all source examples into the target domain, then use selected or adapted source examples to train the target model. These methods either incur significant information loss or bear the risk that after adaptation, source examples which are originally already in the target domain may be outside the target domain. To address the limitations of these methods, we propose a four-level optimization based framework which simultaneously selects and adapts source data. Our method can automatically identify in-domain and out-of-domain source examples and apply example-specific processing methods: selection for in-domain examples and adaptation for out-of-domain examples. Experiments on various datasets demonstrate the effectiveness of our proposed method.
+ 449–466
+ 2024.tacl-1.25
+ xie-etal-2024-simultaneous
+
+
+ ConvoSense: Overcoming Monotonous Commonsense Inferences for Conversational AI
+ Sarah E.Finch
+ Jinho D.Choi
+ 10.1162/tacl_a_00659
+ Mastering commonsense understanding and reasoning is a pivotal skill essential for conducting engaging conversations. While there have been several attempts to create datasets that facilitate commonsense inferences in dialogue contexts, existing datasets tend to lack in-depth details, restate information already present in the conversation, and often fail to capture the multifaceted nature of commonsense reasoning. In response to these limitations, we compile a new synthetic dataset for commonsense reasoning in dialogue contexts using GPT, ℂonvoSense, that boasts greater contextual novelty, offers a higher volume of inferences per example, and substantially enriches the detail conveyed by the inferences. Our dataset contains over 500,000 inferences across 12,000 dialogues with 10 popular inference types, which empowers the training of generative commonsense models for dialogue that are superior in producing plausible inferences with high novelty when compared to models trained on the previous datasets. To the best of our knowledge, ℂonvoSense is the first of its kind to provide such a multitude of novel inferences at such a large scale.
+ 467–483
+ 2024.tacl-1.26
+ finch-choi-2024-convosense
+
+
+ Automatically Correcting Large Language Models: Surveying the Landscape of Diverse Automated Correction Strategies
+ LiangmingPan
+ MichaelSaxon
+ WendaXu
+ DeepakNathani
+ XinyiWang
+ William YangWang
+ 10.1162/tacl_a_00660
+ While large language models (LLMs) have shown remarkable effectiveness in various NLP tasks, they are still prone to issues such as hallucination, unfaithful reasoning, and toxicity. A promising approach to rectify these flaws is correcting LLMs with feedback, where the LLM itself is prompted or guided with feedback to fix problems in its own output. Techniques leveraging automated feedback—either produced by the LLM itself (self-correction) or some external system—are of particular interest as they make LLM-based solutions more practical and deployable with minimal human intervention. This paper provides an exhaustive review of the recent advances in correcting LLMs with automated feedback, categorizing them into training-time, generation-time, and post-hoc approaches. We also identify potential challenges and future directions in this emerging field.
+ 484–506
+ 2024.tacl-1.27
+ pan-etal-2024-automatically
+
+
+ KoBBQ: Korean Bias Benchmark for Question Answering
+ JihoJin
+ JiseonKim
+ NayeonLee
+ HaneulYoo
+ AliceOh
+ HwaranLee
+ 10.1162/tacl_a_00661
+ Warning: This paper contains examples of stereotypes and biases. The Bias Benchmark for Question Answering (BBQ) is designed to evaluate social biases of language models (LMs), but it is not simple to adapt this benchmark to cultural contexts other than the US because social biases depend heavily on the cultural context. In this paper, we present KoBBQ, a Korean bias benchmark dataset, and we propose a general framework that addresses considerations for cultural adaptation of a dataset. Our framework includes partitioning the BBQ dataset into three classes—Simply-Transferred (can be used directly after cultural translation), Target-Modified (requires localization in target groups), and Sample-Removed (does not fit Korean culture)—and adding four new categories of bias specific to Korean culture. We conduct a large-scale survey to collect and validate the social biases and the targets of the biases that reflect the stereotypes in Korean culture. The resulting KoBBQ dataset comprises 268 templates and 76,048 samples across 12 categories of social bias. We use KoBBQ to measure the accuracy and bias scores of several state-of-the-art multilingual LMs. The results clearly show differences in the bias of LMs as measured by KoBBQ and a machine-translated version of BBQ, demonstrating the need for and utility of a well-constructed, culturally aware social bias benchmark.
+ 507–524
+ 2024.tacl-1.28
+ jin-etal-2024-kobbq
+
+
+ AutoPEFT: Automatic Configuration Search for Parameter-Efficient Fine-Tuning
+ HanZhou
+ XingchenWan
+ IvanVulić
+ AnnaKorhonen
+ 10.1162/tacl_a_00662
+ Large pretrained language models are widely used in downstream NLP tasks via task-specific fine-tuning, but such procedures can be costly. Recently, Parameter-Efficient Fine-Tuning (PEFT) methods have achieved strong task performance while updating much fewer parameters than full model fine-tuning (FFT). However, it is non-trivial to make informed design choices on the PEFT configurations, such as their architecture, the number of tunable parameters, and even the layers in which the PEFT modules are inserted. Consequently, it is highly likely that the current, manually designed configurations are suboptimal in terms of their performance-efficiency trade-off. Inspired by advances in neural architecture search, we propose AutoPEFT for automatic PEFT configuration selection: We first design an expressive configuration search space with multiple representative PEFT modules as building blocks. Using multi-objective Bayesian optimization in a low-cost setup, we then discover a Pareto-optimal set of configurations with strong performance-cost trade-offs across different numbers of parameters that are also highly transferable across different tasks. Empirically, on GLUE and SuperGLUE tasks, we show that AutoPEFT-discovered configurations significantly outperform existing PEFT methods and are on par or better than FFT without incurring substantial training efficiency costs.
+ 525–542
+ 2024.tacl-1.29
+ zhou-etal-2024-autopeft
+
+
+ What Formal Languages Can Transformers Express? A Survey
+ LenaStrobl
+ WilliamMerrill
+ GailWeiss
+ DavidChiang
+ DanaAngluin
+ 10.1162/tacl_a_00663
+ As transformers have gained prominence in natural language processing, some researchers have investigated theoretically what problems they can and cannot solve, by treating problems as formal languages. Exploring such questions can help clarify the power of transformers relative to other models of computation, their fundamental capabilities and limits, and the impact of architectural choices. Work in this subarea has made considerable progress in recent years. Here, we undertake a comprehensive survey of this work, documenting the diverse assumptions that underlie different results and providing a unified framework for harmonizing seemingly contradictory findings.
+ 543–561
+ 2024.tacl-1.30
+ strobl-etal-2024-formal
+
+
+ Text-to-OverpassQL: A Natural Language Interface for Complex Geodata Querying of OpenStreetMap
+ MichaelStaniek
+ RaphaelSchumann
+ MaikeZüfle
+ StefanRiezler
+ 10.1162/tacl_a_00654
+ We present Text-to-OverpassQL, a task designed to facilitate a natural language interface for querying geodata from OpenStreetMap (OSM). The Overpass Query Language (OverpassQL) allows users to formulate complex database queries and is widely adopted in the OSM ecosystem. Generating Overpass queries from natural language input serves multiple use-cases. It enables novice users to utilize OverpassQL without prior knowledge, assists experienced users with crafting advanced queries, and enables tool-augmented large language models to access information stored in the OSM database. In order to assess the performance of current sequence generation models on this task, we propose OverpassNL, a dataset of 8,352 queries with corresponding natural language inputs. We further introduce task specific evaluation metrics and ground the evaluation of the Text-to-OverpassQL task by executing the queries against the OSM database. We establish strong baselines by finetuning sequence-to-sequence models and adapting large language models with in-context examples. The detailed evaluation reveals strengths and weaknesses of the considered learning strategies, laying the foundations for further research into the Text-to-OverpassQL task.
+ 562–575
+ 2024.tacl-1.31
+ staniek-etal-2024-text
+
+
+ Eliciting the Translation Ability of Large Language Models via Multilingual Finetuning with Translation Instructions
+ JiahuanLi
+ HaoZhou
+ ShujianHuang
+ ShanboCheng
+ JiajunChen
+ 10.1162/tacl_a_00655
+ Large-scale pretrained language models (LLMs), such as ChatGPT and GPT4, have shown strong abilities in multilingual translation, without being explicitly trained on parallel corpora. It is intriguing how the LLMs obtain their ability to carry out translation instructions for different languages. In this paper, we present a detailed analysis by finetuning a multilingual pretrained language model, XGLM-7.5B, to perform multilingual translation following given instructions. Firstly, we show that multilingual LLMs have stronger translation abilities than previously demonstrated. For a certain language, the translation performance depends on its similarity to English and the amount of data used in the pretraining phase. Secondly, we find that LLMs’ ability to carry out translation instructions relies on the understanding of translation instructions and the alignment among different languages. With multilingual finetuning with translation instructions, LLMs could learn to perform the translation task well even for those language pairs unseen during the instruction tuning phase.
+ 576–592
+ 2024.tacl-1.32
+ li-etal-2024-eliciting
+
+
+ Semantics of Multiword Expressions in Transformer-Based Models: A Survey
+ FilipMiletić
+ Sabine Schulte imWalde
+ 10.1162/tacl_a_00657
+ Multiword expressions (MWEs) are composed of multiple words and exhibit variable degrees of compositionality. As such, their meanings are notoriously difficult to model, and it is unclear to what extent this issue affects transformer architectures. Addressing this gap, we provide the first in-depth survey of MWE processing with transformer models. We overall find that they capture MWE semantics inconsistently, as shown by reliance on surface patterns and memorized information. MWE meaning is also strongly localized, predominantly in early layers of the architecture. Representations benefit from specific linguistic properties, such as lower semantic idiosyncrasy and ambiguity of target expressions. Our findings overall question the ability of transformer models to robustly capture fine-grained semantics. Furthermore, we highlight the need for more directly comparable evaluation setups.
+ 593–612
+ 2024.tacl-1.33
+ miletic-walde-2024-semantics
+
+
+ The Thai Discourse Treebank: Annotating and Classifying Thai Discourse Connectives
+ PonraweePrasertsom
+ ApiwatJaroonpol
+ Attapol T.Rutherford
+ 10.1162/tacl_a_00650
+ Discourse analysis is a highly applicable area of natural language processing. In English and other languages, resources for discourse-based tasks are widely available. Thai, however, has hitherto lacked such resources. We present the Thai Discourse Treebank, the first, large Thai corpus annotated in the style of the Penn Discourse Treebank. The resulting corpus has over 10,000 sentences and 18,000 instances of connectives in 33 different relations. We release the corpus alongside our list of 148 potentially polysemous discourse connectives with a total of 340 form-sense pairs and their classification criteria to facilitate future research. We also develop models for connective identification and classification tasks. Our best models achieve an F1 of 0.96 in the identification task and 0.46 on the sense classification task. Our results serve as benchmarks for future models for Thai discourse tasks.
+ 613–629
+ 2024.tacl-1.34
+ prasertsom-etal-2024-thai
+
+
+ Federated Learning for Exploiting Annotators’ Disagreements in Natural Language Processing
+ NuriaRodríguez-Barroso
+ Eugenio MartínezCámara
+ Jose CamachoCollados
+ M. VictoriaLuzón
+ FranciscoHerrera
+ 10.1162/tacl_a_00664
+ The annotation of ambiguous or subjective NLP tasks is usually addressed by various annotators. In most datasets, these annotations are aggregated into a single ground truth. However, this omits divergent opinions of annotators, hence missing individual perspectives. We propose FLEAD (Federated Learning for Exploiting Annotators’ Disagreements), a methodology built upon federated learning to independently learn from the opinions of all the annotators, thereby leveraging all their underlying information without relying on a single ground truth. We conduct an extensive experimental study and analysis in diverse text classification tasks to show the contribution of our approach with respect to mainstream approaches based on majority voting and other recent methodologies that also learn from annotator disagreements.
+ 630–648
+ 2024.tacl-1.35
+ rodriguez-barroso-etal-2024-federated
+
+
+ Computational Complexity of Natural Morphology Revisited
+ HajimeSenuma
+ AkikoAizawa
+ 10.1162/tacl_a_00665
+ This paper revisits a classical, yet fundamental, discussion of theoretical computational linguistics: the computational complexity of natural languages. Past studies have revealed that syntax, as observed in Swiss-German, is not weakly context-free. Concerning morphology, Culy (1985) employed a construction in Bambara to show that morphology is not weakly context-free; however, Manaster-Ramer (1988) pointed out that the Bambara case can be problematic because the wordhood of the construction is reliant on special tonal behaviors, and it is ambiguous whether the behaviors belong to the morphological domain. This raises doubts about whether the case can be considered a genuine morphological phenomenon. In this paper, we argue that Classical Ainu, a language we examine, also defies weak context-freeness at the morphological level. The construction we introduce is unambiguously morphological because this language’s valency-sensitive structure and valency-changing operations, such as noun incorporation, preclude its grammatical interpretation as syntactic.
+ 649–663
+ 2024.tacl-1.36
+ senuma-aizawa-2024-computational
+
+
+ Improving Probability-based Prompt Selection Through Unified Evaluation and Analysis
+ SoheeYang
+ JonghyeonKim
+ JoelJang
+ SeonghyeonYe
+ HyunjiLee
+ MinjoonSeo
+ 10.1162/tacl_a_00666
+ Previous work in prompt engineering for large language models has introduced different gradient-free probability-based prompt selection methods that aim to choose the optimal prompt among the candidates for a given task but have failed to provide a comprehensive and fair comparison between each other. In this paper, we propose a unified framework to interpret and evaluate the existing probability-based prompt selection methods by performing extensive experiments on 13 common and diverse NLP tasks. We find that each of the existing methods can be interpreted as some variant of the method that maximizes mutual information between the input and the predicted output (MI). Utilizing this finding, we develop several other combinatorial variants of MI and increase the effectiveness of the oracle prompt selection method from 87.79% to 94.98%, measured as the ratio of the performance of the selected prompt to that of the optimal oracle prompt. Furthermore, considering that all the methods rely on the output probability distribution of the model that might be biased, we propose a novel calibration method called Calibration by Marginalization (CBM) that is orthogonal to the existing methods and helps increase the prompt selection effectiveness of the best method to 96.85%, achieving 99.44% of the oracle prompt F1 without calibration.
+ 664–680
+ 2024.tacl-1.37
+ yang-etal-2024-improving
+
+
+ Evaluating Correctness and Faithfulness of Instruction-Following Models for Question Answering
+ VaibhavAdlakha
+ ParishadBehnamGhader
+ Xing HanLu
+ NicholasMeade
+ SivaReddy
+ 10.1162/tacl_a_00667
+ Instruction-following models are attractive alternatives to fine-tuned approaches for question answering (QA). By simply prepending relevant documents and an instruction to their input, these models can be adapted to various information domains and tasks without additional training. However, these models tend to produce verbose responses with supplementary information, which makes traditional QA metrics like exact match (EM) and F1 unreliable for accurately quantifying model performance. In this work, we evaluate instruction-following models along two fronts: 1) how well they satisfy user’s information need (correctness), and 2) whether they disseminate information supported by the provided knowledge (faithfulness). Guided by human evaluation and analysis, we highlight the shortcomings of traditional metrics for both correctness and faithfulness and propose simple token-overlap metrics that correlate highly with human judgments. Our analysis reveals that for correctness, instruction-following models perform comparably to models specifically fine-tuned for that task. However, they struggle to accurately judge the relevance of the provided knowledge and often hallucinate in their responses. We hope our work encourages more holistic evaluation of instruction-following models for QA. Our code and human annotation data is available at https://github.com/McGill-NLP/instruct-qa.
+ 681–699
+ 2024.tacl-1.38
+ adlakha-etal-2024-evaluating
+
+
+
diff --git a/data/xml/2024.tdle.xml b/data/xml/2024.tdle.xml
new file mode 100644
index 0000000000..a371136cc3
--- /dev/null
+++ b/data/xml/2024.tdle.xml
@@ -0,0 +1,95 @@
+
+
+
+
+ Proceedings of the Second International Workshop Towards Digital Language Equality (TDLE): Focusing on Sustainability @ LREC-COLING 2024
+ FedericoGaspari
+ JossMoorkens
+ ItziarAldabe
+ AritzFarwell
+ BegonaAltuna
+ SteliosPiperidis
+ GeorgRehm
+ GermanRigau
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.tdle-1
+ tdle
+ ws
+
+
+ 2024.tdle-1.0
+ tdle-2024-international
+
+
+ Surveying the Technology Support of Languages
+ AnnikaGrützner-Zahn
+ FedericoGaspari
+ MariaGiagkou
+ StefanieHegele
+ AndyWay
+ GeorgRehm
+ 1–17
+ Many of the world’s languages are left behind when it comes to Language Technology applications, since most of these are available only in a limited number of languages, creating a digital divide that affects millions of users worldwide. It is crucial, therefore, to monitor and quantify the progress of technology support for individual languages, which also enables comparisons across language communities. In this way, efforts can be directed towards reducing language barriers, promoting economic and social inclusion, and ensuring that all citizens can use their preferred language in the digital age. This paper critically reviews and compares recent quantitative approaches to measuring technology support for languages. Despite using different approaches and methodologies, the findings of all analysed papers demonstrate the unequal distribution of technology support and emphasise the existence of a digital divide among languages.
+ 2024.tdle-1.1
+ grutzner-zahn-etal-2024-surveying
+
+
+ Which Domains, Tasks and Languages are in the Focus of NLP Research on the Languages of Europe?
+ DiegoAlves
+ MarkoTadić
+ GeorgRehm
+ 18–32
+ This article provides a thorough mapping of NLP and Language Technology research on 39 European languages onto 46 domains. Our analysis is based on almost 50,000 papers published between 2010 and October 2022 in the ACL Anthology. We use a dictionary-based approach to identify 1) languages, 2) domains, and 3) NLP tasks in these papers; the dictionary-based method using exact terms has a precision value of 0.81. Moreover, we identify common mistakes which can be useful to fine-tune the methodology for future work. While we are only able to highlight selected results in this submitted version, the final paper will contain detailed analyses and charts on a per-language basis. We hope that this study can contribute to digital language equality in Europe by providing information to the academic and industrial research community about the opportunities for novel LT/NLP research.
+ 2024.tdle-1.2
+ alves-etal-2024-domains
+
+
+ Fine-Tuning Open Access LLMs for High-Precision NLU in Goal-Driven Dialog Systems
+ LluísPadró
+ RoserSaurí
+ 33–42
+ This paper presents a set of experiments on fine-tuning LLMs to produce high-precision semantic representations for the NLU component of a dialog system front-end. The aim of this research is threefold: First, we want to explore the capabilities of LLMs on real, industry-based use cases that involve complex data and strict requirements on results. Since the LLM output should be usable by the application back-end, the produced semantic representation must satisfy strict format and consistency requirements. Second, we want to evaluate the cost-benefit of open-source LLMs, that is, the feasibility of running this kind of model on machines affordable to small-medium enterprises (SMEs), in order to assess how far these organizations can go without depending on the large players controlling the market, and with a moderate use of computation resources. Finally, we also want to assess the language scalability of the LLMs in this kind of application; specifically, whether a multilingual model is able to cast patterns learnt from one language to others (with special attention to under-resourced languages), thus reducing required training data and computation costs. This work was carried out within an R&D context of assisting a real company in defining its NLU model strategy, and thus the results have a practical, industry-level focus.
+ 2024.tdle-1.3
+ padro-sauri-2024-fine
+
+
+ Could We Have Had Better Multilingual LLMs if English Was Not the Central Language?
+ RyanditoDiandaru
+ LuckySusanto
+ ZiluTang
+ AyuPurwarianti
+ Derry TantiWijaya
+ 43–52
+ Large Language Models (LLMs) demonstrate strong machine translation capabilities on languages they are trained on. However, the impact of factors beyond training data size on translation performance remains a topic of debate, especially concerning languages not directly encountered during training. Our study delves into Llama2’s translation capabilities. By modeling a linear relationship between linguistic feature distances and machine translation scores, we ask ourselves if there are potentially better central languages for LLMs other than English. Our experiments show that the 7B Llama2 model yields above 10 BLEU when translating into all languages it has seen, which rarely happens for languages it has not seen. Most translation improvements into unseen languages come from scaling up the model size rather than instruction tuning or increasing shot count. Furthermore, our correlation analysis reveals that syntactic similarity is not the only linguistic factor that strongly correlates with machine translation scores. Interestingly, we discovered that under specific circumstances, some languages (e.g. Swedish, Catalan), despite having significantly less training data, exhibit comparable correlation levels to English. These insights challenge the prevailing landscape of LLMs, suggesting that models centered around languages other than English could provide a more efficient foundation for multilingual applications.
+ 2024.tdle-1.4
+ diandaru-etal-2024-better
+
+
+ A Language Model Trained on Uruguayan Spanish News Text
+ Juan PabloFilevich
+ GonzaloMarco
+ SantiagoCastro
+ LuisChiruzzo
+ AialaRosá
+ 53–60
+ This paper presents a language model trained from scratch exclusively on a brand new corpus consisting of about 6 GiB of Uruguayan newspaper text. We trained the model for 30 days on a single Nvidia P100 using the RoBERTa-base architecture but with considerably fewer parameters than other standard RoBERTa models. We evaluated the model on two NLP tasks and found that it outperforms BETO, the widely used Spanish BERT pre-trained model. We also compared our model on the masked-word prediction task with two popular multilingual BERT-based models, Multilingual BERT and XLM-RoBERTa, obtaining outstanding results on sentences from the Uruguayan press domain. Our experiments show that training a language model on a domain-specific corpus can significantly improve performance even when the model is smaller and was trained with significantly less data than more standard pre-trained models.
+ 2024.tdle-1.5
+ filevich-etal-2024-language
+
+
+ Environmental Impact Measurement in the MentalRiskES Evaluation Campaign
+ Alba M.Mármol Romero
+ AdriánMoreno-Muñoz
+ Flor MiriamPlaza-del-Arco
+ M. DoloresMolina González
+ ArturoMontejo-Ráez
+ 61–72
+ With the rise of Large Language Models (LLMs), the NLP community is increasingly aware of the environmental consequences of model development due to the energy consumed for training and running these models. This study investigates the energy consumption and environmental impact of systems participating in the MentalRiskES shared task, at the Iberian Language Evaluation Forum (IberLEF) in the year 2023, which focuses on early risk identification of mental disorders in Spanish comments. Participants were asked to submit, for each prediction, a set of efficiency metrics, carbon dioxide emissions being among them. We conduct an empirical analysis of the data submitted considering model architecture, task complexity, and dataset characteristics, covering a spectrum from traditional Machine Learning (ML) models to advanced LLMs. Our findings contribute to understanding the ecological footprint of NLP systems and advocate for prioritizing environmental impact assessment in shared tasks to foster sustainability across diverse model types and approaches, as evaluation campaigns provide an adequate framework for this kind of analysis.
+ 2024.tdle-1.6
+ marmol-romero-etal-2024-environmental
+
+
+
diff --git a/data/xml/2024.teicai.xml b/data/xml/2024.teicai.xml
index 5c25df0461..8d8f965eca 100644
--- a/data/xml/2024.teicai.xml
+++ b/data/xml/2024.teicai.xml
@@ -30,6 +30,7 @@
In healthcare, agency refers to the ability of patients to actively participate in and control their health through collaborating with providers, informed decision-making and understanding health information. Conversational agents (CAs) are increasingly used for realizing digital health interventions, but it is still unclear how they are enhancing patient agency. This paper explores which technological components are required to enable CAs to impact patient agency, and identifies metrics for measuring and evaluating this impact. We do this by drawing on existing work related to developing and evaluating healthcare CAs and through analysis of a concrete example of a CA. As a result, we identify five main areas where CAs enhance patient agency, namely by: improved access to health information, personalized advice, increased engagement, emotional support and reduced barriers to care. For each of these areas, specific technological functions have to be integrated into CAs, such as sentiment and emotion analysis methods that allow a CA to provide emotional support.
2024.teicai-1.1
denecke-2024-conversational
+
Why academia should cut back general enthusiasm about CAs
@@ -38,6 +39,7 @@
This position paper will analyze LLMs, the core technology of CAs, from a socio-technical and linguistic perspective in order to argue for a limitation of its use in academia, which should be reflected in a more cautious adoption of CAs in private spaces. The article describes how machine learning technologies like LLMs are inserted into a more general process of platformization (van Dijck, 2021), negatively affecting autonomy of research (Kersessens and van Dijck, 2022). Moreover, fine-tuning practices, as means to polish language models (Kasirzadeh and Gabriel, 2023) are questioned, explaining how these foster a deterministic approach to language. A leading role of universities in this general gain of awareness is strongly advocated, as institutions that support transparent and open science, in order to foster and protect democratic values in our societies.
2024.teicai-1.2
giulimondi-2024-academia
+
Bridging the Language Gap: Integrating Language Variations into Conversational AI Agents for Enhanced User Engagement
@@ -48,6 +50,7 @@
This paper presents the initial steps taken to integrate language variations into conversational AI agents to enhance user engagement. The study is built upon sociolinguistic and pragmatic traditions and involves the creation of an annotation taxonomy. The taxonomy includes eleven classes, ranging from concrete to abstract, and the covered aspects are the instance itself, time, sentiment, register, state, region, type, grammar, part of speech, meaning, and language. The paper discusses the challenges of incorporating vernacular language into AI agents, the procedures for data collection, and the taxonomy organization. It also outlines the next steps, including the database expansion and the computational implementation. The authors believe that integrating language variation into conversational AI will build near-real language inventories and boost user engagement. The paper concludes by discussing the limitations and the importance of building rapport with users through their own vernacular.
2024.teicai-1.3
amadeus-etal-2024-bridging
+
Socio-cultural adapted chatbots: Harnessing Knowledge Graphs and Large Language Models for enhanced context awareness
@@ -59,6 +62,7 @@
Understanding the socio-cultural context is crucial in machine translation (MT). Although conversational AI systems and chatbots, in particular, are not designed for translation, they can be used for MT purposes. Yet, chatbots often struggle to identify any socio-cultural context during user interactions. In this paper, we highlight this challenge with real-world examples from popular chatbots. We advocate for the use of knowledge graphs as an external source of information that can potentially encapsulate socio-cultural contexts, aiding chatbots in enhancing translation. We further present a method to exploit external knowledge and extract contextual information that can significantly improve text translation, as evidenced by our interactions with these chatbots.
2024.teicai-1.4
camboim-de-sa-etal-2024-socio
+
How should Conversational Agent systems respond to sexual harassment?
@@ -70,6 +74,7 @@
This paper investigates the appropriate responses that Conversational Agent systems (CAs) should employ when subjected to sexual harassment by users. Previous studies indicate that conventional CAs often respond neutrally or evade such requests. Enhancing the responsiveness of CAs to offensive speech is crucial, as users might carry over these interactions into their social interactions. To address this issue, we selected evaluators to compare a series of responses to sexual harassment from four commercial CAs (Amazon Alexa, Apple Siri, Google Home, and Microsoft Cortana) with alternative responses we realized based on insights from psychological and sociological studies. Focusing on CAs with a female voice, given their increased likelihood of encountering offensive language, we conducted two experiments involving 22 evaluators (11 females and 11 males). In the initial experiment, participants assessed the responses in a textual format, while the second experiment involved the evaluation of responses generated with a synthetic voice exhibiting three different intonations (angry, neutral, and assertive). Results from the first experiment revealed a general preference for the responses we formulated. For the most voted replies, female evaluators exhibited a tendency towards responses with an assertive intent, emphasizing the sexually harassing nature of the request. Conversely, male evaluators leaned towards a more neutral response, aligning with prior findings that highlight gender-based differences in the perception of sexual harassment. The second experiment underscored a preference for assertive responses. The study’s outcomes highlight the need to develop new, educational responses from CAs to instances of sexual harassment, aiming to discourage harmful behavior.
2024.teicai-1.5
de-grazia-etal-2024-conversational
+
Non-Referential Functions of Language in Social Agents: The Case of Social Proximity
@@ -78,6 +83,7 @@
Non-referential functions of language such as setting group boundaries, identity construction and regulation of social proximity have rarely found a place in the language technology creation process. Nevertheless, their importance has been postulated in literature. While multiple methods to include social information in large language models (LLM) cover group properties (gender, age, geographic relations, professional characteristics), a combination of group social characteristics and individual features of an agent (natural or artificial) play a role in social interaction but have not been studied in generated language. This article explores the orchestration of prompt engineering and retrieval-augmented generation techniques to elicit linguistic features of social proximity and distance in language generated by an LLM. The study uses the immediacy/distance model from literature to analyse language generated by an LLM for different recipients. This research reveals that kinship terms are almost the only way of displaying immediacy in LLM-made conversations.
2024.teicai-1.6
hohn-2024-non
+
Making a Long Story Short in Conversation Modeling
@@ -88,6 +94,7 @@
Conversation systems accommodate diverse users with unique personalities and distinct writing styles. Within the domain of multi-turn dialogue modeling, this work studies the impact of varied utterance lengths on the quality of subsequent responses generated by conversation models. Using GPT-3 as the base model, multiple dialogue datasets, and several metrics, we conduct a thorough exploration of this aspect of conversational models. Our analysis sheds light on the complex relationship between utterance lengths and the quality of follow-up responses generated by dialogue systems. Empirical findings suggest that, for certain types of conversations, utterance lengths can be reduced by up to 72% without any noticeable difference in the quality of follow-up responses.
2024.teicai-1.7
tao-etal-2024-making
+
diff --git a/data/xml/2024.trac.xml b/data/xml/2024.trac.xml
new file mode 100644
index 0000000000..b4c5f1918b
--- /dev/null
+++ b/data/xml/2024.trac.xml
@@ -0,0 +1,214 @@
+
+
+
+
+ Proceedings of the Fourth Workshop on Threat, Aggression & Cyberbullying @ LREC-COLING-2024
+ RiteshKumar
+ Atul Kr.Ojha
+ ShervinMalmasi
+ Bharathi RajaChakravarthi
+ BorniniLahiri
+ SiddharthSingh
+ ShyamRatan
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.trac-1
+ trac
+ ws
+
+
+ 2024.trac-1.0
+ trac-2024-threat
+
+
+ The Constant in HATE: Toxicity in Reddit across Topics and Languages
+ Wondimagegnhue TsegayeTufa
+ IliaMarkov
+ Piek T.J.M.Vossen
+ 1–11
+ Toxic language remains an ongoing challenge on social media platforms, presenting significant issues for users and communities. This paper provides a cross-topic and cross-lingual analysis of toxicity in Reddit conversations. We collect 1.5 million comment threads from 481 communities in six languages. By aligning languages with topics, we thoroughly analyze how toxicity spikes within different communities. Our analysis targets six languages spanning different communities and topics such as Culture, Politics, and News. We observe consistent patterns across languages where toxicity increases within the same topics while also identifying significant differences where specific language communities exhibit notable variations in relation to certain topics.
+ 2024.trac-1.1
+ tufa-etal-2024-constant
+
+
+ A Federated Learning Approach to Privacy Preserving Offensive Language Identification
+ MarcosZampieri
+ DamithPremasiri
+ TharinduRanasinghe
+ 12–20
+ The spread of various forms of offensive speech online is an important concern in social media. While platforms have been investing heavily in ways of coping with this problem, the question of privacy remains largely unaddressed. Models trained to detect offensive language on social media are trained and/or fine-tuned using large amounts of data often stored in centralized servers. Since most social media data originates from end users, we propose a privacy preserving decentralized architecture for identifying offensive language online by introducing Federated Learning (FL) in the context of offensive language identification. FL is a decentralized architecture that allows multiple models to be trained locally without the need for data sharing hence preserving users’ privacy. We propose a model fusion approach to perform FL. We trained multiple deep learning models on four publicly available English benchmark datasets (AHSD, HASOC, HateXplain, OLID) and evaluated their performance in detail. We also present initial cross-lingual experiments in English and Spanish. We show that the proposed model fusion approach outperforms baselines in all the datasets while preserving privacy.
+ 2024.trac-1.2
+ zampieri-etal-2024-federated
+
+
+ CLTL@HarmPot-ID: Leveraging Transformer Models for Detecting Offline Harm Potential and Its Targets in Low-Resource Languages
+ YeshanWang
+ IliaMarkov
+ 21–26
+ We present the winning approach to the TRAC 2024 Shared Task on Offline Harm Potential Identification (HarmPot-ID). The task focused on low-resource Indian languages and consisted of two sub-tasks: 1a) predicting the offline harm potential and 1b) detecting the most likely target(s) of the offline harm. We explored low-resource domain-specific, cross-lingual, and monolingual transformer models and submitted the aggregate predictions from the MuRIL and BERT models. Our approach achieved 0.74 micro-averaged F1-score for sub-task 1a and 0.96 for sub-task 1b, securing the 1st rank for both sub-tasks in the competition.
+ 2024.trac-1.3
+ wang-markov-2024-cltl-harmpot
+
+
+ NJUST-KMG at TRAC-2024 Tasks 1 and 2: Offline Harm Potential Identification
+ JingyuanWang
+ JackDepp
+ YangYang
+ 27–31
+ This report provides a detailed description of the method that we proposed for the TRAC-2024 Offline Harm Potential Identification task, which encloses two sub-tasks. The investigation utilized a rich dataset comprising social media comments in several Indian languages, annotated with precision by expert judges to capture the nuanced implications for offline context harm. The objective assigned to the participants was to design algorithms capable of accurately assessing the likelihood of harm in given situations and identifying the most likely target(s) of offline harm. Our approach ranked second in two separate tracks, with F1 values of 0.73 and 0.96 respectively. Our method principally involved selecting pretrained models for finetuning, incorporating contrastive learning techniques, and culminating in an ensemble approach for the test set.
+ 2024.trac-1.4
+ wang-etal-2024-njust
+
+
+ ScalarLab@TRAC2024: Exploring Machine Learning Techniques for Identifying Potential Offline Harm in Multilingual Commentaries
+ AnaghaH C
+ Saatvik M.Krishna
+ Soumya SangamJha
+ Vartika T.Rao
+ Anand KumarM
+ 32–36
+ The objective of the shared task, Offline Harm Potential Identification (HarmPot-ID), is to build models to predict the offline harm potential of social media texts. “Harm potential” is defined as the ability of an online post or comment to incite offline physical harm such as murder, arson, riot, rape, etc. The first subtask was to predict the level of harm potential, and the second was to identify the group towards which this harm was directed. This paper details our submissions for the shared task, which include a cascaded SVM model, an XGBoost model, and a TF-IDF weighted Word2Vec embedding-supported SVM model. Several other models that were explored have also been detailed.
+ 2024.trac-1.5
+ h-c-etal-2024-scalarlab
+
+
+ LLM-Based Synthetic Datasets: Applications and Limitations in Toxicity Detection
+ UdoKruschwitz
+ MaximilianSchmidhuber
+ 37–51
+ Large Language Model (LLM)-based Synthetic Data is becoming an increasingly important field of research. One of its promising applications is in training classifiers to detect online toxicity, which is of increasing concern in today’s digital landscape. In this work, we assess the feasibility of generative models to generate synthetic data for toxic speech detection. Our experiments are conducted on six different toxicity datasets, four of which are hateful and two are toxic in the broader sense. We then employ a classifier trained on the original data for filtering. To explore the potential of this data, we conduct experiments using combinations of original and synthetic data, synthetic oversampling of the minority class, and a comparison of original vs. synthetic-only training. Results indicate that while our generative models offer benefits in certain scenarios, they do not improve hateful dataset classification. However, they do boost patronizing and condescending language detection. We find that synthetic data generated by LLMs is a promising avenue of research, but further research is needed to improve the quality of the generated data and develop better filtering methods. Code is available on GitHub; the generated dataset will be available on Zenodo in the final submission.
+ 2024.trac-1.6
+ kruschwitz-schmidhuber-2024-llm
+
+
+ Using Sarcasm to Improve Cyberbullying Detection
+ XiaoyuGuo
+ SusanGauch
+ 52–59
+ Cyberbullying has become more prevalent over time, especially towards minority groups, and online human moderators cannot detect cyberbullying content efficiently. Prior work has addressed this problem by detecting cyberbullying with deep learning approaches. In this project, we compare several BERT-based benchmark methods for cyberbullying detection and do a failure analysis to see where the model fails to correctly identify cyberbullying. We find that many falsely classified texts are sarcastic, so we propose a method to mitigate the false classifications by incorporating neural network-based sarcasm detection. We define a simple multilayer perceptron (MLP) that incorporates sarcasm detection in the final cyberbully classifications and demonstrate improvement over benchmark methods.
+ 2024.trac-1.7
+ guo-gauch-2024-using
+
+
+ Analyzing Offensive Language and Hate Speech in Political Discourse: A Case Study of German Politicians
+ MaximilianWeissenbacher
+ UdoKruschwitz
+ 60–72
+ Social media platforms have become key players in political discourse. Twitter (now ‘X’), for example, is used by many German politicians to communicate their views and interact with others. Due to its nature, however, social networks suffer from a number of issues such as offensive content, toxic language and hate speech. This has attracted a lot of research interest, but in the context of political discourse there is a noticeable gap, with no such study specifically looking at German politicians in a systematic way. We aim to help address this gap. We first create an annotated dataset of 1,197 Twitter posts mentioning German politicians. This is the basis to explore a number of approaches to detect hate speech and offensive language (HOF) and identify an ensemble of transformer models that achieves a macro-F1 score of 0.94. This model is then used to automatically classify two much larger, longitudinal datasets: one with 520,000 tweets posted by MPs, and the other with 2,200,000 tweets which comprise posts from the public mentioning politicians. We obtain interesting insights regarding the distribution of hate and offensive content when looking at different independent variables.
+ 2024.trac-1.8
+ weissenbacher-kruschwitz-2024-analyzing
+
+
+ Ice and Fire: Dataset on Sentiment, Emotions, Toxicity, Sarcasm, Hate speech, Sympathy and More in Icelandic Blog Comments
+ Steinunn RutFriðriksdóttir
+ AnnikaSimonsen
+ Atli SnærÁsmundsson
+ Guðrún LiljaFriðjónsdóttir
+ Anton KarlIngason
+ VésteinnSnæbjarnarson
+ HafsteinnEinarsson
+ 73–84
+ This study introduces “Ice and Fire,” a Multi-Task Learning (MTL) dataset tailored for sentiment analysis in the Icelandic language, encompassing a wide range of linguistic tasks, including sentiment and emotion detection, as well as identification of toxicity, hate speech, encouragement, sympathy, sarcasm/irony, and trolling. With 261 fully annotated blog comments and 1045 comments annotated in at least one task, this contribution marks a significant step forward in the field of Icelandic natural language processing. It provides a comprehensive dataset for understanding the nuances of online communication in Icelandic and an interface to expand the annotation effort. Despite the challenges inherent in subjective interpretation of text, our findings highlight the positive potential of this dataset to improve text analysis techniques and encourage more inclusive online discourse in Icelandic communities. With promising baseline performances, “Ice and Fire” sets the stage for future research to enhance automated text analysis and develop sophisticated language technologies, contributing to healthier online environments and advancing Icelandic language resources.
+ 2024.trac-1.9
+ fridriksdottir-etal-2024-ice
+
+
+ Detecting Hate Speech in Amharic Using Multimodal Analysis of Social Media Memes
+ Melese AyichlieJigar
+ Abinew AliAyele
+ Seid MuhieYimam
+ ChrisBiemann
+ 85–95
+ In contemporary society, the proliferation of hate speech is increasingly prevalent across various social media platforms, with a notable trend of incorporating memes to amplify its visual impact and reach. The conventional text-based detection approaches frequently fail to address the complexities introduced by memes, thereby aggravating the challenges, particularly in low-resource languages such as Amharic. We develop Amharic meme hate speech detection models using 2,000 memes collected from Facebook, Twitter, and Telegram over four months. We employ native Amharic speakers to annotate each meme using a web-based tool, yielding a Fleiss’ kappa score of 0.50. We utilize different feature extraction techniques, namely VGG16 for images and word2Vec for textual content, and build unimodal and multimodal models such as LSTM, BiLSTM, and CNN. The BiLSTM model shows the best performance, achieving 63% accuracy for text and 75% for multimodal features. In image-only experiments, the CNN model achieves 69% in accuracy. Multimodal models demonstrate superior performance in detecting Amharic hate speech in memes, showcasing their potential to address the unique challenges posed by meme-based hate speech on social media.
+ 2024.trac-1.10
+ jigar-etal-2024-detecting
+
+
+ Content Moderation in Online Platforms: A Study of Annotation Methods for Inappropriate Language
+ BaranBarbarestani
+ IsaMaks
+ Piek T.J.M.Vossen
+ 96–104
+ Detecting inappropriate language in online platforms is vital for maintaining a safe and respectful digital environment, especially in the context of hate speech prevention. However, defining what constitutes inappropriate language can be highly subjective and context-dependent, varying from person to person. This study presents the outcomes of a comprehensive examination of the subjectivity involved in assessing inappropriateness within conversational contexts. Different annotation methods, including expert annotation, crowd annotation, ChatGPT-generated annotation, and lexicon-based annotation, were applied to English Reddit conversations. The analysis revealed a high level of agreement across these annotation methods, with most disagreements arising from subjective interpretations of inappropriate language. This emphasizes the importance of implementing content moderation systems that not only recognize inappropriate content but also understand and adapt to diverse user perspectives and contexts. The study contributes to the evolving field of hate speech annotation by providing a detailed analysis of annotation differences in relation to the subjective task of judging inappropriate words in conversations.
+ 2024.trac-1.11
+ barbarestani-etal-2024-content
+
+
+ FrenchToxicityPrompts: a Large Benchmark for Evaluating and Mitigating Toxicity in French Texts
+ CarolineBrun
+ VassilinaNikoulina
+ 105–114
+ Large language models (LLMs) are increasingly popular but are also prone to generating biased, toxic or harmful language, which can have detrimental effects on individuals and communities. Although most effort to assess and mitigate toxicity in generated content is concentrated on English, it is essential to consider other languages as well. To address this issue, we create and release FrenchToxicityPrompts, a dataset of 50K naturally occurring French prompts and their continuations, annotated with toxicity scores from a widely used toxicity classifier. We evaluate 14 different models from four prevalent open-sourced families of LLMs against our dataset to assess their potential toxicity across various dimensions. We hope that our contribution will foster future research on toxicity detection and mitigation beyond English.
+ 2024.trac-1.12
+ brun-nikoulina-2024-frenchtoxicityprompts
+
+
+ Studying Reactions to Stereotypes in Teenagers: an Annotated Italian Dataset
+ ElisaChierchiello
+ TomBourgeade
+ GiacomoRicci
+ CristinaBosco
+ FrancescaD’Errico
+ 115–125
+ The paper introduces a novel corpus collected in a set of experiments in Italian schools, annotated for the presence of stereotypes, and related categories. It consists of comments written by teenage students in reaction to fabricated fake news, designed to elicit prejudiced responses, by featuring racial stereotypes. We make use of an annotation scheme which takes into account the implicit or explicit nature of different instances of stereotypes, alongside their forms of discredit. We also annotate the stance of the commenter towards the news article, using a schema inspired by rumor and fake news stance detection tasks. Through this rarely studied setting, we provide a preliminary exploration of the production of stereotypes in a more controlled context. Alongside this novel dataset, we provide both quantitative and qualitative analyses of these reactions, to validate the categories used in their annotation. Through this work, we hope to increase the diversity of available data in the study of the propagation and the dynamics of negative stereotypes.
+ 2024.trac-1.13
+ chierchiello-etal-2024-studying
+
+
+ Offensiveness, Hate, Emotion and GPT: Benchmarking GPT3.5 and GPT4 as Classifiers on Twitter-specific Datasets
+ NikolajBauer
+ MoritzPreisig
+ MartinVolk
+ 126–133
+ In this paper, we extend the work of benchmarking GPT by turning GPT models into classifiers and applying them on three different Twitter datasets on Hate-Speech Detection, Offensive Language Detection, and Emotion Classification. We use a Zero-Shot and Few-Shot approach to evaluate the classification capabilities of the GPT models. Our results show that GPT models do not always beat fine-tuned models on the tested benchmarks. However, in Hate-Speech and Emotion Detection, using a Few-Shot approach, state-of-the-art performance can be achieved. The results also reveal that GPT-4 is more sensitive to the examples given in a Few-Shot prompt, highlighting the importance of choosing fitting examples for inference and prompt formulation.
+ 2024.trac-1.14
+ bauer-etal-2024-offensiveness
+
+
+ DoDo Learning: Domain-Demographic Transfer in Language Models for Detecting Abuse Targeted at Public Figures
+ Angus RedlarskiWilliams
+ Hannah RoseKirk
+ LiamBurke-Moore
+ Yi-LingChung
+ IvanDebono
+ PicaJohansson
+ FrancescaStevens
+ JonathanBright
+ ScottHale
+ 134–154
+ Public figures receive disproportionate levels of abuse on social media, impacting their active participation in public life. Automated systems can identify abuse at scale but labelling training data is expensive and potentially harmful. So, it is desirable that systems are efficient and generalisable, handling shared and specific aspects of abuse. We explore the dynamics of cross-group text classification in order to understand how well models trained on one domain or demographic can transfer to others, with a view to building more generalisable abuse classifiers. We fine-tune language models to classify tweets targeted at public figures using our novel DoDo dataset, containing 28,000 entries with fine-grained labels, split equally across four Domain-Demographic pairs (male and female footballers and politicians). We find that (i) small amounts of diverse data are hugely beneficial to generalisation and adaptation; (ii) models transfer more easily across demographics but cross-domain models are more generalisable; (iii) some groups contribute more to generalisability than others; and (iv) dataset similarity is a signal of transferability.
+ 2024.trac-1.15
+ 2024.trac-1.15.OptionalSupplementaryMaterial.zip
+ williams-etal-2024-dodo
+
+
+ Empowering Users and Mitigating Harm: Leveraging Nudging Principles to Enhance Social Media Safety
+ GregorDonabauer
+ EmilyTheophilou
+ FrancescoLomonaco
+ SathyaBursic
+ DavideTaibi
+ DaviniaHernández-Leo
+ UdoKruschwitz
+ DimitriOgnibene
+ 155–166
+ Social media have become an integral part of our daily lives, yet they have also resulted in various negative effects on users, ranging from offensive or hateful content to the spread of misinformation. In recent years, numerous automated approaches have been proposed to identify and combat such harmful content. However, it is crucial to recognize the human aspect of users who engage with this content in designing efforts to mitigate these threats. We propose to incorporate principles of behavioral science, specifically the concept of nudging into social media platforms. Our approach involves augmenting social media feeds with informative diagrams, which provide insights into the content that users are presented. The goal of our work is to empower social media users to make well-informed decisions for themselves and for others within these platforms. Nudges serve as a means to gently draw users’ attention to content in an unintrusive manner, a crucial consideration in the context of social media. To evaluate the effectiveness of our approach, we conducted a user study involving 120 Italian-speaking participants who interacted with a social media interface augmented with these nudging diagrams. Participants who had used the augmented interface were able to outperform those using the plain interface in a successive harmful content detection test where nudging diagrams were not visible anymore. Our findings demonstrate that our approach significantly improves users’ awareness of potentially harmful content with effects lasting beyond the duration of the interaction. In this work, we provide a comprehensive overview of our experimental materials and setup, present our findings, and refer to the limitations identified during our study.
+ 2024.trac-1.16
+ donabauer-etal-2024-empowering
+
+
+ Exploring Boundaries and Intensities in Offensive and Hate Speech: Unveiling the Complex Spectrum of Social Media Discourse
+ Abinew AliAyele
+ Esubalew AlemnehJalew
+ Adem ChanieAli
+ Seid MuhieYimam
+ ChrisBiemann
+ 167–178
+ The prevalence of digital media and evolving sociopolitical dynamics have significantly amplified the dissemination of hateful content. Existing studies mainly focus on classifying texts into binary categories, often overlooking the continuous spectrum of offensiveness and hatefulness inherent in the text. In this research, we present an extensive benchmark dataset for Amharic, comprising 8,258 tweets annotated for three distinct tasks: category classification, identification of hate targets, and rating offensiveness and hatefulness intensities. Our study highlights that a considerable majority of tweets belong to the less offensive and less hate intensity levels, underscoring the need for early interventions by stakeholders. The prevalence of ethnic and political hatred targets, with significant overlaps in our dataset, emphasizes the complex relationships within Ethiopia’s sociopolitical landscape. We build classification and regression models and investigate the efficacy of models in handling these tasks. Our results reveal that hate and offensive speech cannot be addressed by a simplistic binary classification, instead manifesting as variables across a continuous range of values. The Afro-XLMR-large model exhibits the best performances, achieving F1-scores of 75.30%, 70.59%, and 29.42% for the category, target, and regression tasks, respectively. The 80.22% correlation coefficient of the Afro-XLMR-large model indicates strong alignments.
+ 2024.trac-1.17
+ ayele-etal-2024-exploring
+
+
+
diff --git a/data/xml/2024.uncertainlp.xml b/data/xml/2024.uncertainlp.xml
index 09eb3ad939..1af5a58461 100644
--- a/data/xml/2024.uncertainlp.xml
+++ b/data/xml/2024.uncertainlp.xml
@@ -37,6 +37,7 @@
Large language models are increasingly deployed for high-stakes decision making, for example in financial and medical applications. In such applications, it is imperative that we be able to estimate our confidence in the answers output by a language model in order to assess risks. Although we can easily compute the probability assigned by a language model to the sequence of tokens that make up an answer, we cannot easily compute the probability of the answer itself, which could be phrased in numerous ways. While other works have engineered ways of assigning such probabilities to LLM outputs, a key problem remains: existing language models are poorly calibrated, often confident when they are wrong or unsure when they are correct. In this work, we devise a protocol called *calibration tuning* for finetuning LLMs to output calibrated probabilities. Calibration-tuned models demonstrate superior calibration performance compared to existing language models on a variety of question-answering tasks, including open-ended generation, without affecting accuracy. We further show that this ability transfers to new domains outside of the calibration-tuning train set.
2024.uncertainlp-1.1
kapoor-etal-2024-calibration
+
Context Tuning for Retrieval Augmented Generation
@@ -56,6 +57,7 @@
This work explores the effectiveness of employing Clinical BERT for Relation Extraction (RE) tasks in medical texts within an Active Learning (AL) framework. Our main objective is to optimize RE in medical texts through AL while examining the trade-offs between performance and computation time, comparing it with alternative methods like Random Forest and BiLSTM networks. Comparisons extend to feature engineering requirements, performance metrics, and considerations of annotation costs, including AL step times and annotation rates. The utilization of AL strategies aligns with our broader goal of enhancing the efficiency of relation classification models, particularly when dealing with the challenges of annotating complex medical texts in a Human-in-the-Loop (HITL) setting. The results indicate that uncertainty-based sampling achieves comparable performance with significantly fewer annotated samples across three categories of supervised learning methods, thereby reducing annotation costs for clinical and biomedical corpora. While Clinical BERT exhibits clear performance advantages across two different corpora, the trade-off involves longer computation times in interactive annotation processes. In real-world applications, where practical feasibility and timely results are crucial, optimizing this trade-off becomes imperative.
2024.uncertainlp-1.3
liang-etal-2024-optimizing
+
Linguistic Obfuscation Attacks and Large Language Model Uncertainty
@@ -76,6 +78,7 @@
Automatically generated summaries can be evaluated along different dimensions, one being how faithfully the uncertainty from the source text is conveyed in the summary. We present a study on uncertainty alignment in automatic summarization, starting from a two-tier lexical and semantic categorization of linguistic expression of uncertainty, which we used to annotate source texts and automatically generate summaries. We collected a diverse dataset including news articles and personal blogs and generated summaries using GPT-4. Source texts and summaries were annotated based on our two-tier taxonomy using a markup language. The automatic annotation was refined and validated by subsequent iterations based on expert input. We propose a method to evaluate the fidelity of uncertainty transfer in text summarization. The method capitalizes on a small amount of expert annotations and on the capabilities of Large language models (LLMs) to evaluate how the uncertainty of the source text aligns with the uncertainty expressions in the summary.
2024.uncertainlp-1.5
kolagar-zarcone-2024-aligning
+
How Does Beam Search improve Span-Level Confidence Estimation in Generative Sequence Labeling?
@@ -86,6 +89,7 @@
Sequence labeling is a core task in text understanding for IE/IR systems. Text generation models have increasingly become the go-to solution for such tasks (e.g., entity extraction and dialog slot filling). While most research has focused on the labeling accuracy, a key aspect – of vital practical importance – has slipped through the cracks: understanding model confidence. More specifically, we lack a principled understanding of how to reliably gauge the confidence of a model in its predictions for each labeled span. This paper aims to provide some empirical insights on estimating model confidence for generative sequence labeling. Most notably, we find that simply using the decoder’s output probabilities is not the best in realizing well-calibrated confidence estimates. As verified over six public datasets of different tasks, we show that our proposed approach – which leverages statistics from top-k predictions by a beam search – significantly reduces calibration errors of the predictions of a generative sequence labeling model.
2024.uncertainlp-1.6
hashimoto-etal-2024-beam
+
Efficiently Acquiring Human Feedback with Bayesian Deep Learning
@@ -96,6 +100,7 @@
Learning from human feedback can improve models for text generation or passage ranking, aligning them better to a user’s needs. Data is often collected by asking users to compare alternative outputs to a given input, which may require a large number of comparisons to learn a ranking function. The number of comparisons needed can be reduced using Bayesian Optimisation (BO) to query the user about only the most promising candidate outputs. Previous applications of BO to text ranking relied on shallow surrogate models to learn ranking functions over candidate outputs, and were therefore unable to fine-tune rankers based on deep, pretrained language models. This paper leverages Bayesian deep learning (BDL) to adapt pretrained language models to highly specialised text ranking tasks, using BO to tune the model with a small number of pairwise preferences between candidate outputs. We apply our approach to community question answering (cQA) and extractive multi-document summarisation (MDS) with simulated noisy users, finding that our BDL approach significantly outperforms both a shallow Gaussian process model and traditional active learning with a standard deep neural network, while remaining robust to noise in the user feedback.
2024.uncertainlp-1.7
fang-etal-2024-efficiently
+
Order Effects in Annotation Tasks: Further Evidence of Annotation Sensitivity
@@ -116,6 +121,7 @@
The highest probability sequences of most neural language generation models tend to be degenerate in some way, a problem known as the inadequacy of the mode. While many approaches to tackling particular aspects of the problem exist, such as dealing with too short sequences or excessive repetitions, explanations of why it occurs in the first place are rarer and do not agree with each other. We believe none of the existing explanations paint a complete picture. In this position paper, we want to bring light to the incredible complexity of the modelling task and the problems that generalising to previously unseen contexts brings. We argue that our desire for models to generalise to contexts they have never observed before is exactly what leads to spread of probability mass and inadequate modes. While we do not claim that adequate modes are impossible, we argue that they are not to be expected either.
2024.uncertainlp-1.9
eikema-2024-effect
+
Uncertainty Resolution in Misinformation Detection
@@ -144,6 +150,7 @@
Researchers have raised awareness about the harms of aggregating labels, especially in subjective tasks that naturally contain disagreements among human annotators. In this work, we show that models provided with only aggregated labels show low confidence on high-disagreement data instances. While previous studies consider such instances as mislabeled, we argue that the reason the high-disagreement text instances have been hard to learn is that conventional aggregated models underperform in extracting useful signals from subjective tasks. Inspired by recent studies demonstrating the effectiveness of learning from raw annotations, we investigate classification using Multiple Ground Truth (Multi-GT) approaches. Our experiments show improved confidence on the high-disagreement instances.
2024.uncertainlp-1.11
anand-etal-2024-dont
+
Combining Confidence Elicitation and Sample-based Methods for Uncertainty Quantification in Misinformation Mitigation
diff --git a/data/xml/2024.unlp.xml b/data/xml/2024.unlp.xml
new file mode 100644
index 0000000000..a3612b371c
--- /dev/null
+++ b/data/xml/2024.unlp.xml
@@ -0,0 +1,187 @@
+
+
+
+
+ Proceedings of the Third Ukrainian Natural Language Processing Workshop (UNLP) @ LREC-COLING 2024
+ MarianaRomanyshyn
+ NataliiaRomanyshyn
+ AndriiHlybovets
+ OleksiiIgnatenko
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.unlp-1
+ unlp
+
+
+ 2024.unlp-1.0
+ unlp-2024-ukrainian
+
+
+ A Contemporary News Corpus of Ukrainian (CNC-UA): Compilation, Annotation, Publication
+ StefanFischer
+ KaterynaHaidarzhyi
+ JörgKnappen
+ OlhaPolishchuk
+ YuliyaStodolinska
+ ElkeTeich
+ 1–7
+ We present a corpus of contemporary Ukrainian news articles published between 2019 and 2022 on the news website of the national public broadcaster of Ukraine, commonly known as SUSPILNE. The current release comprises 87 210 364 words in 292 955 texts. Texts are annotated with titles and their time of publication. In addition, the corpus has been linguistically annotated at the token level with a dependency parser. To provide further aspects for investigation, a topic model was trained on the corpus. The corpus is hosted (Fischer et al., 2023) at the Saarbrücken CLARIN center under a CC BY-NC-ND 4.0 license and is available in two tab-separated formats: CoNLL-U (de Marneffe et al., 2021) and vertical text format (VRT) as used by the IMS Open Corpus Workbench (CWB; Evert and Hardie, 2011) and CQPweb (Hardie, 2012). We show examples of using the CQPweb interface, which allows users to extract the quantitative data necessary for distributional and collocation analyses of the CNC-UA. As the CNC-UA contains news texts documenting recent events, it is highly relevant not only for linguistic analyses of the modern Ukrainian language but also for socio-cultural and political studies.
+ 2024.unlp-1.1
+ fischer-etal-2024-contemporary
+
+
+ Introducing the Djinni Recruitment Dataset: A Corpus of Anonymized CVs and Job Postings
+ NazariiDrushchak
+ MarianaRomanyshyn
+ 8–13
+ This paper introduces the Djinni Recruitment Dataset, a large-scale open-source corpus of candidate profiles and job descriptions. With over 150,000 jobs and 230,000 candidates, the dataset includes samples in English and Ukrainian, thereby facilitating advancements in the recruitment domain of natural language processing (NLP) for both languages. It is one of the first open-source corpora in the recruitment domain, opening up new opportunities for AI-driven recruitment technologies and related fields. Notably, the dataset is accessible under the MIT license, encouraging widespread adoption for both scientific research and commercial projects.
+ 2024.unlp-1.2
+ drushchak-romanyshyn-2024-introducing
+
+
+ Creating Parallel Corpora for Ukrainian: A German-Ukrainian Parallel Corpus (ParaRook||DE-UK)
+ MariaShvedova
+ ArseniiLukashevskyi
+ 14–22
+ Parallel corpora are currently a popular and vibrantly developing category of linguistic resources, used both in literature and translation studies, as well as in the field of NLP. For Ukrainian, though, there are still not enough sizable parallel corpora compiled within a single project and made available to the research community. In this paper, we present a newly developed resource, the German-Ukrainian Parallel Corpus — ParaRook||DE-UK, searchable online. We describe various issues related to its compilation, text selection, and annotation. The paper also features several examples of how the corpus can be used in linguistic research and translation studies. Building on the experience of the German-Ukrainian parallel corpus, parallel corpora pairing Ukrainian with other languages can be developed.
+ 2024.unlp-1.3
+ shvedova-lukashevskyi-2024-creating
+
+
+ Introducing NER-UK 2.0: A Rich Corpus of Named Entities for Ukrainian
+ DmytroChaplynskyi
+ MarianaRomanyshyn
+ 23–29
+ This paper presents NER-UK 2.0, a corpus of texts in the Ukrainian language manually annotated for the named entity recognition task. The corpus contains 560 texts of multiple genres, boasting 21,993 entities in total. The annotation scheme covers 13 entity types, namely location, person name, organization, artifact, document, job title, date, time, period, money, percentage, quantity, and miscellaneous. Such a rich set of entities makes the corpus valuable for training named-entity recognition models in various domains, including news, social media posts, legal documents, and procurement contracts. The paper presents an updated baseline solution for named entity recognition in Ukrainian with 0.89 F1. The corpus is the largest of its kind for the Ukrainian language and is available for download.
+ 2024.unlp-1.4
+ chaplynskyi-romanyshyn-2024-introducing
+
+
+ Instant Messaging Platforms News Multi-Task Classification for Stance, Sentiment, and Discrimination Detection
+ TarasUstyianovych
+ DenilsonBarbosa
+ 30–40
+ In the digital age, geopolitical events frequently catalyze discussions among global web users. Platforms such as social networks and messaging applications serve as vital means for information spreading and acquisition. The Russian aggression against Ukraine has notably intensified online discourse on the matter, drawing a significant audience eager for real-time updates. This surge in online activity inevitably results in the proliferation of content, some of which may be unreliable or manipulative. Given this context, identifying content with information distortion is imperative to mitigate bias and promote fairness. However, this task presents considerable challenges, primarily due to the lack of sophisticated language models capable of understanding the nuances and context of texts in low-resource languages, and the scarcity of well-annotated datasets for training such models. To address these gaps, we introduce the TRWU dataset, a meticulously annotated collection of Telegram news about the Russian war in Ukraine gathered starting from January 1, 2022. This paper outlines our methodology for semantic analysis and classification of these messages, aiming to ascertain their bias. Such an approach enhances our ability to detect manipulative and destructive content. Through descriptive statistical analysis, we explore deviations in message sentiment, stance, and metadata across different types of channels and levels of content creation activity. Our findings indicate a predominance of negative sentiment within the dataset. Additionally, our research elucidates distinct differences in the linguistic choices and phraseology among channels, based on their stance towards the war. This study contributes to the broader effort of understanding the spread and mitigating the impact of biased and manipulative content in digital communications.
+ 2024.unlp-1.5
+ ustyianovych-barbosa-2024-instant
+
+
+ Setting up the Data Printer with Improved English to Ukrainian Machine Translation
+ YuriiPaniv
+ DmytroChaplynskyi
+ NikitaTrynus
+ VolodymyrKyrylov
+ 41–50
+ To build large language models for Ukrainian, we need to expand our corpora with large amounts of new algorithmic tasks expressed in natural language. Examples of task performance expressed in English are abundant, so a high-quality translation system would enable our community to curate datasets faster. To aid this goal, we introduce a recipe for building a translation system using supervised finetuning of a large pretrained language model with a noisy parallel dataset of 3M pairs of Ukrainian and English sentences, followed by a second phase of training using 17K examples selected by k-fold perplexity filtering on another dataset of higher quality. Our decoder-only model, named Dragoman, outperforms previous state-of-the-art encoder-decoder models on the FLORES devtest set.
+ 2024.unlp-1.6
+ paniv-etal-2024-setting
+
+
+ Automated Extraction of Hypo-Hypernym Relations for the Ukrainian WordNet
+ NataliiaRomanyshyn
+ DmytroChaplynskyi
+ MarianaRomanyshyn
+ 51–60
+ WordNet is a crucial resource in linguistics and natural language processing, providing a detailed and expansive set of lexico-semantic relationships among words in a language. The trend toward automated construction and expansion of WordNets has become increasingly popular due to the high costs of manual development. This study aims to automate the development of the Ukrainian WordNet, explicitly concentrating on hypo-hypernym relations that are crucial building blocks of the hierarchical structure of WordNet. Utilizing the linking between Princeton WordNet, Wikidata, and multilingual resources from Wikipedia, the proposed approach successfully mapped 17% of Princeton WordNet (PWN) content to Ukrainian Wikipedia. Furthermore, the study introduces three innovative strategies for generating new entries to fill in the gaps of the Ukrainian WordNet: machine translation, the Hypernym Discovery model, and the Hypernym Instruction-Following LLaMA model. The latter model shows a high level of effectiveness, evidenced by a 41.61% performance on the Mean Overlap Coefficient (MOC) metric. With the proposed approach that combines automated techniques with expert human input, we provide a reliable basis for creating the Ukrainian WordNet.
+ 2024.unlp-1.7
+ romanyshyn-etal-2024-automated
+
+
+ Ukrainian Visual Word Sense Disambiguation Benchmark
+ YuriiLaba
+ YarynaMohytych
+ IvannaRohulia
+ HalynaKyryleyza
+ HannaDydyk-Meush
+ OlesDobosevych
+ RostyslavHryniv
+ 61–66
+ This study presents a benchmark for evaluating the Visual Word Sense Disambiguation (Visual-WSD) task in Ukrainian. The main goal of the Visual-WSD task is to identify, with minimal contextual information, the most appropriate representation of a given ambiguous word from a set of ten images. To construct this benchmark, we followed a methodology similar to that proposed by (CITATION), who previously introduced benchmarks for the Visual-WSD task in English, Italian, and Farsi. This approach allows us to incorporate the Ukrainian benchmark into a broader framework for cross-language model performance comparisons. We collected the benchmark data semi-automatically and refined it with input from domain experts. We then assessed eight multilingual and multimodal large language models using this benchmark. All tested models performed worse than the zero-shot CLIP-based baseline model (CITATION) used by (CITATION) for the English Visual-WSD task. Our analysis revealed a significant performance gap in the Visual-WSD task between Ukrainian and English.
+ 2024.unlp-1.8
+ laba-etal-2024-ukrainian
+
+
+ The UNLP 2024 Shared Task on Fine-Tuning Large Language Models for Ukrainian
+ MarianaRomanyshyn
+ OleksiySyvokon
+ RomanKyslyi
+ 67–74
+ This paper presents the results of the UNLP 2024 shared task, the first Shared Task on Fine-Tuning Large Language Models for the Ukrainian language. The goal of the task was to facilitate the creation of models that have knowledge of the Ukrainian language, history, and culture, as well as common knowledge, and are capable of generating fluent and accurate responses in Ukrainian. The participants were required to use models with open weights and reasonable size to ensure the reproducibility of the solutions. The participating systems were evaluated using multiple-choice exam questions and manually crafted open questions. Three teams submitted their solutions before the deadline, and two teams submitted papers that were accepted to appear in the UNLP workshop proceedings and are referred to in this report. The Codabench leaderboard is left open for further submissions.
+ 2024.unlp-1.9
+ romanyshyn-etal-2024-unlp
+
+
+ Fine-Tuning and Retrieval Augmented Generation for Question Answering Using Affordable Large Language Models
+ TiberiuBoros
+ RaduChivereanu
+ StefanDumitrescu
+ OctavianPurcaru
+ 75–82
+ We present Sherlock, our system submitted to the UNLP 2024 Shared Task on Question Answering, which won first place. We employ a mix of methods, from using automatically translated datasets for supervised fine-tuning and direct preference optimization on instruction-tuned models, to model weight merging and retrieval augmented generation. We present and motivate our chosen sequence of steps, as well as an ablation study to understand the effect of each additional step. The resulting model and code are made publicly available (download links are provided in the paper).
+ 2024.unlp-1.10
+ boros-etal-2024-fine
+
+
+ From Bytes to Borsch: Fine-Tuning Gemma and Mistral for the Ukrainian Language Representation
+ ArturKiulian
+ AntonPolishko
+ MykolaKhandoga
+ OrynaChubych
+ JackConnor
+ RaghavRavishankar
+ AdarshShirawalmath
+ 83–94
+ In the rapidly advancing field of AI and NLP, generative large language models (LLMs) stand at the forefront of innovation, showcasing unparalleled abilities in text understanding and generation. However, the limited representation of low-resource languages like Ukrainian poses a notable challenge, restricting the reach and relevance of this technology. Our paper addresses this by fine-tuning the open-source Gemma and Mistral LLMs with Ukrainian datasets, aiming to improve their linguistic proficiency and benchmarking them against other existing models capable of processing Ukrainian language. This endeavor not only aims to mitigate language bias in technology but also promotes inclusivity in the digital realm. Our transparent and reproducible approach encourages further NLP research and development. Additionally, we present the Ukrainian Knowledge and Instruction Dataset (UKID) to aid future efforts in language model fine-tuning. Our research not only advances the field of NLP but also highlights the importance of linguistic diversity in AI, which is crucial for cultural preservation, education, and expanding AI’s global utility. Ultimately, we advocate for a future where technology is inclusive, enabling AI to communicate effectively across all languages, especially those currently underrepresented.
+ 2024.unlp-1.11
+ kiulian-etal-2024-bytes
+
+
+ Spivavtor: An Instruction Tuned Ukrainian Text Editing Model
+ AmanSaini
+ ArtemChernodub
+ VipulRaheja
+ VivekKulkarni
+ 95–108
+ We introduce Spivavtor, a dataset, and instruction-tuned models for text editing focused on the Ukrainian language. Spivavtor is the Ukrainian-focused adaptation of the English-only CoEdIT (Raheja et al., 2023) model. Similar to CoEdIT, Spivavtor performs text editing tasks by following instructions in Ukrainian like “Виправте граматику в цьому реченнi” and “Спростiть це речення” which translate to “Correct the grammar in this sentence” and “Simplify this sentence” in English, respectively. This paper describes the details of the Spivavtor-Instruct dataset and Spivavtor models. We evaluate Spivavtor on a variety of text editing tasks in Ukrainian, such as Grammatical Error Correction (GEC), Text Simplification, Coherence, and Paraphrasing, and demonstrate its superior performance on all of them. We publicly release our best performing models and data as resources to the community to advance further research in this space.
+ 2024.unlp-1.12
+ saini-etal-2024-spivavtor
+
+
+ Eval-UA-tion 1.0: Benchmark for Evaluating Ukrainian (Large) Language Models
+ SerhiiHamotskyi
+ Anna-IzabellaLevbarg
+ ChristianHänig
+ 109–119
+ In this paper, we introduce Eval-UA-tion, a set of novel Ukrainian-language datasets aimed at evaluating the performance of language models on the Ukrainian language. The tasks include UA-CBT (inspired by the Children’s Book Test, a fill-in-the-gaps task aimed at gauging the extent to which a story narrative is understood), UP-Titles (where articles from the online newspaper Ukrainska Pravda have to be matched to the correct title among 10 similar ones), and LMentry-static-UA/LMES (inspired by the LMentry benchmark, a set of tasks simple to solve for humans but hard for LMs, such as ‘which of these words is longer’ and ‘what is the fifth word of this sentence’). With the exception of UP-Titles, the tasks are built in a way to minimize contamination and use material unlikely to be present in the training sets of language models, and include a split for few-shot model prompting that further minimizes contamination. For each task, human and random baselines are provided.
+ 2024.unlp-1.13
+ hamotskyi-etal-2024-eval
+
+
+ LiBERTa: Advancing Ukrainian Language Modeling through Pre-training from Scratch
+ MykolaHaltiuk
+ AleksanderSmywiński-Pohl
+ 120–128
+ Recent advancements in Natural Language Processing (NLP) have spurred remarkable progress in language modeling, predominantly benefiting English. While Ukrainian NLP has long grappled with significant challenges due to limited data and computational resources, recent years have seen a shift with the emergence of new corpora, marking a pivotal moment in addressing these obstacles. This paper introduces LiBERTa Large, the inaugural BERT Large model pre-trained entirely from scratch only on Ukrainian texts. Leveraging extensive multilingual text corpora, including a substantial Ukrainian subset, LiBERTa Large establishes a foundational resource for Ukrainian NLU tasks. Our model outperforms existing multilingual and monolingual models pre-trained from scratch for Ukrainian, demonstrating competitive performance against those relying on cross-lingual transfer from English. This result underscores the ability to attain superior performance through pre-training from scratch with additional enhancements, obviating the need to rely on decisions made for English models in order to transfer weights efficiently. We establish LiBERTa Large as a robust baseline, paving the way for future advancements in Ukrainian language modeling.
+ 2024.unlp-1.14
+ haltiuk-smywinski-pohl-2024-liberta
+
+
+ Entity Embellishment Mitigation in LLMs Output with Noisy Synthetic Dataset for Alignment
+ SvitlanaGaleshchuk
+ 129–134
+ The present work focuses on entity embellishments, i.e., cases when named entities are accompanied by additional information that is not supported by the context or the source material. Our paper contributes to mitigating this problem in texts generated by large language models, summaries in particular, by proposing an approach that injects synthetic noise into generated samples, which are then used to align the finetuned LLM. We also address the scarcity of solutions for low-resourced languages and test our approach on corpora in Ukrainian.
+ 2024.unlp-1.15
+ galeshchuk-2024-entity
+
+
+ Language-Specific Pruning for Efficient Reduction of Large Language Models
+ MaksymShamrai
+ 135–140
+ Delving into pruning techniques is essential to boost the efficiency of Large Language Models (LLMs) by reducing their size and computational demands, resulting in faster and more cost-effective inference. In this work, our key contribution lies in recognizing that LLMs trained on diverse languages manifest distinct language-specific weight distributions. Exploiting this insight, we illustrate that pruning LLMs using language-specific data results in a more potent model compression. Empirical evidence underscores the critical nature of pruning on language-specific data, highlighting a noteworthy impact on the perplexity of Ukrainian texts compared to pruning on English data. The proposed methodology significantly reduces the size of LLaMA, LLaMA 2 and Mistral models while preserving competitive performance. This research underscores the significance of linguistic considerations in LLM pruning and advocates for language-specific optimization, establishing a framework for more efficient and tailored language models across diverse linguistic contexts. Additionally, all experiments were conducted using a single consumer-grade NVIDIA RTX 3090 GPU, and the code is available at https://github.com/mshamrai/language-specific-pruning.
+ 2024.unlp-1.16
+ shamrai-2024-language
+
+
+
diff --git a/data/xml/2024.wildre.xml b/data/xml/2024.wildre.xml
new file mode 100644
index 0000000000..953ea03616
--- /dev/null
+++ b/data/xml/2024.wildre.xml
@@ -0,0 +1,146 @@
+
+
+
+
+ Proceedings of the 7th Workshop on Indian Language Data: Resources and Evaluation
+ Girish NathJha
+ SobhaL.
+ KalikaBali
+ Atul Kr.Ojha
+ ELRA and ICCL
+ Torino, Italia
+ May
+ 2024
+ 2024.wildre-1
+ wildre
+ ws
+
+
+ 2024.wildre-1.0
+ wildre-2024-indian
+
+
+ Towards Disfluency Annotated Corpora for Indian Languages
+ ChayanKochar
+ Vandan VasantlalMujadia
+ PruthwikMishra
+ Dipti MisraSharma
+ 1–10
+ In the natural course of spoken language, individuals often engage in thinking and self-correction during speech production. These instances of interruption or correction are commonly referred to as disfluencies. When preparing data for downstream NLP tasks, these linguistic elements can be systematically removed, or handled as required, to enhance data quality. In this study, we present comprehensive research on disfluencies in Indian languages. Our approach involves not only annotating real-world conversation transcripts but also conducting a detailed analysis of linguistic nuances inherent to Indian languages that must be considered during annotation. Additionally, we introduce a robust algorithm for the synthetic generation of disfluent data. This algorithm aims to facilitate more effective model training for the identification of disfluencies in real-world conversations, thereby contributing to the advancement of disfluency research in Indian languages.
+ 2024.wildre-1.1
+ kochar-etal-2024-towards
+
+
+ EmoMix-3L: A Code-Mixed Dataset for Bangla-English-Hindi for Emotion Detection
+ NishatRaihan
+ DhimanGoswami
+ AntaraMahmud
+ AntoniosAnastasopoulos
+ MarcosZampieri
+ 11–16
+ Code-mixing is a well-studied linguistic phenomenon that occurs when two or more languages are mixed in text or speech. Several studies have been conducted on building datasets and performing downstream NLP tasks on code-mixed data. Although it is not uncommon to observe code-mixing of three or more languages, most available datasets in this domain contain code-mixed data from only two languages. In this paper, we introduce EmoMix-3L, a novel multi-label emotion detection dataset containing code-mixed data from three different languages. We experiment with several models on EmoMix-3L and we report that MuRIL outperforms other models on this dataset.
+ 2024.wildre-1.2
+ raihan-etal-2024-emomix
+
+
+ Findings of the WILDRE Shared Task on Code-mixed Less-resourced Sentiment Analysis for Indo-Aryan Languages
+ PriyaRani
+ GauravNegi
+ SarojJha
+ ShardulSuryawanshi
+ Atul Kr.Ojha
+ PaulBuitelaar
+ John P.McCrae
+ 17–23
+ This paper describes the structure and findings of the WILDRE 2024 shared task on Code-mixed Less-resourced Sentiment Analysis for Indo-Aryan Languages. The participants were asked to submit their final predictions on the test data on CodaLab. A total of fourteen teams registered for the shared task. Only four participants submitted systems for evaluation on CodaLab, and only two teams submitted system description papers. All systems show rather promising performance and outperform the baseline scores.
+ 2024.wildre-1.3
+ rani-etal-2024-findings
+
+
+ Multilingual Bias Detection and Mitigation for Indian Languages
+ AnkitaMaity
+ AnubhavSharma
+ RudraDhar
+ TusharAbhishek
+ ManishGupta
+ VasudevaVarma
+ 24–29
+ A lack of diverse perspectives causes neutrality bias in Wikipedia content, leading to millions of readers worldwide being exposed to potentially inaccurate information. Hence, neutrality bias detection and mitigation is a critical problem. Although previous studies have proposed effective solutions for English, no work exists for Indian languages. First, we contribute two large datasets, mWIKIBIAS and mWNC, covering 8 languages, for the bias detection and mitigation tasks, respectively. Next, we investigate the effectiveness of popular multilingual Transformer-based models for the two tasks by modeling detection as a binary classification problem and mitigation as a style transfer problem. We make the code and data publicly available.
+ 2024.wildre-1.4
+ maity-etal-2024-multilingual
+
+
+ Dharmaśāstra Informatics: Concept Mining System for Socio-Cultural Facet in Ancient India
+ ArooshiNigam
+ SubhashChandra
+ 30–39
+ The heritage of Dharmaśāstra (DS) represents an extensive cultural legacy, spanning diverse fields such as family law, social ethics, culture and economics. In this paper, a new term, “Dharmaśāstric Informatics,” is proposed, which leverages computational methods for concept mining to unravel the socio-cultural complexities of ancient India as reflected in the DS. Despite its profound significance, the digitization and online information retrieval of DS texts encounter notable challenges. Therefore, the primary aim of this paper is to synergize digital accessibility and information mining techniques to enhance access to DS knowledge traditions. Through the utilization of heritage computing methodologies, we endeavour to develop a robust system for digitizing DS texts comprehensively, facilitating instant referencing and efficient retrieval, catering to the needs of researchers and scholars across disciplines worldwide. By leveraging advanced digital technologies and the burgeoning IT landscape, we seek to create a seamless and user-friendly platform for accessing and exploring DS texts. This experiment not only promotes scholarly engagement but also serves as an invaluable resource for individuals interested in delving into the intricate realms of archaic Indian knowledge traditions. Ultimately, our efforts aim to amplify the visibility and accessibility of DS knowledge, fostering a deeper understanding and appreciation of this profound cultural heritage.
+ 2024.wildre-1.5
+ nigam-chandra-2024-dharmasastra
+
+
+ Exploring News Summarization and Enrichment in a Highly Resource-Scarce Indian Language: A Case Study of Mizo
+ AbhinabaBala
+ AshokUrlana
+ RahulMishra
+ ParameswariKrishnamurthy
+ 40–46
+ Obtaining sufficient information in one’s mother tongue is crucial for satisfying the information needs of the users. While high-resource languages have abundant online resources, the situation is less than ideal for very low-resource languages. Moreover, the insufficient reporting of vital national and international events continues to be a worry, especially in languages with scarce resources, like Mizo. In this paper, we conduct a study to investigate the effectiveness of a simple methodology designed to generate a holistic summary for Mizo news articles, which leverages English-language news to supplement and enhance the information related to the corresponding news events. Furthermore, we make available 500 Mizo news articles and corresponding enriched holistic summaries. Human evaluation confirms that our approach significantly enhances the information coverage of Mizo news articles.
+ 2024.wildre-1.6
+ bala-etal-2024-exploring
+
+
+ Finding the Causality of an Event in News Articles
+ SobhaLalitha Devi
+ PattabhiRK Rao
+ 47–53
+ This paper discusses finding the causality of an event in newspaper articles. The analysis of causality, otherwise known as cause and effect, is crucial for building efficient Natural Language Understanding (NLU)-supported AI systems such as event tracking, and it is considered a complex semantic relation in discourse theory. A cause-effect relation consists of a linguistic marker and its two arguments. The arguments are semantic arguments, where the cause is the first argument (Arg1) and the effect is the second argument (Arg2). In this work, we have considered the causal relations in Tamil newspaper articles. The analysis of causal constructions, the causal markers and their syntactic relations led to the identification of different features for developing a language model using RBMs (Restricted Boltzmann Machines). The experiments we performed have given encouraging results. The cause-effect system developed is used in a mobile app for event profiling called “Nigalazhvi”, where the cause and effect of an event are identified and given to the user.
+ 2024.wildre-1.7
+ lalitha-devi-rk-rao-2024-finding
+
+
+ Creating Corpus of Low Resource Indian Languages for Natural Language Processing: Challenges and Opportunities
+ PratibhaDongare
+ 54–58
+ Addressing tasks in Natural Language Processing requires access to sufficient and high-quality data. However, working with languages that have limited resources poses a significant challenge due to the absence of established methodologies, frameworks, and collaborative efforts. This paper intends to briefly outline the challenges associated with standardization in data creation, focusing on Indian languages, which are often categorized as low resource languages. Additionally, potential solutions and the importance of standardized procedures for low-resource language data are proposed. Furthermore, the critical role of standardized protocols in corpus creation and their impact on research is highlighted. Lastly, this paper concludes by defining what constitutes a corpus.
+ 2024.wildre-1.8
+ dongare-2024-creating
+
+
+ FZZG at WILDRE-7: Fine-tuning Pre-trained Models for Code-mixed, Less-resourced Sentiment Analysis
+ GaurishThakkar
+ MarkoTadić
+ NivesMikelic Preradovic
+ 59–65
+ This paper describes our system for the shared task on code-mixed, less-resourced sentiment analysis for Indo-Aryan languages. We use large language models (LLMs), since they have demonstrated excellent performance on classification tasks. In all tracks, we use the unsloth/mistral-7b-bnb-4bit LLM for code-mixed sentiment analysis. For track 1, we used a simple fine-tuning strategy on PLMs by combining data from multiple phases. Our trained systems secured first place in four out of five phases. In addition, we present the results achieved using several PLMs for each language.
+ 2024.wildre-1.9
+ thakkar-etal-2024-fzzg
+
+
+ MLInitiative@WILDRE7: Hybrid Approaches with Large Language Models for Enhanced Sentiment Analysis in Code-Switched and Code-Mixed Texts
+ HariramVeeramani
+ SurendrabikramThapa
+ UsmanNaseem
+ 66–72
+ Code-switched and code-mixed languages are prevalent in multilingual societies, reflecting the complex interplay of cultures and languages in daily communication. Understanding the sentiment embedded in such texts is crucial for a range of applications, from improving social media analytics to enhancing customer feedback systems. Despite their significance, research in code-mixed and code-switched languages remains limited, particularly in less-resourced languages. This scarcity of research creates a gap in natural language processing (NLP) technologies, hindering their ability to accurately interpret the rich linguistic diversity of global communications. To bridge this gap, this paper presents a novel methodology for sentiment analysis in code-mixed and code-switched texts. Our approach combines the power of large language models (LLMs) and the versatility of the multilingual BERT (mBERT) framework to effectively process and analyze sentiments in multilingual data. By decomposing code-mixed texts into their constituent languages, employing mBERT for named entity recognition (NER) and sentiment label prediction, and integrating these insights into a decision-making LLM, we provide a comprehensive framework for understanding sentiment in complex linguistic contexts. Our system achieves competitive ranks on all subtasks of the Code-mixed Less-Resourced Sentiment Analysis (Code-mixed) shared task at WILDRE-7 (LREC-COLING).
+ 2024.wildre-1.10
+ veeramani-etal-2024-mlinitiative
+
+
+ Aalamaram: A Large-Scale Linguistically Annotated Treebank for the Tamil Language
+ A MAbirami
+ Wei QiLeong
+ HamsawardhiniRengarajan
+ DAnitha
+ RSuganya
+ HimanshuSingh
+ KengatharaiyerSarveswaran
+ William ChandraTjhi
+ Rajiv RatnShah
+ 73–83
+ Tamil is a relatively low-resource language in the field of Natural Language Processing (NLP). Recent years have seen a growth in Tamil NLP datasets in Natural Language Understanding (NLU) or Natural Language Generation (NLG) tasks, but high-quality linguistic resources remain scarce. In order to alleviate this gap in resources, this paper introduces Aalamaram, a treebank with rich linguistic annotations for the Tamil language. It is hitherto the largest publicly available Tamil treebank with almost 10,000 sentences from diverse sources and is annotated for the tasks of Part-of-speech (POS) tagging, Named Entity Recognition (NER), Morphological Parsing and Dependency Parsing. Close attention has also been paid to multi-word segmentation, especially in the context of Tamil clitics. Although the treebank is based largely on the Universal Dependencies (UD) specifications, significant effort has been made to adjust the annotation rules according to the idiosyncrasies and complexities of the Tamil language, thereby providing a valuable resource for linguistic research and NLP developments.
+ 2024.wildre-1.11
+ abirami-etal-2024-aalamaram
+
+
+
diff --git a/data/xml/2024.wnut.xml b/data/xml/2024.wnut.xml
index 4eaf8a757c..817ef6983f 100644
--- a/data/xml/2024.wnut.xml
+++ b/data/xml/2024.wnut.xml
@@ -48,6 +48,7 @@
This paper investigates the effects of noisy source texts (containing spelling and grammar errors, informal words or expressions, etc.) on human and machine translations, namely whether the noisy phenomena are kept in the translations, corrected, or caused errors. The analysed data consists of English user reviews of Amazon products translated into Croatian, Russian and Finnish by professional translators, translation students, machine translation (MT) systems, and the ChatGPT language model. The results show that overall, ChatGPT and professional translators mostly correct/standardise those parts, while students often keep them. Furthermore, MT systems are most prone to errors while ChatGPT is more robust, but notably less robust than human translators. Finally, some of the phenomena are particularly challenging both for MT systems and for ChatGPT, especially spelling errors and informal constructions.
2024.wnut-1.3
popovic-etal-2024-effects
+
Stanceosaurus 2.0 - Classifying Stance Towards Russian and Spanish Misinformation
@@ -61,6 +62,7 @@
The Stanceosaurus corpus (Zheng et al., 2022) was designed to provide high-quality, annotated, 5-way stance data extracted from Twitter, suitable for analyzing cross-cultural and cross-lingual misinformation. In the Stanceosaurus 2.0 iteration, we extend this framework to encompass Russian and Spanish. The former is of current significance due to prevalent misinformation amid escalating tensions with the West and the violent incursion into Ukraine. The latter, meanwhile, represents an enormous community that has been largely overlooked on major social media platforms. By incorporating an additional 3,874 Spanish and Russian tweets over 41 misinformation claims, our objective is to support research focused on these issues. To demonstrate the value of this data, we employed zero-shot cross-lingual transfer on multilingual BERT, yielding results on par with the initial Stanceosaurus study with a macro F1 score of 43 for both languages. This underlines the viability of stance classification as an effective tool for identifying multicultural misinformation.
2024.wnut-1.4
lavrouk-etal-2024-stanceosaurus
+
A Comparative Analysis of Noise Reduction Methods in Sentiment Analysis on Noisy Bangla Texts
@@ -74,6 +76,7 @@
While Bangla is considered a language with limited resources, sentiment analysis has been a subject of extensive research in the literature. Nevertheless, there is a scarcity of exploration into sentiment analysis specifically in the realm of noisy Bangla texts. In this paper, we introduce a dataset (NC-SentNoB) that we annotated manually to identify ten different types of noise found in a pre-existing sentiment analysis dataset comprising of around 15K noisy Bangla texts. At first, given an input noisy text, we identify the noise type, addressing this as a multi-label classification task. Then, we introduce baseline noise reduction methods to alleviate noise prior to conducting sentiment analysis. Finally, we assess the performance of fine-tuned sentiment analysis models with both noisy and noise-reduced texts to make comparisons. The experimental findings indicate that the noise reduction methods utilized are not satisfactory, highlighting the need for more suitable noise reduction methods in future research endeavors. We have made the implementation and dataset presented in this paper publicly available at https://github.com/ktoufiquee/A-Comparative-Analysis-of-Noise-Reduction-Methods-in-Sentiment-Analysis-on-Noisy-Bangla-Texts
2024.wnut-1.5
elahi-etal-2024-comparative
+
Label Supervised Contrastive Learning for Imbalanced Text Classification in Euclidean and Hyperbolic Embedding Spaces
@@ -95,6 +98,7 @@
Maintenance short texts are invaluable unstructured data sources, serving as a diagnostic and prognostic window into the operational health and status of physical assets. These user-generated texts, created during routine or ad-hoc maintenance activities, offer insights into equipment performance, potential failure points, and maintenance needs. However, the use of information captured in these texts is hindered by inherent challenges: the prevalence of engineering jargon, domain-specific vernacular, random spelling errors without identifiable patterns, and the absence of standard grammatical structures. To transform these texts into accessible and analysable data, we introduce the MaintNorm dataset, the first resource specifically tailored for the lexical normalisation task of maintenance short texts. Comprising 12,000 examples, this dataset enables the efficient processing and interpretation of these texts. We demonstrate the utility of MaintNorm by training a lexical normalisation model as a sequence-to-sequence learning task with two learning objectives, namely, enhancing the quality of the texts and masking segments to obscure sensitive information to anonymise data. Our benchmark model demonstrates a universal error reduction rate of 95.8%. The dataset and benchmark outcomes are available to the public.
2024.wnut-1.7
bikaun-etal-2024-maintnorm
+
The Effects of Data Quality on Named Entity Recognition
@@ -114,6 +118,7 @@
Emotion corpora are typically sampled based on keyword/hashtag search or by asking study participants to generate textual instances. In any case, these corpora are not uniform samples representing the entirety of a domain. We hypothesize that this practice of data acquisition leads to unrealistic correlations between overrepresented topics in these corpora that harm the generalizability of models. Such topic bias could lead to wrong predictions for instances like “I organized the service for my aunt’s funeral.” when funeral events are overrepresented for instances labeled with sadness, despite the emotion of pride being more appropriate here. In this paper, we study this topic bias both from the data and the modeling perspective. We first label a set of emotion corpora automatically via topic modeling and show that emotions in fact correlate with specific topics. Further, we see that emotion classifiers are confounded by such topics. Finally, we show that the established debiasing method of adversarial correction via gradient reversal mitigates the issue. Our work points out issues with existing emotion corpora and that more representative resources are required for fair evaluation of models predicting affective concepts from text.
2024.wnut-1.9
wegge-klinger-2024-topic
+
Stars Are All You Need: A Distantly Supervised Pyramid Network for Unified Sentiment Analysis
diff --git a/data/xml/D15.xml b/data/xml/D15.xml
index 3ed7b52f46..b5f699e4b7 100644
--- a/data/xml/D15.xml
+++ b/data/xml/D15.xml
@@ -1,3 +1,4 @@
+
@@ -202,6 +203,7 @@
D15-1017
10.18653/v1/D15-1017
jayanth-etal-2015-monotone
+ jayaprakash-sundararaj/iitb-code
Joint Prediction for Entity/Event-Level Sentiment Analysis using Probabilistic Soft Logic Models
diff --git a/data/xml/D16.xml b/data/xml/D16.xml
index 429368fc94..19778fd29a 100644
--- a/data/xml/D16.xml
+++ b/data/xml/D16.xml
@@ -2270,7 +2270,7 @@
10.18653/v1/D16-1193
taghipour-ng-2016-neural
nusnlp/nea
- ASAP (Aligned Scores and Performances)
+ ASAP
Non-uniform Language Detection in Technical Writing
diff --git a/data/xml/D18.xml b/data/xml/D18.xml
index 54ddd975fd..341f7d54a3 100644
--- a/data/xml/D18.xml
+++ b/data/xml/D18.xml
@@ -1,3 +1,4 @@
+
@@ -6050,6 +6051,7 @@
Despite continuously improving performance, contemporary image captioning models are prone to “hallucinating” objects that are not actually in a scene. One problem is that standard metrics only measure similarity to ground truth captions and may not fully capture image relevance. In this work, we propose a new image relevance metric to evaluate current models with veridical visual labels and assess their rate of object hallucination. We analyze how captioning model architectures and learning objectives contribute to object hallucination, explore when hallucination is likely due to image misclassification or language priors, and assess how well current sentence metrics capture object hallucination. We investigate these questions on the standard image captioning benchmark, MSCOCO, using a diverse set of models. Our analysis yields several interesting findings, including that models which score best on standard sentence metrics do not always have lower hallucination and that models which hallucinate more tend to make errors driven by language priors.
10.18653/v1/D18-1437
rohrbach-etal-2018-object
+ Object HalBench
MS COCO
diff --git a/data/xml/D19.xml b/data/xml/D19.xml
index b46b7d16cb..659737235d 100644
--- a/data/xml/D19.xml
+++ b/data/xml/D19.xml
@@ -3498,7 +3498,6 @@
SQuAD
T-REx
WikiText-103
- WikiText-2
NumNet: Machine Reading Comprehension with Numerical Reasoning
@@ -5125,7 +5124,6 @@
10.18653/v1/D19-1369
wang-2019-single
WikiText-103
- WikiText-2
A Surprisingly Effective Fix for Deep Latent Variable Modeling of Text
@@ -8746,7 +8744,7 @@
SoumyaSharma
BishalSantra
AbhikJana
- SantoshTokala
+ SantoshT.y.s.s
NiloyGanguly
PawanGoyal
6092–6097
diff --git a/data/xml/J16.xml b/data/xml/J16.xml
index 0fb448bfd6..7a729aab99 100644
--- a/data/xml/J16.xml
+++ b/data/xml/J16.xml
@@ -212,7 +212,7 @@
bos-2016-squib
- Survey: Computational Sociolinguistics: A Survey
+ Computational Sociolinguistics: A Survey
DongNguyen
A. SezaDoğruöz
Carolyn P.Rosé
diff --git a/data/xml/L18.xml b/data/xml/L18.xml
index ee930e0805..47ed55a740 100644
--- a/data/xml/L18.xml
+++ b/data/xml/L18.xml
@@ -6519,6 +6519,7 @@
SumeetAgarwal
L18-1712
bhardwaj-etal-2018-sandhikosh
+ Sanskrit Sandhi Kosh
Czech Legal Text Treebank 2.0
diff --git a/data/xml/N19.xml b/data/xml/N19.xml
index f11b5c465e..9267c39086 100644
--- a/data/xml/N19.xml
+++ b/data/xml/N19.xml
@@ -3114,7 +3114,7 @@
AttentiveChecker: A Bi-Directional Attention Flow Mechanism for Fact Verification
- SantoshTokala
+ SantoshT.y.s.s
VishalG
AvirupSaha
NiloyGanguly
diff --git a/data/xml/P18.xml b/data/xml/P18.xml
index 2612eb12fd..5acf8c7a93 100644
--- a/data/xml/P18.xml
+++ b/data/xml/P18.xml
@@ -1,3 +1,4 @@
+
@@ -504,7 +505,6 @@
DBpedia
IMDb Movie Reviews
WikiText-103
- WikiText-2
Yelp
@@ -4480,7 +4480,6 @@
gadetsky-etal-2018-conditional
agadetsky/pytorch-definitions
WikiText-103
- WikiText-2
CNN for Text-Based Multiple Choice Question Answering
@@ -5644,7 +5643,6 @@
fernandez-downey-2018-sampling
Billion Word Benchmark
WikiText-103
- WikiText-2
Learning-based Composite Metrics for Improved Caption Evaluation
diff --git a/data/xml/P19.xml b/data/xml/P19.xml
index da6b4c3863..72bb1f46fd 100644
--- a/data/xml/P19.xml
+++ b/data/xml/P19.xml
@@ -4070,7 +4070,6 @@
Penn Treebank
Text8
WikiText-103
- WikiText-2
Domain Adaptation of Neural Machine Translation by Lexicon Induction
@@ -5675,7 +5674,6 @@
hu-etal-2019-shot
acbull/HiCE
WikiText-103
- WikiText-2
Neural Temporality Adaptation for Document Classification: Diachronic Word Embeddings and Domain Adaptation Models
@@ -9035,7 +9033,6 @@
awslabs/w-lda
AG News
WikiText-103
- WikiText-2
Yelp Review Polarity
diff --git a/data/xml/Q17.xml b/data/xml/Q17.xml
index 83bd877dcf..417544594d 100644
--- a/data/xml/Q17.xml
+++ b/data/xml/Q17.xml
@@ -1,3 +1,4 @@
+
@@ -335,7 +336,6 @@
Q17-1024
johnson-etal-2017-googles
-
Unsupervised Learning of Morphological Forests
diff --git a/data/xml/S19.xml b/data/xml/S19.xml
index f69c952fb3..ad498cb356 100644
--- a/data/xml/S19.xml
+++ b/data/xml/S19.xml
@@ -330,7 +330,6 @@
GLUE
MultiNLI
WikiText-103
- WikiText-2
HELP: A Dataset for Identifying Shortcomings of Neural Models in Monotonicity Reasoning
diff --git a/data/xml/W18.xml b/data/xml/W18.xml
index f48bf7e580..460892fbf8 100644
--- a/data/xml/W18.xml
+++ b/data/xml/W18.xml
@@ -1,3 +1,4 @@
+
@@ -11525,7 +11526,6 @@
joshi-etal-2018-shot
IMDb Movie Reviews
WikiText-103
- WikiText-2
Neural DrugNet
@@ -11583,6 +11583,7 @@
Deep Learning for Social Media Health Text Classification
+ SantoshT.y.s.s
SantoshTokala
VaibhavGambhir
AnimeshMukherjee
diff --git a/data/xml/W19.xml b/data/xml/W19.xml
index c20215dac1..c06f5b25a7 100644
--- a/data/xml/W19.xml
+++ b/data/xml/W19.xml
@@ -2947,7 +2947,6 @@
kyunghyuncho/bert-gen
BookCorpus
WikiText-103
- WikiText-2
Neural Text Simplification in Low-Resource Conditions Using Weak Supervision
@@ -8447,7 +8446,6 @@ One of the references was wrong therefore it is corrected to cite the appropriat
kakaobrain/helo_word
WI-LOCNESS
WikiText-103
- WikiText-2
Neural and FST-based approaches to grammatical error correction
@@ -14140,6 +14138,7 @@ One of the references was wrong therefore it is corrected to cite the appropriat
W19-5945
10.18653/v1/W19-5945
keizer-etal-2019-user
+ skeizer/madrigal
Dialogue Act Classification in Team Communication for Robot Assisted Disaster Response
diff --git a/data/yaml/name_variants.yaml b/data/yaml/name_variants.yaml
index 54c6668b7b..078d3f5fd3 100644
--- a/data/yaml/name_variants.yaml
+++ b/data/yaml/name_variants.yaml
@@ -5,6 +5,9 @@
variants:
- {first: Leon, last: Dostert}
- {first: L. E., last: Dostert}
+- canonical: {first: Maria, last: Berger}
+ variants:
+ - {first: Maria, last: Moritz}
- canonical: {first: Pranav, last: A}
comment: UC Santa Cruz
id: pranav-a
@@ -10594,3 +10597,22 @@
- canonical: {first: Katharina, last: von der Wense}
variants:
- {first: Katharina, last: Kann}
+- canonical: {first: Maxwell, last: Weinzierl}
+ variants:
+ - {first: Maxwell A., last: Weinzierl}
+- canonical: {first: Yifan, last: Peng}
+ comment: cmu
+ id: yifan-peng-cmu
+- canonical: {first: Kexin, last: Wang}
+ comment: Bytedance
+ id: kexin-wang-bd
+- canonical: {first: Weiwei, last: Sun}
+ comment: Shandong University
+ id: weiwei-sun-sd
+- canonical: {first: Chao, last: Zhang}
+ comment: Tsinghua University
+ id: chao-zhang-tu
+- canonical: {first: Mathew, last: Huerta-Enochian}
+ id: mathew-huerta-enochian
+ variants:
+ - {first: Mathew John, last: Huerta-Enochian}
diff --git a/data/yaml/sigs/siglex.yaml b/data/yaml/sigs/siglex.yaml
index 7bcd2ddf21..d8babc94dd 100644
--- a/data/yaml/sigs/siglex.yaml
+++ b/data/yaml/sigs/siglex.yaml
@@ -2,6 +2,8 @@ Name: Special Interest Group on the Lexicon (SIGLEX)
ShortName: SIGLEX
URL: http://www.siglex.org/
Meetings:
+ - 2024:
+ - 2024.mwe-1
- 2023:
- 2023.mwe-1 # Proceedings of the 19th Workshop on Multiword Expressions (MWE 2023)
- 2023.semeval-1 # Proceedings of the 17th International Workshop on Semantic Evaluation (SemEval-2023)
diff --git a/data/yaml/sigs/sigparse.yaml b/data/yaml/sigs/sigparse.yaml
index 70a8110ea8..f07ea894b4 100644
--- a/data/yaml/sigs/sigparse.yaml
+++ b/data/yaml/sigs/sigparse.yaml
@@ -2,6 +2,8 @@ Name: Special Interest Group on Natural Language Parsing (SIGPARSE)
ShortName: SIGPARSE
URL: http://www.cs.cmu.edu/~sigparse
Meetings:
+ - 2024:
+ - 2024.mwe-1
- 2023:
- 2023.depling-1
- 2023.cxgsnlp-1
diff --git a/data/yaml/sigs/sigwrit.yaml b/data/yaml/sigs/sigwrit.yaml
new file mode 100644
index 0000000000..a34c3b7496
--- /dev/null
+++ b/data/yaml/sigs/sigwrit.yaml
@@ -0,0 +1,6 @@
+Name: Special Interest Group on Writing Systems and Written Language (SIGWrit)
+ShortName: SIGWrit
+URL: https://sigwrit.org/
+Meetings:
+ - 2024:
+ - 2024.cawl-1
diff --git a/data/yaml/venues/cl4health.yaml b/data/yaml/venues/cl4health.yaml
new file mode 100644
index 0000000000..0b0a739b07
--- /dev/null
+++ b/data/yaml/venues/cl4health.yaml
@@ -0,0 +1,2 @@
+acronym: CL4Health
+name: Workshop on Patient-Oriented Language Processing
diff --git a/data/yaml/venues/delite.yaml b/data/yaml/venues/delite.yaml
new file mode 100644
index 0000000000..899614a775
--- /dev/null
+++ b/data/yaml/venues/delite.yaml
@@ -0,0 +1,2 @@
+acronym: DELITE
+name: Workshop on Language-driven Deliberation Technology
diff --git a/data/yaml/venues/determit.yaml b/data/yaml/venues/determit.yaml
new file mode 100644
index 0000000000..40a39bb49a
--- /dev/null
+++ b/data/yaml/venues/determit.yaml
@@ -0,0 +1,2 @@
+acronym: DeTermIt
+name: DeTermIt! Evaluating Text Difficulty in a Multilingual Context
diff --git a/data/yaml/venues/dlnld.yaml b/data/yaml/venues/dlnld.yaml
new file mode 100644
index 0000000000..c8f008f138
--- /dev/null
+++ b/data/yaml/venues/dlnld.yaml
@@ -0,0 +1,2 @@
+acronym: DLnLD
+name: 'Deep Learning and Linked Data'
diff --git a/data/yaml/venues/htres.yaml b/data/yaml/venues/htres.yaml
new file mode 100644
index 0000000000..9b439661e0
--- /dev/null
+++ b/data/yaml/venues/htres.yaml
@@ -0,0 +1,2 @@
+acronym: htres
+name: Workshop on Holocaust Testimonies as Language Resources
diff --git a/data/yaml/venues/neusymbridge.yaml b/data/yaml/venues/neusymbridge.yaml
new file mode 100644
index 0000000000..739ed913a4
--- /dev/null
+++ b/data/yaml/venues/neusymbridge.yaml
@@ -0,0 +1,2 @@
+acronym: NeusymBridge
+name: 'Workshop on Bridging Neurons and Symbols for Natural Language Processing and Knowledge Graphs Reasoning'
diff --git a/data/yaml/venues/rfp.yaml b/data/yaml/venues/rfp.yaml
new file mode 100644
index 0000000000..790c118ec1
--- /dev/null
+++ b/data/yaml/venues/rfp.yaml
@@ -0,0 +1,2 @@
+acronym: rfp
+name: Workshop on Reference, Framing, and Perspective
diff --git a/data/yaml/venues/safety4convai.yaml b/data/yaml/venues/safety4convai.yaml
new file mode 100644
index 0000000000..0ebc766bf5
--- /dev/null
+++ b/data/yaml/venues/safety4convai.yaml
@@ -0,0 +1,2 @@
+acronym: Safety4ConvAI
+name: Workshop on Safety for Conversational AI
diff --git a/hugo/content/posts/2024-05-23-eacl-videos.md b/hugo/content/posts/2024-05-23-eacl-videos.md
new file mode 100644
index 0000000000..8ef53da18b
--- /dev/null
+++ b/hugo/content/posts/2024-05-23-eacl-videos.md
@@ -0,0 +1,18 @@
+---
+Title: EACL 2024 videos
+date: "2024-05-23"
+Description: >
+ EACL 2024 conference videos are now available
+---
+
+Recorded videos for EACL 2024 main conference and workshop papers [are now
+available](https://aclanthology.org/events/eacl-2024/) in the ACL
+Anthology.
+
+We have checked the released videos against the permission forms that
+were signed for [Underline](https://underline.io) and believe
+everything to be in order. If your video was inadvertently released
+against your wishes, please let us know by [filing an issue on
GitHub](https://github.com/acl-org/acl-anthology/issues/new?assignees=anthology-assist&labels=correction%2Cmetadata&projects=&template=01-metadata-correction.yml&title=Paper+Metadata%3A+%7Breplace+with+Anthology+ID%7D)
+or by emailing the Anthology [director](mailto:anthology@aclweb.org)
+and [assistant](mailto:anthology-tech@aclweb.org).
diff --git a/hugo/content/posts/_index.md b/hugo/content/posts/_index.md
index 75699a3ab0..dc8a4c7d52 100644
--- a/hugo/content/posts/_index.md
+++ b/hugo/content/posts/_index.md
@@ -1,4 +1,4 @@
---
-Title: List of all news pages
+Title: List of all posts
render_pagelist: true
---
diff --git a/hugo/content/posts/lrec_2024.md b/hugo/content/posts/lrec_2024.md
new file mode 100644
index 0000000000..0437c60e9f
--- /dev/null
+++ b/hugo/content/posts/lrec_2024.md
@@ -0,0 +1,8 @@
+---
+Title: LREC 2024 proceedings
+date: "2024-05-22"
+Description: >
+ The proceedings of LREC 2024 and its colocated workshops are now available
+---
+
+The proceedings of [LREC-COLING 2024](https://lrec-coling-2024.org/) and its thirty-four colocated workshops [are now available on the ACL Anthology](https://aclanthology.org/events/lrec-2024/).
diff --git a/hugo/layouts/index.html b/hugo/layouts/index.html
index c8ab63fb06..8d3f87cd5b 100644
--- a/hugo/layouts/index.html
+++ b/hugo/layouts/index.html
@@ -77,7 +77,7 @@ ACL Events
Venue |
- 2024 – 2020 |
+ 2024 – 2020 |
2019 – 2010 |
2009 – 2000 |
1999 – 1990 |