diff --git a/bin/latex_to_unicode.py b/bin/latex_to_unicode.py
index 4049c3a13a..c35c668b4d 100755
--- a/bin/latex_to_unicode.py
+++ b/bin/latex_to_unicode.py
@@ -23,6 +23,13 @@
    # Entry(r'\textsc', None, 'sc', 'unary', False),
    # Entry(r'\sc', None, 'sc', 'setter', False),
    Entry(r"\url", None, "url", "unary", True),
+    Entry(r"\footnote", None, "fn", "unary", False),
+    Entry(r"\href", None, "delete", "unary", False),
+    # \cite and friends. There are a lot more, but these are the most common.
+    Entry(r"\cite", None, "cite", "unary", False),
+    Entry(r"\newcite", None, "cite", "unary", False),
+    Entry(r"\citet", None, "cite", "unary", False),
+    Entry(r"\citep", None, "cite", "unary", False),
    Entry(r"\fixedcase", None, "fixed-case", "unary", False),  # for our internal use
    Entry(r"", None, None, "trivial", False),
]
@@ -185,7 +192,7 @@ def latex_to_unicode(s):
    s = s.replace(r"\}", "}")

    def repl(s):
-        logging.warning("discarding control sequence {}".format(s.group(0)))
+        logging.warning(f"discarding control sequence '{s.group(0)}' from '{s.string}'")
        return ""

    s = re.sub(r"\\[A-Za-z]+ |\\.", repl, s)
@@ -223,6 +230,27 @@ def visit(node):
    return node


+def remove_notes(node):
+    def visit(node, outparent):
+        if isinstance(node, str):
+            outparent.append(node)
+        elif isinstance(node, list):
+            if openers[node[0].rstrip()].tag in ["fn", "delete"]:
+                return  # without copying
+            elif openers[node[0].rstrip()].tag == "cite":
+                outparent.append("(CITATION)")
+            else:
+                outnode = []
+                for child in node:
+                    visit(child, outnode)
+                outparent.append(outnode)
+
+    outroot = []
+    visit(node, outroot)
+    assert len(outroot) == 1
+    return outroot[0]
+
+
def append_text(xnode, text):
    if len(xnode) == 0:
        xnode.text = (xnode.text or "") + text
@@ -293,6 +321,7 @@ def latex_to_xml(s, fixed_case=False, trivial_math=False):
        tree = find_fixed_case(tree, conservative=True)
    if trivial_math:
        tree = flatten_trivial_math(tree)
+    tree = remove_notes(tree)
    tree = latextree_to_xml(tree)
    tree = xml_to_unicode(tree)
    return tree
diff --git a/bin/normalize_anth.py b/bin/normalize_anth.py
index f90ca251a7..c6a756ad9e 100755
--- a/bin/normalize_anth.py
+++ b/bin/normalize_anth.py
@@ -174,7 +174,11 @@ def normalize(oldnode, informat):
                )
            )
            oldtext = "".join(oldnode.itertext())
-            newnode = latex_to_xml(oldtext, trivial_math=True, fixed_case=True)
+            newnode = latex_to_xml(
+                oldtext,
+                trivial_math=True,
+                fixed_case=oldnode.tag in ["title", "booktitle"],
+            )
            newnode.tag = oldnode.tag
            newnode.attrib.update(oldnode.attrib)
            replace_node(oldnode, newnode)
diff --git a/data/xml/D19.xml b/data/xml/D19.xml
index ec70cd199b..a40ebc0c3b 100644
--- a/data/xml/D19.xml
+++ b/data/xml/D19.xml
@@ -543,7 +543,7 @@
KevinZhou
TengchaoLv
476–485
- Most of the current effective methods for text classification tasks are based on large-scale labeled data and a great number of parameters, but when the supervised training data are few and difficult to be collected, these models are not available. In this work, we propose a hierarchical attention prototypical networks (HAPN) for few-shot text classification. We design the feature level, word level, and instance level multi cross attention for our model to enhance the expressive ability of semantic space, so it can highlight or weaken the importance of the features, words, and instances separately. We verify the effectiveness of our model on two standard benchmark few-shot text classification datasets – FewRel and CSID, and achieve the state-of-the-art performance. The visualization of hierarchical attention layers illustrates that our model can capture more important features, words, and instances. In addition, our attention mechanism increases support set augmentability and accelerates convergence speed in the training stage.
+ Most of the current effective methods for text classification tasks are based on large-scale labeled data and a great number of parameters, but when the supervised training data are few and difficult to be collected, these models are not available. In this work, we propose a hierarchical attention prototypical networks (HAPN) for few-shot text classification. We design the feature level, word level, and instance level multi cross attention for our model to enhance the expressive ability of semantic space, so it can highlight or weaken the importance of the features, words, and instances separately. We verify the effectiveness of our model on two standard benchmark few-shot text classification datasets—FewRel and CSID, and achieve the state-of-the-art performance. The visualization of hierarchical attention layers illustrates that our model can capture more important features, words, and instances. In addition, our attention mechanism increases support set augmentability and accelerates convergence speed in the training stage.
D19-1045
10.18653/v1/D19-1045
@@ -874,7 +874,7 @@
Towards Linear Time Neural Machine Translation with Capsule Networks
MingxuanWang
803–812
- In this study, we first investigate a novel capsule network with dynamic routing for linear time Neural Machine Translation (NMT), referred as CapsNMT. CapsNMT uses an aggregation mechanism to map the source sentence into a matrix with pre-determined size, and then applys a deep LSTM network to decode the target sequence from the source representation. Unlike the previous work sutskever2014sequence to store the source sentence with a passive and bottom-up way, the dynamic routing policy encodes the source sentence with an iterative process to decide the credit attribution between nodes from lower and higher layers. CapsNMT has two core properties: it runs in time that is linear in the length of the sequences and provides a more flexible way to aggregate the part-whole information of the source sentence. On WMT14 English-German task and a larger WMT14 English-French task, CapsNMT achieves comparable results with the Transformer system. To the best of our knowledge, this is the first work that capsule networks have been empirically investigated for sequence to sequence problems.
+ In this study, we first investigate a novel capsule network with dynamic routing for linear time Neural Machine Translation (NMT), referred as CapsNMT. CapsNMT uses an aggregation mechanism to map the source sentence into a matrix with pre-determined size, and then applys a deep LSTM network to decode the target sequence from the source representation. Unlike the previous work (CITATION) to store the source sentence with a passive and bottom-up way, the dynamic routing policy encodes the source sentence with an iterative process to decide the credit attribution between nodes from lower and higher layers. CapsNMT has two core properties: it runs in time that is linear in the length of the sequences and provides a more flexible way to aggregate the part-whole information of the source sentence. On WMT14 English-German task and a larger WMT14 English-French task, CapsNMT achieves comparable results with the Transformer system. To the best of our knowledge, this is the first work that capsule networks have been empirically investigated for sequence to sequence problems.
D19-1074
10.18653/v1/D19-1074
@@ -1004,7 +1004,7 @@
Xin-YuDai
JiajunChen
931–941
- Previous studies have shown that neural machine translation (NMT) models can benefit from explicitly modeling translated () and untranslated () source contents as recurrent states zheng2018modeling. However, this less interpretable recurrent process hinders its power to model the dynamic updating of and contents during decoding. In this paper, we propose to model the dynamic principles by explicitly separating source words into groups of translated and untranslated contents through parts-to-wholes assignment. The assignment is learned through a novel variant of routing-by-agreement mechanism sabour2017dynamic, namely Guided Dynamic Routing, where the translating status at each decoding step guides the routing process to assign each source word to its associated group (i.e., translated or untranslated content) represented by a capsule, enabling translation to be made from holistic context. Experiments show that our approach achieves substantial improvements over both Rnmt and Transformer by producing more adequate translations. Extensive analysis demonstrates that our method is highly interpretable, which is able to recognize the translated and untranslated contents as expected.
+ Previous studies have shown that neural machine translation (NMT) models can benefit from explicitly modeling translated () and untranslated () source contents as recurrent states (CITATION). However, this less interpretable recurrent process hinders its power to model the dynamic updating of and contents during decoding. In this paper, we propose to model the dynamic principles by explicitly separating source words into groups of translated and untranslated contents through parts-to-wholes assignment. The assignment is learned through a novel variant of routing-by-agreement mechanism (CITATION), namely Guided Dynamic Routing, where the translating status at each decoding step guides the routing process to assign each source word to its associated group (i.e., translated or untranslated content) represented by a capsule, enabling translation to be made from holistic context. Experiments show that our approach achieves substantial improvements over both Rnmt and Transformer by producing more adequate translations. Extensive analysis demonstrates that our method is highly interpretable, which is able to recognize the translated and untranslated contents as expected.
D19-1086
D19-1086.Attachment.pdf
10.18653/v1/D19-1086
@@ -2849,7 +2849,7 @@
ChandraBhagavatula
YejinChoi
2391–2401
- Understanding narratives requires reading between the lines, which in turn, requires interpreting the likely causes and effects of events, even when they are not mentioned explicitly. In this paper, we introduce Cosmos QA, a large-scale dataset of 35,600 problems that require commonsense-based reading comprehension, formulated as multiple-choice questions. In stark contrast to most existing reading comprehension datasets where the questions focus on factual and literal understanding of the context paragraph, our dataset focuses on reading between the lines over a diverse collection of people’s everyday narratives, asking such questions as “what might be the possible reason of ...?”, or “what would have happened if ...” that require reasoning beyond the exact text spans in the context. To establish baseline performances on Cosmos QA, we experiment with several state-of-the-art neural architectures for reading comprehension, and also propose a new architecture that improves over the competitive baselines. Experimental results demonstrate a significant gap between machine (68.4%) and human performance (94%), pointing to avenues for future research on commonsense machine comprehension. Dataset, code and leaderboard is publicly available at https://wilburone.github.io/cosmos.
+ Understanding narratives requires reading between the lines, which in turn, requires interpreting the likely causes and effects of events, even when they are not mentioned explicitly. In this paper, we introduce Cosmos QA, a large-scale dataset of 35,600 problems that require commonsense-based reading comprehension, formulated as multiple-choice questions. In stark contrast to most existing reading comprehension datasets where the questions focus on factual and literal understanding of the context paragraph, our dataset focuses on reading between the lines over a diverse collection of people’s everyday narratives, asking such questions as “what might be the possible reason of ...?”, or “what would have happened if ...” that require reasoning beyond the exact text spans in the context. To establish baseline performances on Cosmos QA, we experiment with several state-of-the-art neural architectures for reading comprehension, and also propose a new architecture that improves over the competitive baselines. Experimental results demonstrate a significant gap between machine (68.4%) and human performance (94%), pointing to avenues for future research on commonsense machine comprehension. Dataset, code and leaderboard is publicly available at https://wilburone.github.io/cosmos.
D19-1243
D19-1243.Attachment.pdf
10.18653/v1/D19-1243
@@ -3449,7 +3449,7 @@
ShafiqJoty
IrinaTemnikova
PreslavNakov
- 2964–2973
+ 2964–2975
The ongoing neural revolution in machine translation has made it easier to model larger contexts beyond the sentence-level, which can potentially help resolve some discourse-level ambiguities such as pronominal anaphora, thus enabling better translations. Unfortunately, even when the resulting improvements are seen as substantial by humans, they remain virtually unnoticed by traditional automatic evaluation measures like BLEU, as only a few words end up being affected. Thus, specialized evaluation measures are needed. With this aim in mind, we contribute an extensive, targeted dataset that can be used as a test suite for pronoun translation, covering multiple source languages and different pronoun errors drawn from real system translations, for English. We further propose an evaluation measure to differentiate good and bad pronoun translations. We also conduct a user study to report correlations with human judgments.
D19-1294
D19-1294.Attachment.zip
@@ -3459,7 +3459,7 @@
A Regularization Approach for Incorporating Event Knowledge and Coreference Relations into Neural Discourse Parsing
ZeyuDai
RuihongHuang
- 2974–2985
+ 2976–2987
We argue that external commonsense knowledge and linguistic constraints need to be incorporated into neural network models for mitigating data sparsity issues and further improving the performance of discourse parsing. Realizing that external knowledge and linguistic constraints may not always apply in understanding a particular context, we propose a regularization approach that tightly integrates these constraints with contexts for deriving word representations. Meanwhile, it balances attentions over contexts and constraints through adding a regularization term into the objective function. Experiments show that our knowledge regularization approach outperforms all previous systems on the benchmark dataset PDTB for discourse parsing.
D19-1295
10.18653/v1/D19-1295
@@ -3467,7 +3467,7 @@
Weakly Supervised Multilingual Causality Extraction from <fixed-case>W</fixed-case>ikipedia
ChikaraHashimoto
- 2986–2997
+ 2988–2999
We present a method for extracting causality knowledge from Wikipedia, such as Protectionism -> Trade war, where the cause and effect entities correspond to Wikipedia articles. Such causality knowledge is easy to verify by reading corresponding Wikipedia articles, to translate to multiple languages through Wikidata, and to connect to knowledge bases derived from Wikipedia. Our method exploits Wikipedia article sections that describe causality and the redundancy stemming from the multilinguality of Wikipedia. Experiments showed that our method achieved precision and recall above 98% and 64%, respectively. In particular, it could extract causalities whose cause and effect were written distantly in a Wikipedia article. We have released the code and data for further research.
D19-1296
10.18653/v1/D19-1296
@@ -3478,7 +3478,7 @@
XuepengWang
DaweiYin
ChengqingZong
- 2998–3008
+ 3000–3010
Review summarization aims to generate a condensed summary for a review or multiple reviews. Existing review summarization systems mainly generate summary only based on review content and neglect the authors’ attributes (e.g., gender, age, and occupation). In fact, when summarizing a review, users with different attributes usually pay attention to specific aspects and have their own word-using habits or writing styles. Therefore, we propose an Attribute-aware Sequence Network (ASN) to take the aforementioned users’ characteristics into account, which includes three modules: an attribute encoder encodes the attribute preferences over the words; an attribute-aware review encoder adopts an attribute-based selective mechanism to select the important information of a review; and an attribute-aware summary decoder incorporates attribute embedding and attribute-specific word-using habits into word prediction. To validate our model, we collect a new dataset TripAtt, comprising 495,440 attribute-review-summary triplets with three kinds of attribute information: gender, age, and travel status. Extensive experiments show that ASN achieves state-of-the-art performance on review summarization in both auto-metric ROUGE and human evaluation.
D19-1297
10.18653/v1/D19-1297
@@ -3487,7 +3487,7 @@
Extractive Summarization of Long Documents by Combining Global and Local Context
WenXiao
GiuseppeCarenini
- 3009–3019
+ 3011–3021
In this paper, we propose a novel neural single-document extractive summarization model for long documents, incorporating both the global context of the whole document and the local context within the current topic. We evaluate the model on two datasets of scientific papers , Pubmed and arXiv, where it outperforms previous work, both extractive and abstractive models, on ROUGE-1, ROUGE-2 and METEOR scores. We also show that, consistently with our goal, the benefits of our method become stronger as we apply it to longer documents. Rather surprisingly, an ablation study indicates that the benefits of our model seem to come exclusively from modeling the local context, even for the longest documents.
D19-1298
D19-1298.Attachment.zip
@@ -3501,7 +3501,7 @@
FengJiang
BingQin
Chin-YewLin
- 3020–3030
+ 3022–3032
Recent neural models for data-to-text generation rely on massive parallel pairs of data and text to learn the writing knowledge. They often assume that writing knowledge can be acquired from the training data alone. However, when people are writing, they not only rely on the data but also consider related knowledge. In this paper, we enhance neural data-to-text models with external knowledge in a simple but effective way to improve the fidelity of generated text. Besides relying on parallel data and text as in previous work, our model attends to relevant external knowledge, encoded as a temporary memory, and combines this knowledge with the context representation of data before generating words. This allows the model to infer relevant facts which are not explicitly stated in the data table from an external knowledge source. Experimental results on twenty-one Wikipedia infobox-to-text datasets show our model, KBAtt, consistently improves a state-of-the-art model on most of the datasets. In addition, to quantify when and why external knowledge is effective, we design a metric, KBGain, which shows a strong correlation with the observed performance boost. This result demonstrates the relevance of external knowledge and sparseness of original data are the main factors affecting system performance.
D19-1299
10.18653/v1/D19-1299
@@ -3514,7 +3514,7 @@
FeiyangPan
MinYang
QingHe
- 3031–3041
+ 3033–3043
In this work, we re-examine the problem of extractive text summarization for long documents. We observe that the process of extracting summarization of human can be divided into two stages: 1) a rough reading stage to look for sketched information, and 2) a subsequent careful reading stage to select key sentences to form the summary. By simulating such a two-stage process, we propose a novel approach for extractive summarization. We formulate the problem as a contextual-bandit problem and solve it with policy gradient. We adopt a convolutional neural network to encode gist of paragraphs for rough reading, and a decision making policy with an adapted termination mechanism for careful reading. Experiments on the CNN and DailyMail datasets show that our proposed method can provide high-quality summaries with varied length, and significantly outperform the state-of-the-art extractive methods in terms of ROUGE metrics.
D19-1300
D19-1300.Attachment.zip
@@ -3528,7 +3528,7 @@
MinZhang
WeihuaLuo
YueZhang
- 3042–3051
+ 3044–3053
We propose a contrastive attention mechanism to extend the sequence-to-sequence framework for abstractive sentence summarization task, which aims to generate a brief summary of a given source sentence. The proposed contrastive attention mechanism accommodates two categories of attention: one is the conventional attention that attends to relevant parts of the source sentence, the other is the opponent attention that attends to irrelevant or less relevant parts of the source sentence. Both attentions are trained in an opposite way so that the contribution from the conventional attention is encouraged and the contribution from the opponent attention is discouraged through a novel softmax and softmin functionality. Experiments on benchmark datasets show that, the proposed contrastive attention mechanism is more focused on the relevant parts for the summary than the conventional attention mechanism, and greatly advances the state-of-the-art performance on the abstractive sentence summarization task. We release the code at https://github.com/travel-go/ Abstractive-Text-Summarization.
D19-1301
10.18653/v1/D19-1301
@@ -3542,7 +3542,7 @@
JiajunZhang
ShaonanWang
ChengqingZong
- 3052–3062
+ 3054–3064
Cross-lingual summarization (CLS) is the task to produce a summary in one particular language for a source document in a different language. Existing methods simply divide this task into two steps: summarization and translation, leading to the problem of error propagation. To handle that, we present an end-to-end CLS framework, which we refer to as Neural Cross-Lingual Summarization (NCLS), for the first time. Moreover, we propose to further improve NCLS by incorporating two related tasks, monolingual summarization and machine translation, into the training process of CLS under multi-task learning. Due to the lack of supervised CLS data, we propose a round-trip translation strategy to acquire two high-quality large-scale CLS datasets based on existing monolingual summarization datasets. Experimental results have shown that our NCLS achieves remarkable improvement over traditional pipeline methods on both English-to-Chinese and Chinese-to-English CLS human-corrected test sets. In addition, NCLS with multi-task learning can further significantly improve the quality of generated summaries. We make our dataset and code publicly available here: http://www.nlpr.ia.ac.cn/cip/dataset.htm.
D19-1302
10.18653/v1/D19-1302
@@ -3553,7 +3553,7 @@
Chien-ShengWu
AndreaMadotto
PascaleFung
- 3063–3073
+ 3065–3075
Sensational headlines are headlines that capture people’s attention and generate reader interest. Conventional abstractive headline generation methods, unlike human writers, do not optimize for maximal reader attention. In this paper, we propose a model that generates sensational headlines without labeled data. We first train a sensationalism scorer by classifying online headlines with many comments (“clickbait”) against a baseline of headlines generated from a summarization model. The score from the sensationalism scorer is used as the reward for a reinforcement learner. However, maximizing the noisy sensationalism reward will generate unnatural phrases instead of sensational headlines. To effectively leverage this noisy reward, we propose a novel loss function, Auto-tuned Reinforcement Learning (ARL), to dynamically balance reinforcement learning (RL) with maximum likelihood estimation (MLE). Human evaluation shows that 60.8% of samples generated by our model are sensational, which is significantly better than the Pointer-Gen baseline and other RL models.
D19-1303
10.18653/v1/D19-1303
@@ -3564,7 +3564,7 @@
YangGao
HeyanHuang
YuxiangZhou
- 3074–3083
+ 3076–3085
A quality abstractive summary should not only copy salient source texts as summaries but should also tend to generate new conceptual words to express concrete details. Inspired by the popular pointer generator sequence-to-sequence model, this paper presents a concept pointer network for improving these aspects of abstractive summarization. The network leverages knowledge-based, context-aware conceptualizations to derive an extended set of candidate concepts. The model then points to the most appropriate choice using both the concept set and original source text. This joint approach generates abstractive summaries with higher-level semantic concepts. The training model is also optimized in a way that adapts to different data, which is based on a novel method of distant-supervised learning guided by reference summaries and testing set. Overall, the proposed approach provides statistically significant improvements over several state-of-the-art models on both the DUC-2004 and Gigaword datasets. A human evaluation of the model’s abstractive abilities also supports the quality of the summaries produced within this framework.
D19-1304
D19-1304.Attachment.zip
@@ -3574,7 +3574,7 @@
Surface Realisation Using Full Delexicalisation
AnastasiaShimorina
ClaireGardent
- 3084–3094
+ 3086–3096
Surface realisation (SR) maps a meaning representation to a sentence and can be viewed as consisting of three subtasks: word ordering, morphological inflection and contraction generation (e.g., clitic attachment in Portuguese or elision in French). We propose a modular approach to surface realisation which models each of these components separately, and evaluate our approach on the 10 languages covered by the SR’18 Surface Realisation Shared Task shallow track. We provide a detailed evaluation of how word order, morphological realisation and contractions are handled by the model and an analysis of the differences in word ordering performance across languages.
D19-1305
D19-1305.Attachment.zip
@@ -3587,7 +3587,7 @@
JonasMueller
NicholasMatthews
EnricoSantus
- 3095–3107
+ 3097–3109
Text attribute transfer aims to automatically rewrite sentences such that they possess certain linguistic attributes, while simultaneously preserving their semantic content. This task remains challenging due to a lack of supervised parallel data. Existing approaches try to explicitly disentangle content and attribute information, but this is difficult and often results in poor content-preservation and ungrammaticality. In contrast, we propose a simpler approach, Iterative Matching and Translation (IMaT), which: (1) constructs a pseudo-parallel corpus by aligning a subset of semantically similar sentences from the source and the target corpora; (2) applies a standard sequence-to-sequence model to learn the attribute transfer; (3) iteratively improves the learned transfer function by refining imperfections in the alignment. In sentiment modification and formality transfer tasks, our method outperforms complex state-of-the-art systems by a large margin. As an auxiliary contribution, we produce a publicly-available test set with human-generated transfer references.
D19-1306
10.18653/v1/D19-1306
@@ -3600,7 +3600,7 @@
OriShapira
IdoDagan
IrynaGurevych
- 3108–3118
+ 3110–3120
Reinforcement Learning (RL)based document summarisation systems yield state-of-the-art performance in terms of ROUGE scores, because they directly use ROUGE as the rewards during training. However, summaries with high ROUGE scores often receive low human judgement. To find a better reward function that can guide RL to generate human-appealing summaries, we learn a reward function from human ratings on 2,500 summaries. Our reward function only takes the document and system summary as input. Hence, once trained, it can be used to train RL based summarisation systems without using any reference summaries. We show that our learned rewards have significantly higher correlation with human ratings than previous approaches. Human evaluation experiments show that, compared to the state-of-the-art supervised-learning systems and ROUGE-as-rewards RL summarisation systems, the RL systems using our learned rewards during training generate summaries with higher human ratings. The learned reward function and our source code are available at https://github.com/yg211/summary-reward-no-reference.
D19-1307
D19-1307.Attachment.pdf
@@ -3611,7 +3611,7 @@
JaeminCho
MinjoonSeo
HannanehHajishirzi
- 3119–3129
+ 3121–3131
Generating diverse sequences is important in many NLP applications such as question generation or summarization that exhibit semantically one-to-many relationships between source and the target sequences. We present a method to explicitly separate diversification from generation using a general plug-and-play module (called SELECTOR) that wraps around and guides an existing encoder-decoder model. The diversification stage uses a mixture of experts to sample different binary masks on the source sequence for diverse content selection. The generation stage uses a standard encoder-decoder model given each selected content from the source sequence. Due to the non-differentiable nature of discrete sampling and the lack of ground truth labels for binary mask, we leverage a proxy for ground truth mask and adopt stochastic hard-EM for training. In question generation (SQuAD) and abstractive summarization (CNN-DM), our method demonstrates significant improvements in accuracy, diversity and training efficiency, including state-of-the-art top-1 accuracy in both datasets, 6% gain in top-5 accuracy, and 3.7 times faster training over a state-of-the-art model. Our code is publicly available at https://github.com/clovaai/FocusSeq2Seq.
D19-1308
10.18653/v1/D19-1308
@@ -3625,7 +3625,7 @@
WenlinWang
GuoyinWang
LawrenceCarin
- 3130–3140
+ 3132–3142
Generating high-quality paraphrases is a fundamental yet challenging natural language processing task. Despite the effectiveness of previous work based on generative models, there remain problems with exposure bias in recurrent neural networks, and often a failure to generate realistic sentences. To overcome these challenges, we propose the first end-to-end conditional generative architecture for generating paraphrases via adversarial training, which does not depend on extra linguistic information. Extensive experiments on four public datasets demonstrate the proposed method achieves state-of-the-art results, outperforming previous generative architectures on both automatic metrics (BLEU, METEOR, and TER) and human evaluations.
D19-1309
10.18653/v1/D19-1309
@@ -3636,7 +3636,7 @@
XiaochengFeng
BingQin
TingLiu
- 3141–3150
+ 3143–3152
Although Seq2Seq models for table-to-text generation have achieved remarkable progress, modeling table representation in one dimension is inadequate. This is because (1) the table consists of multiple rows and columns, which means that encoding a table should not depend only on one dimensional sequence or set of records and (2) most of the tables are time series data (e.g. NBA game data, stock market data), which means that the description of the current table may be affected by its historical data. To address aforementioned problems, not only do we model each table cell considering other records in the same row, we also enrich table’s representation by modeling each table cell in context of other cells in the same column or with historical (time dimension) data respectively. In addition, we develop a table cell fusion gate to combine representations from row, column and time dimension into one dense vector according to the saliency of each dimension’s representation. We evaluated our methods on ROTOWIRE, a benchmark dataset of NBA basketball games. Both automatic and human evaluation results demonstrate the effectiveness of our model with improvement of 2.66 in BLEU over the strong baseline and outperformance of state-of-the-art model.
D19-1310
10.18653/v1/D19-1310
@@ -3647,7 +3647,7 @@
AixinSun
JingLi
KarthikMuthuswamy
- 3151–3160
+ 3153–3162
In multi-document summarization, a set of documents to be summarized is assumed to be on the same topic, known as the underlying topic in this paper. That is, the underlying topic can be collectively represented by all the documents in the set. Meanwhile, different documents may cover various different subtopics and the same subtopic can be across several documents. Inspired by topic model, the underlying topic of a document set can also be viewed as a collection of different subtopics of different importance. In this paper, we propose a summarization model called STDS. The model generates the underlying topic representation from both document view and subtopic view in parallel. The learning objective is to minimize the distance between the representations learned from the two views. The contextual information is encoded through a hierarchical RNN architecture. Sentence salience is estimated in a hierarchical way with subtopic salience and relative sentence salience, by considering the contextual information. Top ranked sentences are then extracted as a summary. Note that the notion of subtopic enables us to bring in additional information (e.g. comments to news articles) that is helpful for document summarization. Experimental results show that the proposed solution outperforms state-of-the-art methods on benchmark datasets.
D19-1311
10.18653/v1/D19-1311
@@ -3656,7 +3656,7 @@
Referring Expression Generation Using Entity Profiles
MengCao
Jackie Chi KitCheung
- 3161–3170
+ 3163–3172
Referring Expression Generation (REG) is the task of generating contextually appropriate references to entities. A limitation of existing REG systems is that they rely on entity-specific supervised training, which means that they cannot handle entities not seen during training. In this study, we address this in two ways. First, we propose task setups in which we specifically test a REG system’s ability to generalize to entities not seen during training. Second, we propose a profile-based deep neural network model, ProfileREG, which encodes both the local context and an external profile of the entity to generate reference realizations. Our model generates tokens by learning to choose between generating pronouns, generating from a fixed vocabulary, or copying a word from the profile. We evaluate our model on three different splits of the WebNLG dataset, and show that it outperforms competitive baselines in all settings according to automatic and human evaluations.
D19-1312
10.18653/v1/D19-1312
@@ -3668,7 +3668,7 @@
WeinanZhang
XinJiang
YongYu
- 3171–3180
+ 3173–3182
Paraphrasing plays an important role in various natural language processing (NLP) tasks, such as question answering, information retrieval and sentence simplification. Recently, neural generative models have shown promising results in paraphrase generation. However, prior work mainly focused on single paraphrase generation, while ignoring the fact that diversity is essential for enhancing generalization capability and robustness of downstream applications. Few works have been done to solve diverse paraphrase generation. In this paper, we propose a novel approach with two discriminators and multiple generators to generate a variety of different paraphrases. A reinforcement learning algorithm is applied to train our model. Our experiments on two real-world datasets demonstrate that our model not only gains a significant increase in diversity but also improves generation quality over several state-of-the-art baselines.
D19-1313
10.18653/v1/D19-1313
@@ -3678,7 +3678,7 @@
Leonardo F. R.Ribeiro
ClaireGardent
IrynaGurevych
- 3181–3192
+ 3183–3194
Generating text from graph-based data, such as Abstract Meaning Representation (AMR), is a challenging task due to the inherent difficulty in how to properly encode the structure of a graph with labeled edges. To address this difficulty, we propose a novel graph-to-sequence model that encodes different but complementary perspectives of the structural information contained in the AMR graph. The model learns parallel top-down and bottom-up representations of nodes capturing contrasting views of the graph. We also investigate the use of different node message passing strategies, employing different state-of-the-art graph encoders to compute node representations based on incoming and outgoing perspectives. In our experiments, we demonstrate that the dual graph representation leads to improvements in AMR-to-text generation, achieving state-of-the-art results on two AMR datasets
D19-1314
D19-1314.Attachment.zip
@@ -3692,7 +3692,7 @@
TomokiTaniguchi
YasuhideMiura
TomokoOhkuma
- 3193–3203
+ 3195–3205
The automated generation of information indicating the characteristics of articles such as headlines, key phrases, summaries and categories helps writers to alleviate their workload. Previous research has tackled these tasks using neural abstractive summarization and classification methods. However, the outputs may be inconsistent if they are generated individually. The purpose of our study is to generate multiple outputs consistently. We introduce a multi-task learning model with a shared encoder and multiple decoders for each task. We propose a novel loss function called hierarchical consistency loss to maintain consistency among the attention weights of the decoders. To evaluate the consistency, we employ a human evaluation. The results show that our model generates more consistent headlines, key phrases and categories. In addition, our model outperforms the baseline model on the ROUGE scores, and generates more adequate and fluent headlines.
D19-1315
D19-1315.Attachment.pdf
@@ -3701,7 +3701,7 @@
Toward a Task of Feedback Comment Generation for Writing Learning
RyoNagata
- 3204–3213
+ 3206–3215
In this paper, we introduce a novel task called feedback comment generation — a task of automatically generating feedback comments such as a hint or an explanatory note for writing learning for non-native learners of English. There has been almost no work on this task nor corpus annotated with feedback comments. We have taken the first step by creating learner corpora consisting of approximately 1,900 essays where all preposition errors are manually annotated with feedback comments. We have tested three baseline methods on the dataset, showing that a simple neural retrieval-based method sets a baseline performance with an F-measure of 0.34 to 0.41. Finally, we have looked into the results to explore what modifications we need to make to achieve better performance. We also have explored problems unaddressed in this work
D19-1316
10.18653/v1/D19-1316
@@ -3713,7 +3713,7 @@
LidongBing
IrwinKing
Michael R.Lyu
- 3214–3224
+ 3216–3226
Question generation (QG) is the task of generating a question from a reference sentence and a specified answer within the sentence. A major challenge in QG is to identify answer-relevant context words to finish the declarative-to-interrogative sentence transformation. Existing sequence-to-sequence neural models achieve this goal by proximity-based answer position encoding under the intuition that neighboring words of answers are of high possibility to be answer-relevant. However, such intuition may not apply to all cases especially for sentences with complex answer-relevant relations. Consequently, the performance of these models drops sharply when the relative distance between the answer fragment and other non-stop sentence words that also appear in the ground truth question increases. To address this issue, we propose a method to jointly model the unstructured sentence and the structured answer-relevant relation (extracted from the sentence in advance) for question generation. Specifically, the structured answer-relevant relation acts as the to the point context and it thus naturally helps keep the generated question to the point, while the unstructured sentence provides the full information. Extensive experiments show that to the point context helps our question generation model achieve significant improvements on several automatic evaluation metrics. Furthermore, our model is capable of generating diverse questions for a sentence which conveys multiple relations of its answer fragment.
D19-1317
10.18653/v1/D19-1317
@@ -3723,7 +3723,7 @@
JuliaIve
PranavaMadhyastha
LuciaSpecia
- 3225–3234
+ 3227–3236
Most text-to-text generation tasks, for example text summarisation and text simplification, require copying words from the input to the output. We introduce Copycat, a transformer-based pointer network for such tasks which obtains competitive results in abstractive text summarisation and generates more abstractive summaries. We propose a further extension of this architecture for automatic post-editing, where generation is conditioned over two inputs (source language and machine translation), and the model is capable of deciding where to copy information from. This approach achieves competitive performance when compared to state-of-the-art automated post-editing systems. More importantly, we show that it addresses a well-known limitation of automatic post-editing - overcorrecting translations - and that our novel mechanism for copying source language words improves the results.
D19-1318
10.18653/v1/D19-1318
@@ -3732,7 +3732,7 @@
Towards Controllable and Personalized Review Generation
PanLi
AlexanderTuzhilin
- 3235–3243
+ 3237–3245
In this paper, we propose a novel model RevGAN that automatically generates controllable and personalized user reviews based on the arbitrarily given sentimental and stylistic information. RevGAN utilizes the combination of three novel components, including self-attentive recursive autoencoders, conditional discriminators, and personalized decoders. We test its performance on the several real-world datasets, where our model significantly outperforms state-of-the-art generation models in terms of sentence quality, coherence, personalization, and human evaluations. We also empirically show that the generated reviews could not be easily distinguished from the organically produced reviews and that they follow the same statistical linguistics laws.
D19-1319
10.18653/v1/D19-1319
@@ -3743,7 +3743,7 @@
SylvainLamprier
BenjaminPiwowarski
JacopoStaiano
- 3244–3254
+ 3246–3256
Abstractive summarization approaches based on Reinforcement Learning (RL) have recently been proposed to overcome classical likelihood maximization. RL enables to consider complex, possibly non differentiable, metrics that globally assess the quality and relevance of the generated outputs. ROUGE, the most used summarization metric, is known to suffer from bias towards lexical similarity as well as from sub-optimal accounting for fluency and readability of the generated abstracts. We thus explore and propose alternative evaluation measures: the reported human-evaluation analysis shows that the proposed metrics, based on Question Answering, favorably compare to ROUGE – with the additional property of not requiring reference summaries. Training a RL-based model on these metrics leads to improvements (both in terms of human or automated metrics) over current approaches that use ROUGE as reward.
D19-1320
10.18653/v1/D19-1320
@@ -3755,7 +3755,7 @@
JiangtaoWen
WenfeiXu
XiaoyanZhu
- 3255–3266
+ 3257–3268
Existing neural methods for data-to-text generation are still struggling to produce long and diverse texts: they are insufficient to model input data dynamically during generation, to capture inter-sentence coherence, or to generate diversified expressions. To address these issues, we propose a Planning-based Hierarchical Variational Model (PHVM). Our model first plans a sequence of groups (each group is a subset of input items to be covered by a sentence) and then realizes each sentence conditioned on the planning result and the previously generated context, thereby decomposing long text generation into dependent sentence generation sub-tasks. To capture expression diversity, we devise a hierarchical latent structure where a global planning latent variable models the diversity of reasonable planning and a sequence of local latent variables controls sentence realization. Experiments show that our model outperforms state-of-the-art baselines in long and diverse text generation.
D19-1321
D19-1321.Attachment.zip
@@ -3766,7 +3766,7 @@
AkhileshSudhakar
BhargavUpadhyay
ArjunMaheswaran
- 3267–3277
+ 3269–3279
Text style transfer is the task of transferring the style of text having certain stylistic attributes, while preserving non-stylistic or content information. In this work we introduce the Generative Style Transformer (GST) - a new approach to rewriting sentences to a target style in the absence of parallel style corpora. GST leverages the power of both, large unsupervised pre-trained language models as well as the Transformer. GST is a part of a larger ‘Delete Retrieve Generate’ framework, in which we also propose a novel method of deleting style attributes from the source sentence by exploiting the inner workings of the Transformer. Our models outperform state-of-art systems across 5 datasets on sentiment, gender and political slant transfer. We also propose the use of the GLEU metric as an automatic metric of evaluation of style transfer, which we found to compare better with human ratings than the predominantly used BLEU score.
D19-1322
10.18653/v1/D19-1322
@@ -3777,7 +3777,7 @@
LuyangHuang
ZheHu
LuWang
- 3278–3289
+ 3280–3291
Abstractive summarization systems aim to produce more coherent and concise summaries than their extractive counterparts. Popular neural models have achieved impressive results for single-document summarization, yet their outputs are often incoherent and unfaithful to the input. In this paper, we introduce SENECA, a novel System for ENtity-drivEn Coherent Abstractive summarization framework that leverages entity information to generate informative and coherent abstracts. Our framework takes a two-step approach: (1) an entity-aware content selection module first identifies salient sentences from the input, then (2) an abstract generation module conducts cross-sentence information compression and abstraction to generate the final summary, which is trained with rewards to promote coherence, conciseness, and clarity. The two components are further connected using reinforcement learning. Automatic evaluation shows that our model significantly outperforms previous state-of-the-art based on ROUGE and our proposed coherence measures on New York Times and CNN/Daily Mail datasets. Human judges further rate our system summaries as more informative and coherent than those by popular summarization models.
D19-1323
D19-1323.Attachment.zip
@@ -3787,7 +3787,7 @@
Neural Extractive Text Summarization with Syntactic Compression
JiachengXu
GregDurrett
- 3290–3301
+ 3292–3303
Recent neural network approaches to summarization are largely either selection-based extraction or generation-based abstraction. In this work, we present a neural model for single-document summarization based on joint extraction and syntactic compression. Our model chooses sentences from the document, identifies possible compressions based on constituency parses, and scores those compressions with a neural model to produce the final summary. For learning, we construct oracle extractive-compressive summaries, then learn both of our components jointly with this supervision. Experimental results on the CNN/Daily Mail and New York Times datasets show that our model achieves strong performance (comparable to state-of-the-art systems) as evaluated by ROUGE. Moreover, our approach outperforms an off-the-shelf compression module, and human and manual evaluation shows that our model’s output generally remains grammatical.
D19-1324
D19-1324.Attachment.pdf
@@ -3802,7 +3802,7 @@
ChrisBrockett
BillDolan
Ming-TingSun
- 3302–3311
+ 3304–3313
Text style transfer without parallel data has achieved some practical success. However, in the scenario where less data is available, these methods may yield poor performance. In this paper, we examine domain adaptation for text style transfer to leverage massively available data from other domains. These data may demonstrate domain shift, which impedes the benefits of utilizing such data for training. To address this challenge, we propose simple yet effective domain adaptive text style transfer models, enabling domain-adaptive information exchange. The proposed models presumably learn from the source domain to: (i) distinguish stylized information and generic content information; (ii) maximally preserve content information; and (iii) adaptively transfer the styles in a domain-aware manner. We evaluate the proposed models on two style transfer tasks (sentiment and formality) over multiple target domains where only limited non-parallel data is available. Extensive experiments demonstrate the effectiveness of the proposed model compared to the baselines.
D19-1325
D19-1325.Attachment.pdf
@@ -3815,8 +3815,8 @@
Mitesh M.Khapra
Balaji VasanSrinivasan
BalaramanRavindran
- 3312–3321
- In this work, we focus on the task of Automatic Question Generation (AQG) where given a passage and an answer the task is to generate the corresponding question. It is desired that the generated question should be (i) grammatically correct (ii) answerable from the passage and (iii) specific to the given answer. An analysis of existing AQG models shows that they produce questions which do not adhere to one or more of the above-mentioned qualities. In particular, the generated questions look like an incomplete draft of the desired question with a clear scope for refinement. To alleviate this shortcoming, we propose a method which tries to mimic the human process of generating questions by first creating an initial draft and then refining it. More specifically, we propose Refine Network (RefNet) which contains two decoders. The second decoder uses a dual attention network which pays attention to both (i) the original passage and (ii) the question (initial draft) generated by the first decoder. In effect, it refines the question generated by the first decoder, thereby making it more correct and complete. We evaluate RefNet on three datasets, viz., SQuAD, HOTPOT-QA, and DROP, and show that it outperforms existing state-of-the-art methods by 7-16% on all of these datasets. Lastly, we show that we can improve the quality of the second decoder on specific metrics, such as, fluency and answerability by explicitly rewarding revisions that improve on the corresponding metric during training. The code has been made publicly available https://github.com/PrekshaNema25/RefNet-QG.
+ 3314–3323
+ In this work, we focus on the task of Automatic Question Generation (AQG) where given a passage and an answer the task is to generate the corresponding question. It is desired that the generated question should be (i) grammatically correct (ii) answerable from the passage and (iii) specific to the given answer. An analysis of existing AQG models shows that they produce questions which do not adhere to one or more of the above-mentioned qualities. In particular, the generated questions look like an incomplete draft of the desired question with a clear scope for refinement. To alleviate this shortcoming, we propose a method which tries to mimic the human process of generating questions by first creating an initial draft and then refining it. More specifically, we propose Refine Network (RefNet) which contains two decoders. The second decoder uses a dual attention network which pays attention to both (i) the original passage and (ii) the question (initial draft) generated by the first decoder. In effect, it refines the question generated by the first decoder, thereby making it more correct and complete. We evaluate RefNet on three datasets, viz., SQuAD, HOTPOT-QA, and DROP, and show that it outperforms existing state-of-the-art methods by 7-16% on all of these datasets. Lastly, we show that we can improve the quality of the second decoder on specific metrics, such as, fluency and answerability by explicitly rewarding revisions that improve on the corresponding metric during training. The code has been made publicly available .
D19-1326 D19-1326.Attachment.pdf 10.18653/v1/D19-1326 @@ -3827,7 +3827,7 @@ DongyeopKang LucasMentch EduardHovy - 3322–3333 + 3324–3335 Despite the recent developments on neural summarization systems, the underlying logic behind the improvements from the systems and its corpus-dependency remains largely unexplored. Position of sentences in the original text, for example, is a well known bias for news summarization. Following in the spirit of the claim that summarization is a combination of sub-functions, we define three sub-aspects of summarization: position, importance, and diversity and conduct an extensive analysis of the biases of each sub-aspect with respect to the domain of nine different summarization corpora (e.g., news, academic papers, meeting minutes, movie script, books, posts). We find that while position exhibits substantial bias in news articles, this is not the case, for example, with academic papers and meeting minutes. Furthermore, our empirical study shows that different types of summarization systems (e.g., neural-based) are composed of different degrees of the sub-aspects. Our study provides useful lessons regarding consideration of underlying sub-aspects when collecting a new summarization dataset or developing a new system. D19-1327 D19-1327.Attachment.pdf @@ -3838,7 +3838,7 @@ YovaKementchedjhieva MareikeHartmann AndersSøgaard - 3334–3339 + 3336–3341 The task of bilingual dictionary induction (BDI) is commonly used for intrinsic evaluation of cross-lingual word embeddings. The largest dataset for BDI was generated automatically, so its quality is dubious. We study the composition and quality of the test sets for five diverse languages from this dataset, with concerning findings: (1) a quarter of the data consists of proper nouns, which can be hardly indicative of BDI performance, and (2) there are pervasive gaps in the gold-standard targets. These issues appear to affect the ranking between cross-lingual embedding systems on individual languages, and the overall degree to which the systems differ in performance. With proper nouns removed from the data, the margin between the top two systems included in the study grows from 3.4% to 17.2%. Manual verification of the predictions, on the other hand, reveals that gaps in the gold standard targets artificially inflate the margin between the two systems on English to Bulgarian BDI from 0.1% to 6.7%. We thus suggest that future research either avoids drawing conclusions from quantitative results on this BDI dataset, or accompanies such evaluation with rigorous error analysis. D19-1328 D19-1328.Attachment.zip @@ -3849,7 +3849,7 @@ KatharinaKann KyunghyunCho Samuel R.Bowman - 3340–3347 + 3342–3349 Development sets are impractical to obtain for real low-resource languages, since using all available data for training is often more effective. However, development sets are widely used in research papers that purport to deal with low-resource natural language processing (NLP). Here, we aim to answer the following questions: Does using a development set for early stopping in the low-resource setting influence results as compared to a more realistic alternative, where the number of training epochs is tuned on development languages? And does it lead to overestimation or underestimation of performance? We repeat multiple experiments from recent work on neural models for low-resource NLP and compare results for models obtained by training with and without development sets. 
On average over languages, absolute accuracy differs by up to 1.4%. However, for some languages and tasks, differences are as big as 18.0% accuracy. Our results highlight the importance of realistic experimental setups in the publication of low-resource NLP research results. D19-1329 D19-1329.Attachment.zip @@ -3862,7 +3862,7 @@ LongZhou YuchenLiu ChengqingZong - 3348–3353 + 3350–3355 In this paper, we introduce a novel interactive approach to translate a source language into two different languages simultaneously and interactively. Specifically, the generation of one language relies on not only previously generated outputs by itself, but also the outputs predicted in the other language. Experimental results on IWSLT and WMT datasets demonstrate that our method can obtain significant improvements over both conventional Neural Machine Translation (NMT) model and multilingual NMT model. D19-1330 10.18653/v1/D19-1330 @@ -3871,7 +3871,7 @@ On <fixed-case>NMT</fixed-case> Search Errors and Model Errors: Cat Got Your Tongue? FelixStahlberg BillByrne - 3354–3360 + 3356–3362 We report on search errors and model errors in neural machine translation (NMT). We present an exact inference procedure for neural sequence models based on a combination of beam search and depth-first search. We use our exact search to find the global best model scores under a Transformer base model for the entire WMT15 English-German test set. Surprisingly, beam search fails to find these global best model scores in most cases, even with a very large beam size of 100. For more than 50% of the sentences, the model in fact assigns its global best score to the empty translation, revealing a massive failure of neural models in properly accounting for adequacy. We show by constraining search with a minimum translation length that at the root of the problem of empty translations lies an inherent bias towards shorter translations. We conclude that vanilla NMT in its current form requires just the right amount of beam search errors, which, from a modelling perspective, is a highly unsatisfactory conclusion indeed, as the model often prefers an empty translation. D19-1331 10.18653/v1/D19-1331 @@ -3882,7 +3882,7 @@ DanielKhashabi QiangNing DanRoth - 3361–3367 + 3363–3369 Understanding time is crucial for understanding events expressed in natural language. Because people rarely say the obvious, it is often necessary to have commonsense knowledge about various temporal aspects of events, such as duration, frequency, and temporal order. However, this important problem has so far received limited attention. This paper systematically studies this temporal commonsense problem. Specifically, we define five classes of temporal commonsense, and use crowdsourcing to develop a new dataset, MCTACO, that serves as a test set for this task. We find that the best current methods used on MCTACO are still far behind human performance, by about 20%, and discuss several directions for improvement. We hope that the new dataset and our study here can foster more future research on this topic. D19-1332 D19-1332.Attachment.zip @@ -3892,7 +3892,7 @@ <fixed-case>QAI</fixed-case>nfomax: Learning Robust Question Answering System by Mutual Information Maximization Yi-TingYeh Yun-NungChen - 3368–3373 + 3370–3375 Standard accuracy metrics indicate that modern reading comprehension systems have achieved strong performance in many question answering datasets. 
However, the extent to which these systems truly understand language remains unknown, and existing systems are not good at distinguishing distractor sentences, which look related but do not answer the question. To address this problem, we propose QAInfomax as a regularizer in reading comprehension systems by maximizing mutual information among passages, a question, and its answer. QAInfomax helps regularize the model to not simply learn the superficial correlation for answering the questions. The experiments show that our proposed QAInfomax achieves the state-of-the-art performance on the benchmark Adversarial-SQuAD dataset. D19-1333 D19-1333.Attachment.zip @@ -3906,7 +3906,7 @@ LeiHou JuanziLi ZhiyuanLiu - 3374–3379 + 3376–3381 Multi-hop knowledge graph (KG) reasoning is an effective and explainable method for predicting the target entity via reasoning paths in the query answering (QA) task. Most previous methods assume that every relation in KGs has enough triples for training, regardless of those few-shot relations which cannot provide sufficient triples for training robust reasoning models. In fact, the performance of existing multi-hop reasoning methods drops significantly on few-shot relations. In this paper, we propose a meta-based multi-hop reasoning method (Meta-KGR), which adopts meta-learning to learn effective meta parameters from high-frequency relations that could quickly adapt to few-shot relations. We evaluate Meta-KGR on two public datasets sampled from Freebase and NELL, and the experimental results show that Meta-KGR outperforms state-of-the-art methods in few-shot scenarios. In the future, our code and datasets will also be made available to provide more details. D19-1334 10.18653/v1/D19-1334 @@ -3918,7 +3918,7 @@ AdamTrischler KaheerSuleman Jackie Chi KitCheung - 3380–3385 + 3382–3387 Recent studies have significantly improved the state-of-the-art on common-sense reasoning (CSR) benchmarks like the Winograd Schema Challenge (WSC) and SWAG. The question we ask in this paper is whether improved performance on these benchmarks represents genuine progress towards common-sense-enabled systems. We make case studies of both benchmarks and design protocols that clarify and qualify the results of previous work by analyzing threats to the validity of previous experimental designs. Our protocols account for several properties prevalent in common-sense benchmarks including size limitations, structural regularities, and variable instance difficulty. D19-1335 D19-1335.Attachment.pdf @@ -3933,7 +3933,7 @@ BaobaoChang ZhifangSui XuSun - 3386–3391 + 3388–3393 In this paper, we focus on the task of generating a pun sentence given a pair of word senses. A major challenge for pun generation is the lack of a large-scale pun corpus to guide supervised learning. To remedy this, we propose an adversarial generative network for pun generation (Pun-GAN). It consists of a generator to produce pun sentences, and a discriminator to distinguish between the generated pun sentences and the real sentences with specific word senses. The output of the discriminator is then used as a reward to train the generator via reinforcement learning, encouraging it to produce pun sentences which can support two word senses simultaneously. Experiments show that the proposed Pun-GAN can generate sentences that are more ambiguous and diverse in both automatic and human evaluation.
D19-1336 10.18653/v1/D19-1336 @@ -3943,7 +3943,7 @@ WenjieZhou MinghuaZhang YunfangWu - 3392–3397 + 3394–3399 This paper explores the task of answer-aware question generation. Based on the attention-based pointer generator model, we propose to incorporate an auxiliary task of language modeling to help question generation in a hierarchical multi-task learning structure. Our joint-learning model enables the encoder to learn a better representation of the input sequence, which will guide the decoder to generate more coherent and fluent questions. On both the SQuAD and MARCO datasets, our multi-task learning model boosts the performance, achieving state-of-the-art results. Moreover, human evaluation further proves the high quality of our generated questions. D19-1337 10.18653/v1/D19-1337 @@ -3953,7 +3953,7 @@ FlorianSchmidt StephanMandt ThomasHofmann - 3398–3404 + 3400–3406 Autoregressive state transitions, where predictions are conditioned on past predictions, are the predominant choice for both deterministic and stochastic sequential models. However, autoregressive feedback exposes the evolution of the hidden state trajectory to potential biases from well-known train-test discrepancies. In this paper, we combine a latent state space model with a CRF observation model. We argue that such autoregressive observation models form an interesting middle ground that expresses local correlations on the word level but keeps the state evolution non-autoregressive. On unconditional sentence generation, we show performance improvements compared to RNN and GAN baselines while avoiding some prototypical failure modes of autoregressive models. D19-1338 D19-1338.Attachment.pdf @@ -3965,7 +3965,7 @@ Kai-WeiChang PremkumarNatarajan NanyunPeng - 3405–3410 + 3407–3412 We present a systematic study of biases in natural language generation (NLG) by analyzing text generated from prompts that contain mentions of different demographic groups. In this work, we introduce the notion of the regard towards a demographic, use the varying levels of regard towards different demographics as a defining metric for bias in NLG, and analyze the extent to which sentiment scores are a relevant proxy metric for regard. To this end, we collect strategically-generated text from language models and manually annotate the text with both sentiment and regard scores. Additionally, we build an automatic regard classifier through transfer learning, so that we can analyze biases in unseen text. Together, these methods reveal the extent of the biased nature of language model generations. Our analysis provides a study of biases in NLG, bias metrics and correlated human judgments, and empirical evidence on the usefulness of our annotated dataset. D19-1339 D19-1339.Attachment.zip @@ -3977,7 +3977,7 @@ MithunPaul RebeccaSharp MihaiSurdeanu - 3411–3416 + 3413–3418 While neural networks produce state-of-the-art performance in many NLP tasks, they generally learn from lexical information, which may transfer poorly between domains. Here, we investigate the importance that a model assigns to various aspects of data while learning and making predictions, specifically, in a recognizing textual entailment (RTE) task. By inspecting the attention weights assigned by the model, we confirm that most of the weights are assigned to noun phrases. To mitigate this dependence on lexicalized information, we experiment with two strategies of masking.
First, we replace named entities with their corresponding semantic tags along with a unique identifier to indicate lexical overlap between claim and evidence. Second, we similarly replace other word classes in the sentence (nouns, verbs, adjectives, and adverbs) with their super sense tags (Ciaramita and Johnson, 2003). Our results show that, while performance on the in-domain dataset remains on par with that of the model trained on fully lexicalized data, it improves considerably when tested out of domain. For example, the performance of a state-of-the-art RTE model trained on the masked Fake News Challenge (Pomerleau and Rao, 2017) data and evaluated on Fact Extraction and Verification (Thorne et al., 2018) data improved by over 10% in accuracy score compared to the fully lexicalized model. D19-1340 10.18653/v1/D19-1340 @@ -3990,7 +3990,7 @@ DanielRoberto Filizzola Ortiz EnricoSantus ReginaBarzilay - 3417–3423 + 3419–3425 Fact verification requires validating a claim in the context of evidence. We show, however, that in the popular FEVER dataset this might not necessarily be the case. Claim-only classifiers perform competitively with top evidence-aware models. In this paper, we investigate the cause of this phenomenon, identifying strong cues for predicting labels solely based on the claim, without considering any evidence. We create an evaluation set that avoids those idiosyncrasies. The performance of FEVER-trained models significantly drops when evaluated on this test set. Therefore, we introduce a regularization method which alleviates the effect of bias in the training data, obtaining improvements on the newly created test set. This work is a step towards a more sound evaluation of reasoning capabilities in fact verification models. D19-1341 D19-1341.Attachment.pdf @@ -4001,7 +4001,7 @@ XingweiTan YiCai ChangxiZhu - 3424–3429 + 3426–3431 Aspect-level sentiment classification, which is a fine-grained sentiment analysis task, has received a lot of attention in recent years. It is a common phenomenon that people express both positive and negative sentiments towards an aspect at the same time. Such opinions with conflicting sentiments, however, are ignored by existing studies, which design models based on their absence. We argue that the exclusion of conflict opinions is problematic, for the reason that it represents an important style of human thinking – dialectic thinking. If a real-world sentiment classification system ignores the existence of conflict opinions when it is designed, it will incorrectly mix conflict opinions into other sentiment polarity categories in practice. Existing models have problems when recognizing conflicting opinions, such as data sparsity. In this paper, we propose a multi-label classification model with a dual attention mechanism to address these problems. D19-1342 D19-1342.Attachment.zip @@ -4013,7 +4013,7 @@ Liang-ChihYu K. RobertLai XuejieZhang - 3430–3435 + 3432–3437 Deep neural network models such as long short-term memory (LSTM) and tree-LSTM have been proven to be effective for sentiment analysis. However, sequential LSTM is a biased model wherein the words in the tail of a sentence are more heavily emphasized than those at the head when building sentence representations. Even tree-LSTM, with useful structural information, could not avoid the bias problem, because the root node will be dominant and the nodes at the bottom of the parse tree will be less emphasized even though they may contain salient information.
To overcome the bias problem, this study proposes a capsule tree-LSTM model, introducing a dynamic routing algorithm as an aggregation layer to build sentence representations by assigning different weights to nodes according to their contributions to prediction. Experiments on the Stanford Sentiment Treebank (SST) for sentiment classification and EmoBank for regression show that the proposed method improved the performance of tree-LSTM and other neural network models. In addition, the deeper the tree structure, the bigger the improvement. D19-1343 10.18653/v1/D19-1343 @@ -4022,7 +4022,7 @@ A Label Informative Wide & Deep Classifier for Patents and Papers MuyaoNiu JieCai - 3436–3441 + 3438–3443 In this paper, we provide a simple and effective baseline for classifying both patents and papers into the well-established Cooperative Patent Classification (CPC). We propose a label-informative classifier based on the Wide & Deep structure, where the Wide part encodes string-level similarities between texts and labels, and the Deep part captures semantic-level similarities via non-linear transformations. Our model trains on millions of patents, and transfers to papers by developing a distantly supervised training set and domain-specific features. Extensive experiments show that our model achieves comparable performance to the state-of-the-art model used in industry on both patents and papers. The output of this work should facilitate the searching, granting and filing of innovative ideas for patent examiners, attorneys and researchers. D19-1344 D19-1344.Attachment.zip @@ -4035,7 +4035,7 @@ SujianLi XiaodongZhang HoufengWang - 3442–3448 + 3444–3450 Recently, researchers have explored graph neural network (GNN) techniques for text classification, since GNNs do well in handling complex structures and preserving global information. However, previous GNN-based methods face the practical problems of a fixed corpus-level graph structure, which does not support online testing, and high memory consumption. To tackle these problems, we propose a new GNN-based model that builds a graph for each input text with global parameter sharing, instead of a single graph for the whole corpus. This method removes the dependence between an individual text and the entire corpus, which supports online testing while still preserving global information. Besides, we build graphs using much smaller windows in the text, which not only extracts more local features but also significantly reduces the number of edges and the memory consumption. Experiments show that our model outperforms existing models on several text classification datasets even while consuming less memory. D19-1345 10.18653/v1/D19-1345 @@ -4045,7 +4045,7 @@ AhmedSabir FrancescMoreno LluísPadró - 3449–3455 + 3451–3457 Applications such as textual entailment, plagiarism detection or document clustering rely on the notion of semantic similarity, and are usually approached with dimension reduction techniques like LDA or with embedding-based neural approaches. We present a scenario where semantic similarity is not enough, and we devise a neural approach to learn semantic relatedness. The scenario is text spotting in the wild, where a text in an image (e.g. street sign, advertisement or bus destination) must be identified and recognized. Our goal is to improve the performance of vision systems by leveraging semantic information.
Our rationale is that the text to be spotted is often related to the image context in which it appears (word pairs such as Delta-airplane, or quarters-parking are not similar, but are clearly related). We show how learning a word-to-word or word-to-sentence relatedness score can improve the performance of text spotting systems up to 2.9 points, outperforming other measures on a benchmark dataset. D19-1346 10.18653/v1/D19-1346 @@ -4054,7 +4054,7 @@ Delta-training: Simple Semi-Supervised Text Classification using Pretrained Word Embeddings HwiyeolJo CeydaCinarel - 3456–3461 + 3458–3463 We propose a novel and simple method for semi-supervised text classification. The method stems from the hypothesis that a classifier with pretrained word embeddings always outperforms the same classifier with randomly initialized word embeddings, as empirically observed in NLP tasks. Our method first builds two sets of classifiers as a form of model ensemble, and then initializes their word embeddings differently: one using random, the other using pretrained word embeddings. We focus on different predictions between the two classifiers on unlabeled data while following the self-training framework. We also use early stopping at the meta-epoch level to improve the performance of our method. Our method, Delta-training, outperforms the self-training and the co-training frameworks on 4 different text classification datasets, showing robustness against error accumulation. D19-1347 10.18653/v1/D19-1347 @@ -4063,7 +4063,7 @@ Visual Detection with Context for Document Layout Analysis CarlosSoto ShinjaeYoo - 3462–3468 + 3464–3470 We present 1) a work-in-progress method to visually segment key regions of scientific articles using an object detection technique augmented with contextual features, and 2) a novel dataset of region-labeled articles. A continuing challenge in scientific literature mining is the difficulty of consistently extracting high-quality text from formatted PDFs. To address this, we adapt the object-detection technique Faster R-CNN for document layout detection, incorporating contextual information that leverages the inherently localized nature of article contents to improve the region detection performance. Due to the limited availability of high-quality region labels for scientific articles, we also contribute a novel dataset of region annotations, the first version of which covers 9 region classes and 822 article pages. Initial experimental results demonstrate a 23.9% absolute improvement in mean average precision over the baseline model by incorporating contextual features, and a processing speed 14x faster than a text-based technique. Ongoing work on further improvements is also discussed. D19-1348 D19-1348.Attachment.pdf @@ -4074,7 +4074,7 @@ LinziXing Michael J.Paul GiuseppeCarenini - 3469–3475 + 3471–3477 Probabilistic topic models such as latent Dirichlet allocation (LDA) are popularly used with Bayesian inference methods such as Gibbs sampling to learn posterior distributions over topic model parameters. We derive a novel measure of LDA topic quality using the variability of the posterior distributions. Compared to several existing baselines for automatic topic evaluation, the proposed metric achieves state-of-the-art correlations with human judgments of topic quality in experiments on three corpora. We additionally demonstrate that topic quality estimation can be further improved using a supervised estimator that combines multiple metrics.
D19-1349 D19-1349.Attachment.zip @@ -4088,7 +4088,7 @@ YuZhou RuifengXu YulanHe - 3476–3481 + 3478–3483 In recent years, advances in neural variational inference have achieved many successes in text processing. Examples include neural topic models, which are typically built upon the variational autoencoder (VAE) with an objective of minimising the error of reconstructing original documents based on the learned latent topic vectors. However, minimising reconstruction errors does not necessarily lead to high-quality topics. In this paper, we borrow the idea of reinforcement learning and incorporate topic coherence measures as reward signals to guide the learning of a VAE-based topic model. Furthermore, our proposed model is able to automatically separate background words from topic words dynamically, thus eliminating the pre-processing step of filtering infrequent and/or top frequent words, typically required for learning traditional topic models. Experimental results on the 20 Newsgroups and the NIPS datasets show superior performance on both perplexity and topic coherence measures compared to state-of-the-art neural topic models. D19-1350 10.18653/v1/D19-1350 @@ -4097,7 +4097,7 @@ Modelling Stopping Criteria for Search Results using <fixed-case>P</fixed-case>oisson Processes AlisonSneyd MarkStevenson - 3482–3487 + 3484–3489 Text retrieval systems often return large sets of documents, particularly when applied to large collections. Stopping criteria can reduce the number of these documents that need to be manually evaluated for relevance by predicting when a suitable level of recall has been achieved. In this work, a novel method for determining a stopping criterion is proposed that models the rate at which relevant documents occur using a Poisson process. This method allows a user to specify both a minimum desired level of recall to achieve and a desired probability of having achieved it. We evaluate our method on a public dataset and compare it with previous techniques for determining stopping criteria. D19-1351 10.18653/v1/D19-1351 @@ -4108,7 +4108,7 @@ WeiYang HaotianZhang JimmyLin - 3488–3494 + 3490–3496 This paper applies BERT to ad hoc document retrieval on news articles, which requires addressing two challenges: relevance judgments in existing test collections are typically provided only at the document level, and documents often exceed the length that BERT was designed to handle. Our solution is to aggregate sentence-level evidence to rank documents. Furthermore, we are able to leverage passage-level relevance judgments fortuitously available in other domains to fine-tune BERT models that are able to capture cross-domain notions of relevance, and can be directly used for ranking news articles. Our simple neural ranking models achieve state-of-the-art effectiveness on three standard test collections. D19-1352 10.18653/v1/D19-1352 @@ -4121,7 +4121,7 @@ PratikMehta W. BruceCroft ScottMiller - 3495–3500 + 3497–3502 When performing cross-language information retrieval (CLIR) for lower-resourced languages, a common approach is to retrieve over the output of machine translation (MT). However, there is no established guidance on how to optimize the resulting MT-IR system. In this paper, we examine the relationship between the performance of MT systems and both neural and term frequency-based IR models to identify how CLIR performance can be best predicted from MT quality.
We explore performance at varying amounts of MT training data, byte pair encoding (BPE) merge operations, and across two IR collections and retrieval models. We find that the choice of IR collection can substantially affect the predictive power of MT tuning decisions and evaluation, potentially introducing dissociations between MT-only and overall CLIR performance. D19-1353 D19-1353.Attachment.zip @@ -4130,7 +4130,7 @@ Rotate King to get Queen: Word Relationships as Orthogonal Transformations in Embedding Space KawinEthayarajh - 3501–3506 + 3503–3508 A notable property of word embeddings is that word relationships can exist as linear substructures in the embedding space. For example, ‘gender’ corresponds to v_woman - v_man and v_queen - v_king. This, in turn, allows word analogies to be solved arithmetically: v_king - v_man + v_woman = v_queen. This property is notable because it suggests that models trained on word embeddings can easily learn such relationships as geometric translations. However, there is no evidence that models exclusively represent relationships in this manner. We document an alternative way in which downstream models might learn these relationships: orthogonal and linear transformations. For example, given a translation vector for ‘gender’, we can find an orthogonal matrix R, representing a rotation and reflection, such that R(v_king) = v_queen and R(v_man) = v_woman. Analogical reasoning using orthogonal transformations is almost as accurate as using vector arithmetic; using linear transformations is more accurate than both. Our findings suggest that these transformations can be as good a representation of word relationships as translation vectors. D19-1354 10.18653/v1/D19-1354 @@ -4141,7 +4141,7 @@ ChiSun XipengQiu XuanjingHuang - 3507–3512 + 3509–3514 Word Sense Disambiguation (WSD) aims to find the exact sense of an ambiguous word in a particular context. Traditional supervised methods rarely take into consideration the lexical resources like WordNet, which are widely utilized in knowledge-based methods. Recent studies have shown the effectiveness of incorporating gloss (sense definition) into neural networks for WSD. However, compared with traditional word expert supervised methods, they have not achieved much improvement. In this paper, we focus on how to better leverage gloss knowledge in a supervised neural WSD system. We construct context-gloss pairs and propose three BERT based models for WSD. We fine-tune the pre-trained BERT model and achieve new state-of-the-art results on WSD task. D19-1355 10.18653/v1/D19-1355 @@ -4154,7 +4154,7 @@ BeiChen Jian-GuangLou ZhoujunLi - 3513–3518 + 3515–3520 One key component in text-to-SQL is to predict the comparison relations between columns and their values. To the best of our knowledge, no existing models explicitly introduce external common knowledge to address this problem, thus their capabilities of predicting comparison relations are limited beyond training data. In this paper, we propose to leverage adjective-noun phrasing knowledge mined from the web to predict the comparison relations in text-to-SQL. Experimental results on both the original and the re-split Spider dataset show that our approach achieves significant improvement over state-of-the-art methods on comparison relation prediction. 
D19-1356 10.18653/v1/D19-1356 @@ -4164,7 +4164,7 @@ KokiWashio SatoshiSekine TsuneakiKato - 3519–3525 + 3521–3527 Definition modeling includes acquiring word embeddings from dictionary definitions and generating definitions of words. While the meanings of defining words are important in dictionary definitions, it is crucial to capture the lexical semantic relations between defined words and defining words. However, thus far, the utilization of such relations has not been explored for definition modeling. In this paper, we propose definition modeling methods that use lexical semantic relations. To utilize implicit semantic relations in definitions, we use pattern-based word-pair embeddings, obtained without supervision, that represent semantic relations of word pairs. Experimental results indicate that our methods improve the performance in learning embeddings from definitions, as well as in definition generation. D19-1357 D19-1357.Attachment.zip @@ -4175,7 +4175,7 @@ Kang MinYoo TaeukKim Sang-gooLee - 3526–3531 + 3528–3533 We propose a simple yet effective approach for improving Korean word representations using additional linguistic annotation (i.e. Hanja). We employ cross-lingual transfer learning in training word representations by leveraging the fact that Hanja is closely related to Chinese. We evaluate the intrinsic quality of representations learned through our approach using the word analogy and similarity tests. In addition, we demonstrate their effectiveness on several downstream tasks, including a novel Korean news headline generation task. D19-1358 D19-1358.Attachment.zip @@ -4187,7 +4187,7 @@ FedericoScozzafava FedericoMartelli RobertoNavigli - 3532–3538 + 3534–3540 Current research in knowledge-based Word Sense Disambiguation (WSD) indicates that performance depends heavily on the Lexical Knowledge Base (LKB) employed. This paper introduces SyntagNet, a novel resource consisting of manually disambiguated lexical-semantic combinations. By capturing sense distinctions evoked by syntagmatic relations, SyntagNet enables knowledge-based WSD systems to establish a new state of the art which challenges the hitherto unrivaled performances attained by supervised approaches. To the best of our knowledge, SyntagNet is the first large-scale manually-curated resource of this kind made available to the community (at http://syntagnet.org). D19-1359 10.18653/v1/D19-1359 @@ -4199,7 +4199,7 @@ JaminShin ZihanLiu PascaleFung - 3539–3545 + 3541–3547 In countries that speak multiple main languages, mixing up different languages within a conversation is commonly called code-switching. Previous works addressing this challenge mainly focused on word-level aspects such as word embeddings. However, in many cases, languages share common subwords, especially for closely related languages, but also for languages that are seemingly unrelated. Therefore, we propose Hierarchical Meta-Embeddings (HME) that learn to combine multiple monolingual word-level and subword-level embeddings to create language-agnostic lexical representations. On the task of Named Entity Recognition for English-Spanish code-switching data, our model achieves the state-of-the-art performance in the multilingual settings. We also show that, in cross-lingual settings, our model not only leverages closely related languages, but also learns from languages with different roots. Finally, we show that combining different subunits is crucial for capturing code-switching entities.
D19-1360 10.18653/v1/D19-1360 @@ -4210,7 +4210,7 @@ YingmingLi MingChen ZhongfeiZhang - 3546–3551 + 3548–3553 In this paper, we develop a novel Sparse Self-Attention Fine-tuning model (referred to as SSAF) which integrates sparsity into the self-attention mechanism to enhance the fine-tuning performance of BERT. In particular, sparsity is introduced into the self-attention by replacing the softmax function with a controllable sparse transformation when fine-tuning with BERT. It enables us to learn a structurally sparse attention distribution, which leads to a more interpretable representation for the whole input. The proposed model is evaluated on sentiment analysis, question answering, and natural language inference tasks. The extensive experimental results across multiple datasets demonstrate its effectiveness and superiority to the baseline methods. D19-1361 10.18653/v1/D19-1361 @@ -4220,7 +4220,7 @@ LukasLange Michael A.Hedderich DietrichKlakow - 3552–3557 + 3554–3559 In low-resource settings, the performance of supervised labeling models can be improved with automatically annotated or distantly supervised data, which is cheap to create but often noisy. Previous works have shown that significant improvements can be reached by injecting information about the confusion between clean and noisy labels in this additional training data into the classifier training. However, for noise estimation, these approaches either do not take the input features (in our case word embeddings) into account, or they need to learn the noise modeling from scratch, which can be difficult in a low-resource setting. We propose to cluster the training data using the input features and then compute different confusion matrices for each cluster. To the best of our knowledge, our approach is the first to leverage feature-dependent noise modeling with pre-initialized confusion matrices. We evaluate on low-resource named entity recognition settings in several languages, showing that our methods improve upon other confusion-matrix based methods by up to 9%. D19-1362 10.18653/v1/D19-1362 @@ -4230,7 +4230,7 @@ HagaiTaitelbaum GalChechik JacobGoldberger - 3558–3563 + 3560–3565 In this paper we present a novel approach to simultaneously representing multiple languages in a common space. Procrustes Analysis (PA) is commonly used to find the optimal orthogonal word mapping in the bilingual case. The proposed Multi Pairwise Procrustes Analysis (MPPA) is a natural extension of the PA algorithm to multilingual word mapping. Unlike previous PA extensions that require a k-way dictionary, this approach requires only pairwise bilingual dictionaries that are much easier to construct. D19-1363 10.18653/v1/D19-1363 @@ -4244,7 +4244,7 @@ SaloniPotdar ShiyuChang MoYu - 3564–3570 + 3566–3572 Out-of-domain (OOD) detection for low-resource text classification is a realistic but understudied task. The goal is to detect the OOD cases with limited in-domain (ID) training data, since in machine learning applications we observe that training data is often insufficient. In this work, we propose an OOD-resistant Prototypical Network to tackle this zero-shot OOD detection and few-shot ID classification task. Evaluations on real-world datasets show that the proposed solution outperforms state-of-the-art methods in the zero-shot OOD detection task, while maintaining competitive performance on the ID classification task.
D19-1364 10.18653/v1/D19-1364 @@ -4256,7 +4256,7 @@ LiliMou ZhoujunLi WenhanChao - 3571–3576 + 3573–3578 Formality text style transfer plays an important role in various NLP applications, such as non-native speaker assistants and child education. Early studies normalized informal sentences with rules, before statistical and neural models became the prevailing methods in the field. While a rule-based system is still a common preprocessing step for formality style transfer in the neural era, it could introduce noise if we use the rules in a naive way such as data preprocessing. To mitigate this problem, we study how to harness rules into a state-of-the-art neural network that is typically pretrained on massive corpora. We propose three fine-tuning methods in this paper and achieve a new state-of-the-art on benchmark datasets. D19-1365 10.18653/v1/D19-1365 @@ -4268,7 +4268,7 @@ Hong-YouChen Chi-JenLu Shou-DeLin - 3577–3582 + 3579–3584 The objective of non-parallel text style transfer, or controllable text generation, is to alter specific attributes (e.g. sentiment, mood, tense, politeness, etc) of a given text while preserving its remaining attributes and content. The generative adversarial network (GAN) is a popular model for ensuring the transferred sentences are realistic and have the desired target styles. However, training GANs often suffers from the mode collapse problem, which causes the transferred text to be only loosely related to the original text. In this paper, we propose a new GAN model with a word-level conditional architecture and a two-phase training procedure. By using a style-related condition architecture before generating a word, our model is able to maintain style-unrelated words while changing the others. By separating the training procedure into reconstruction and transfer phases, our model is able to learn a proper text generation process, which further improves the content preservation. We test our model on polarity sentiment transfer and multiple-attribute transfer tasks. The empirical results show that our model achieves comparable evaluation scores in both transfer accuracy and fluency but significantly outperforms other state-of-the-art models in content compatibility on three real-world datasets. D19-1366 D19-1366.Attachment.zip @@ -4281,7 +4281,7 @@ TongXiao ChunliangZhang JingboZhu - 3583–3588 + 3585–3590 In this paper, we study differentiable neural architecture search (NAS) methods for natural language processing. In particular, we improve differentiable architecture search by removing the softmax-local constraint. Also, we apply differentiable NAS to named entity recognition (NER). It is the first time that differentiable NAS methods have been adopted in NLP tasks other than language modeling. On both the PTB language modeling and CoNLL-2003 English NER data, our method outperforms strong baselines. It achieves a new state-of-the-art on the NER task. D19-1367 10.18653/v1/D19-1367 @@ -4292,7 +4292,7 @@ MashaNaslidnyk DavePalfrey ArpitMittal - 3589–3594 + 3591–3596 Bilinear models such as DistMult and ComplEx are effective methods for knowledge graph (KG) completion. However, they require large batch sizes, which becomes a performance bottleneck when training on large-scale datasets due to memory constraints. In this paper we use occurrences of entity-relation pairs in the dataset to construct a joint learning model and to increase the quality of sampled negatives during training.
We show on three standard datasets that when these two techniques are combined, they give a significant improvement in performance, especially when the batch size and the number of generated negative examples are low relative to the size of the dataset. We then apply our techniques to a dataset containing 2 million entities and demonstrate that our model outperforms the baseline by 2.8% absolute on hits@1. D19-1368 D19-1368.Attachment.zip @@ -4301,7 +4301,7 @@ Single Training Dimension Selection for Word Embedding with <fixed-case>PCA</fixed-case> YuWang - 3595–3600 + 3597–3602 In this paper, we present a fast and reliable method based on PCA to select the number of dimensions for word embeddings. First, we train one embedding with a generous upper bound (e.g. 1,000) of dimensions. Then we transform the embeddings using PCA and incrementally remove the lesser dimensions one at a time while recording the embeddings’ performance on language tasks. Lastly, we select the number of dimensions, balancing model size and accuracy. Experiments using various datasets and language tasks demonstrate that we are able to train about 10 times fewer sets of embeddings while retaining optimal performance. Researchers interested in training the best-performing embeddings for downstream tasks, such as sentiment analysis, question answering and hypernym extraction, as well as those interested in embedding compression should find the method helpful. D19-1369 10.18653/v1/D19-1369 @@ -4313,7 +4313,7 @@ GrahamNeubig TaylorBerg-Kirkpatrick YimingYang - 3601–3612 + 3603–3614 When trained effectively, the Variational Autoencoder (VAE) is both a powerful language model and an effective representation learning framework. In practice, however, VAEs are trained with the evidence lower bound (ELBO) as a surrogate objective to the intractable marginal data likelihood. This approach to training yields unstable results, frequently leading to a disastrous local optimum known as posterior collapse. In this paper, we investigate a simple fix for posterior collapse which yields surprisingly effective results. The combination of two known heuristics, previously considered only in isolation, substantially improves held-out likelihood, reconstruction, and latent representation learning when compared with previous state-of-the-art methods. More interestingly, while our experiments demonstrate superiority on these principle evaluations, our method obtains a worse ELBO. We use these results to argue that the typical surrogate objective for VAEs may not be sufficient or necessarily appropriate for balancing the goals of representation learning and data distribution modeling. D19-1370 10.18653/v1/D19-1370 @@ -4323,7 +4323,7 @@ IzBeltagy KyleLo ArmanCohan - 3613–3618 + 3615–3620 Obtaining large-scale annotated data for NLP tasks in the scientific domain is challenging and expensive. We release SciBERT, a pretrained language model based on BERT (Devlin et. al., 2018) to address the lack of high-quality, large-scale labeled scientific data. SciBERT leverages unsupervised pretraining on a large multi-domain corpus of scientific publications to improve performance on downstream scientific NLP tasks. We evaluate on a suite of tasks including sequence tagging, sentence classification and dependency parsing, with datasets from a variety of scientific domains. We demonstrate statistically significant improvements over BERT and achieve new state-of-the-art results on several of these tasks. 
The code and pretrained models are available at https://github.com/allenai/scibert/. D19-1371 10.18653/v1/D19-1371 @@ -4332,7 +4332,7 @@ Humor Detection: A Transformer Gets the Last Laugh OrionWeller KevinSeppi - 3619–3623 + 3621–3625 Much previous work has been done in attempting to identify humor in text. In this paper we extend that capability by proposing a new task: assessing whether or not a joke is humorous. We present a novel way of approaching this problem by building a model that learns to identify humorous jokes based on ratings gleaned from Reddit pages, consisting of almost 16,000 labeled instances. Using these ratings to determine the level of humor, we then employ a Transformer architecture for its advantages in learning from sentence context. We demonstrate the effectiveness of this approach and show results that are comparable to human performance. We further demonstrate our model’s increased capabilities on humor identification problems, such as the previously created datasets for short jokes and puns. These experiments show that this method outperforms all previous work done on these tasks, with an F-measure of 93.1% for the Puns dataset and 98.6% on the Short Jokes dataset. D19-1372 10.18653/v1/D19-1372 @@ -4342,7 +4342,7 @@ Alham FikriAji KennethHeafield NikolayBogoychev - 3624–3629 + 3626–3631 One way to reduce network traffic in multi-node data-parallel stochastic gradient descent is to only exchange the largest gradients. However, doing so damages the gradient and degrades the model’s performance. Transformer models degrade dramatically while the impact on RNNs is smaller. We restore gradient quality by combining the compressed global gradient with the node’s locally computed uncompressed gradient. Neural machine translation experiments show that Transformer convergence is restored while RNNs converge faster. With our method, training on 4 nodes converges up to 1.5x as fast as with uncompressed gradients and scales 3.5x relative to single-node training. D19-1373 10.18653/v1/D19-1373 @@ -4355,7 +4355,7 @@ NaveenArivazhagan XinLi AmeliaArcher - 3630–3634 + 3632–3636 We propose a practical scheme to train a single multilingual sequence labeling model that yields state-of-the-art results and is small and fast enough to run on a single CPU. Starting from a public multilingual BERT checkpoint, our final model is 6x smaller and 27x faster, and has higher accuracy than a state-of-the-art multilingual baseline. We show that our model outperforms the baseline especially on low-resource languages, and works on codemixed input text without being explicitly trained on codemixed examples. We showcase the effectiveness of our method by reporting on part-of-speech tagging and morphological prediction on 70 treebanks and 48 languages. D19-1374 D19-1374.Attachment.zip @@ -4366,7 +4366,7 @@ ZijianZhao SuZhu KaiYu - 3635–3641 + 3637–3643 Spoken Language Understanding (SLU) converts user utterances into structured semantic representations. Data sparsity is one of the main obstacles for SLU due to the high cost of human annotation, especially when the domain changes or a new domain emerges. In this work, we propose a data augmentation method with atomic templates for SLU, which involves minimal human effort. The atomic templates produce exemplars for fine-grained constituents of semantic representations. We propose an encoder-decoder model to generate the whole utterance from atomic exemplars. Moreover, the generator can be transferred from source domains to help a new domain which has little data.
Experimental results show that our method achieves significant improvements on the DSTC 2&3 dataset, which is a domain adaptation setting for SLU. D19-1375 10.18653/v1/D19-1375 @@ -4376,7 +4376,7 @@ HaoPeng RoySchwartz Noah A.Smith - 3642–3649 + 3644–3651 We present PaLM, a hybrid parser and neural language model. Building on an RNN language model, PaLM adds an attention layer over text spans in the left context. An unsupervised constituency parser can be derived from its attention weights, using a greedy decoding algorithm. We evaluate PaLM on language modeling, and empirically show that it outperforms strong baselines. If syntactic annotations are available, the attention component can be trained in a supervised manner, providing syntactically-informed representations of the context, and further improving language modeling performance. D19-1376 D19-1376.Attachment.pdf @@ -4387,7 +4387,7 @@ QingkaiMin YuefengShi YueZhang - 3650–3656 + 3652–3658 The task of semantic parsing is highly useful for dialogue and question answering systems. Many datasets have been proposed to map natural language text into SQL, among which the recent Spider dataset provides cross-domain samples with multiple tables and complex queries. We build a Spider dataset for Chinese, which is currently a low-resource language in this task area. Interesting research questions arise from the uniqueness of the language, which requires word segmentation, and also from the fact that SQL keywords and columns of DB tables are typically written in English. We compare character- and word-based encoders for a semantic parser, and different embedding schemes. Results show that the word-based semantic parser is subject to segmentation errors, and that cross-lingual word embeddings are useful for text-to-SQL. D19-1377 10.18653/v1/D19-1377 @@ -4397,7 +4397,7 @@ BenBogin MattGardner JonathanBerant - 3657–3662 + 3659–3664 State-of-the-art semantic parsers rely on auto-regressive decoding, emitting one symbol at a time. When tested against complex databases that are unobserved at training time (zero-shot), the parser often struggles to select the correct set of database constants in the new database, due to the local nature of decoding. In this work, we propose a semantic parser that globally reasons about the structure of the output query to make a more contextually-informed selection of database constants. We use message-passing through a graph neural network to softly select a subset of database constants for the output query, conditioned on the question. Moreover, we train a model to rank queries based on the global alignment of database constants to question words. We apply our techniques to the current state-of-the-art model for Spider, a zero-shot semantic parsing dataset with complex databases, increasing accuracy from 39.4% to 47.4%. D19-1378 D19-1378.Attachment.pdf @@ -4408,7 +4408,7 @@ HirokiOuchi JunSuzuki KentaroInui - 3663–3669 + 3665–3671 In transductive learning, an unlabeled test set is used for model training. Although this setting deviates from the common assumption of a completely unseen test set, it is applicable in many real-world scenarios, wherein the texts to be processed are known in advance. However, despite its practical advantages, transductive learning is underexplored in natural language processing. Here we conduct an empirical study of transductive learning for neural models and demonstrate its utility in syntactic and semantic tasks.
Specifically, we fine-tune language models (LMs) on an unlabeled test set to obtain test-set-specific word representations. Through extensive experiments, we demonstrate that despite its simplicity, transductive LM fine-tuning consistently improves state-of-the-art neural models in in-domain and out-of-domain settings. D19-1379 D19-1379.Attachment.pdf @@ -4419,7 +4419,7 @@ NadaAlmarwani HananAldarmaki MonaDiab - 3670–3676 + 3672–3678 Vector averaging remains one of the most popular sentence embedding methods in spite of its obvious disregard for syntactic structure. While more complex sequential or convolutional networks potentially yield superior classification performance, the improvements in classification accuracy are typically mediocre compared to simple vector averaging. As an efficient alternative, we propose the use of the discrete cosine transform (DCT) to compress word sequences in an order-preserving manner. The lower-order DCT coefficients represent the overall feature patterns in sentences, which results in suitable embeddings for tasks that could benefit from syntactic features. Our results in semantic probing tasks demonstrate that DCT embeddings indeed preserve more syntactic information compared with vector averaging. With practically equivalent complexity, the model yields better overall performance in downstream classification tasks that correlate with syntactic features, which illustrates the capacity of DCT to preserve word order information. D19-1380 10.18653/v1/D19-1380 @@ -4429,7 +4429,7 @@ Kurt JunsheanEspinosa MakotoMiwa SophiaAnaniadou - 3677–3684 + 3679–3686 We tackle the nested and overlapping event detection task and propose a novel search-based neural network (SBNN) structured prediction model that treats the task as a search problem on a relation graph of trigger-argument structures. Unlike existing structured prediction tasks such as dependency parsing, the task aims to detect DAG structures, which constitute events, from the relation graph. We define actions to construct events and use all the beams in a beam search to detect all event structures that may be overlapping and nested. The search process constructs events in a bottom-up manner while modelling the global properties for nested and overlapping structures simultaneously using neural networks. We show that the model achieves performance comparable to the state-of-the-art model Turku Event Extraction System (TEES) on the BioNLP Cancer Genetics (CG) Shared Task 2013 without the use of any syntactic or hand-engineered features. Further analyses on the development set show that our model is more computationally efficient while yielding higher F1-score performance. D19-1381 10.18653/v1/D19-1381 @@ -4440,7 +4440,7 @@ YuanZhang ChrisTar JasonBaldridge - 3685–3690 + 3687–3692 Most existing work on adversarial data generation focuses on English. For example, PAWS (Paraphrase Adversaries from Word Scrambling) consists of challenging English paraphrase identification pairs from Wikipedia and Quora. We remedy this gap with PAWS-X, a new dataset of 23,659 human-translated PAWS evaluation pairs in six typologically distinct languages: French, Spanish, German, Chinese, Japanese, and Korean. We provide baseline numbers for three models with different capacity to capture non-local context and sentence structure, and using different multilingual training and evaluation regimes.
Multilingual BERT fine-tuned on PAWS English plus machine-translated data performs the best, with a range of 83.1–90.8% accuracy across the non-English languages and an average accuracy gain of 23% over the next best model. PAWS-X shows the effectiveness of deep, multilingual pre-training while also leaving considerable headroom as a new challenge to drive multilingual research that better captures structure and contextual information. D19-1382 10.18653/v1/D19-1382 @@ -4452,7 +4452,7 @@ DanielKing BhavanaDalvi DanWeld - 3691–3697 + 3693–3699 As a step toward better document-level understanding, we explore classification of a sequence of sentences into their corresponding categories, a task that requires understanding sentences in the context of the document. Recent successful models for this task have used hierarchical models to contextualize sentence representations, and Conditional Random Fields (CRFs) to incorporate dependencies between subsequent labels. In this work, we show that pretrained language models, BERT (Devlin et al., 2018) in particular, can be used for this task to capture contextual dependencies without the need for hierarchical encoding or a CRF. Specifically, we construct a joint sentence representation that allows BERT Transformer layers to directly utilize contextual information from all words in all sentences. Our approach achieves state-of-the-art results on four datasets, including a new dataset of structured scientific abstracts. D19-1383 10.18653/v1/D19-1383 @@ -4462,7 +4462,7 @@ LauraHarding Graesser KyunghyunCho DouweKiela - 3698–3708 + 3700–3710 We describe a multi-agent communication framework for examining high-level linguistic phenomena at the community level. We demonstrate that complex linguistic behavior observed in natural language can be reproduced in this simple setting: i) the outcome of contact between communities is a function of inter- and intra-group connectivity; ii) linguistic contact either converges to the majority protocol, or in balanced cases leads to novel creole languages of lower complexity; and iii) a linguistic continuum emerges where neighboring languages are more mutually intelligible than farther removed languages. We conclude that at least some of the intricate properties of language evolution need not depend on complex evolved linguistic capabilities, but can emerge from simple social exchanges between perceptually-enabled agents playing communication games. D19-1384 10.18653/v1/D19-1384 @@ -4471,7 +4471,7 @@ <fixed-case>T</fixed-case>alk<fixed-case>D</fixed-case>own: A Corpus for Condescension Detection in Context ZijianWang ChristopherPotts - 3709–3717 + 3711–3719 Condescending language use is caustic; it can bring dialogues to an end and bifurcate communities. Thus, systems for condescension detection could have a large positive impact. A challenge here is that condescension is often impossible to detect from isolated utterances, as it depends on the discourse and social context. To address this, we present TalkDown, a new labeled dataset of condescending linguistic acts in context. We show that extending a language-only model with representations of the discourse improves performance, and we motivate techniques for dealing with the low rates of condescension overall. We also use our model to estimate condescension rates in various online communities and relate these differences to differing community norms.
D19-1385 10.18653/v1/D19-1385 @@ -4480,7 +4480,7 @@ Summary Cloze: A New Task for Content Selection in Topic-Focused Summarization DanielDeutsch DanRoth - 3718–3727 + 3720–3729 A key challenge in topic-focused summarization is determining what information should be included in the summary, a problem known as content selection. In this work, we propose a new method for studying content selection in topic-focused summarization called the summary cloze task. The goal of the summary cloze task is to generate the next sentence of a summary conditioned on the beginning of the summary, a topic, and reference document(s). The main challenge is deciding what information in the references is relevant to the topic and partial summary and should be included in the summary. Although the cloze task does not address all aspects of the traditional summarization problem, the narrower scope of the task allows us to collect a large-scale dataset of nearly 500k summary cloze instances from Wikipedia. We report experimental results on this new dataset using various extractive models and a two-step abstractive model that first extractively selects a small number of sentences and then abstractively summarizes them. Our results show that the topic and partial summary help the models identify relevant content, but the task remains a significant challenge. D19-1386 D19-1386.Attachment.zip @@ -4490,7 +4490,7 @@ Text Summarization with Pretrained Encoders YangLiu MirellaLapata - 3728–3738 + 3730–3740 Bidirectional Encoder Representations from Transformers (BERT) represents the latest incarnation of pretrained language models which have recently advanced a wide range of natural language processing tasks. In this paper, we showcase how BERT can be usefully applied in text summarization and propose a general framework for both extractive and abstractive models. We introduce a novel document-level encoder based on BERT which is able to express the semantics of a document and obtain representations for its sentences. Our extractive model is built on top of this encoder by stacking several inter-sentence Transformer layers. For abstractive summarization, we propose a new fine-tuning schedule which adopts different optimizers for the encoder and the decoder as a means of alleviating the mismatch between the two (the former is pretrained while the latter is not). We also demonstrate that a two-stage fine-tuning approach can further boost the quality of the generated summaries. Experiments on three datasets show that our model achieves state-of-the-art results across the board in both extractive and abstractive settings. D19-1387 D19-1387.Attachment.pdf @@ -4504,7 +4504,7 @@ ZhangmingChan DongyanZhao RuiYan - 3739–3749 + 3741–3751 Under special circumstances, summaries should conform to a particular style with patterns, such as court judgments and abstracts in academic papers. To this end, the prototype document-summary pairs can be utilized to generate better summaries. There are two main challenges in this task: (1) the model needs to incorporate learned patterns from the prototype, but (2) should avoid copying contents other than the patternized words—such as irrelevant facts—into the generated summaries. To tackle these challenges, we design a model named Prototype Editing based Summary Generator (PESG). PESG first learns summary patterns and prototype facts by analyzing the correlation between a prototype document and its summary. Prototype facts are then utilized to help extract facts from the input document.
Next, an editing generator generates a new summary based on the summary pattern or extracted facts. Finally, to address the second challenge, a fact checker is used to estimate mutual information between the input document and generated summary, providing an additional signal for the generator. Extensive experiments conducted on a large-scale real-world text summarization dataset show that PESG achieves the state-of-the-art performance in terms of both automatic metrics and human evaluations. D19-1388 10.18653/v1/D19-1388 @@ -4515,7 +4515,7 @@ AriHoltzman JanBuys YejinChoi - 3750–3759 + 3752–3761 The principle of the Information Bottleneck (Tishby et al., 1999) produces a summary of information X optimized to predict some other relevant information Y. In this paper, we propose a novel approach to unsupervised sentence summarization by mapping the Information Bottleneck principle to a conditional language modelling objective: given a sentence, our approach seeks a compressed sentence that can best predict the next sentence. Our iterative algorithm under the Information Bottleneck objective searches over gradually shorter subsequences of the given sentence while maximizing the probability of the next sentence conditioned on the summary. Using only pretrained language models with no direct supervision, our approach can efficiently perform extractive sentence summarization over a large corpus. Building on our unsupervised extractive summarization, we also present a new approach to self-supervised abstractive summarization, where a transformer-based language model is trained on the output summaries of our unsupervised method. Empirical results demonstrate that our extractive method outperforms other unsupervised models on multiple automatic metrics. In addition, we find that our self-supervised abstractive model outperforms unsupervised baselines (including our own) by human evaluation along multiple attributes. D19-1389 D19-1389.Attachment.zip @@ -4527,7 +4527,7 @@ YangZhao HuiSu DietrichKlakow - 3760–3771 + 3762–3773 Pointer Generators have been the de facto standard for modern summarization systems. However, this architecture faces two major drawbacks: Firstly, the pointer is limited to copying the exact words while ignoring possible inflections or abstractions, which restricts its power of capturing richer latent alignment. Secondly, the copy mechanism results in a strong bias towards extractive generations, where most sentences are produced by simply copying from the source text. In this paper, we address these problems by allowing the model to “edit” pointed tokens instead of always hard copying them. The editing is performed by transforming the pointed word vector into a target space with a learned relation embedding. On three large-scale summarization datasets, we show the model is able to (1) capture more latent alignment relations than exact word matches, (2) improve word alignment accuracy, allowing for better model interpretation and control, (3) generate higher-quality summaries validated by both qualitative and quantitative evaluations, and (4) bring more abstraction to the generated summaries. D19-1390 D19-1390.Attachment.pdf @@ -4538,7 +4538,7 @@ BailinWang IvanTitov MirellaLapata - 3772–3783 + 3774–3785 Semantic parsing aims to map natural language utterances onto machine-interpretable meaning representations, aka programs whose execution against a real-world environment produces a denotation.
Weakly-supervised semantic parsers are trained on utterance-denotation pairs treating programs as latent. The task is challenging due to the large search space and spuriousness of programs which may execute to the correct answer but do not generalize to unseen examples. Our goal is to instill an inductive bias in the parser to help it distinguish between spurious and correct programs. We capitalize on the intuition that correct programs would likely respect certain structural constraints were they to be aligned to the question (e.g., program fragments are unlikely to align to overlapping text spans) and propose to model alignments as structured latent variables. In order to make the latent-alignment framework tractable, we decompose the parsing task into (1) predicting a partial “abstract program” and (2) refining it while modeling structured alignments with differential dynamic programming. We obtain state-of-the-art performance on the WikiTableQuestions and WikiSQL datasets. When compared to a standard attention baseline, we observe that the proposed structured-alignment mechanism is highly beneficial. D19-1391 10.18653/v1/D19-1391 @@ -4549,7 +4549,7 @@ XutaiMa KevinDuh BenjaminVan Durme - 3784–3796 + 3786–3798 We unify different broad-coverage semantic parsing tasks into a transduction parsing paradigm, and propose an attention-based neural transducer that incrementally builds meaning representation via a sequence of semantic relations. By leveraging multiple attention mechanisms, the neural transducer can be effectively trained without relying on a pre-trained aligner. Experiments separately conducted on three broad-coverage semantic parsing tasks – AMR, SDP and UCCA – demonstrate that our attention-based neural transducer improves the state of the art on both AMR and UCCA, and is competitive with the state of the art on SDP. D19-1392 10.18653/v1/D19-1392 @@ -4558,7 +4558,7 @@ Core Semantic First: A Top-down Approach for <fixed-case>AMR</fixed-case> Parsing DengCai WaiLam - 3797–3807 + 3799–3809 We introduce a novel scheme for parsing a piece of text into its Abstract Meaning Representation (AMR): Graph Spanning based Parsing (GSP). One novel characteristic of GSP is that it constructs a parse graph incrementally in a top-down fashion. Starting from the root, at each step, a new node and its connections to existing nodes will be jointly predicted. The output graph spans the nodes by the distance to the root, following the intuition of first grasping the main ideas then digging into more details. The core semantic first principle emphasizes capturing the main ideas of a sentence, which is of great interest. We evaluate our model on the latest AMR sembank and achieve the state-of-the-art performance in the sense that no heuristic graph re-categorization is adopted. More importantly, the experiments show that our parser is especially good at obtaining the core semantics. D19-1393 D19-1393.Attachment.pdf @@ -4568,7 +4568,7 @@ Don’t paraphrase, detect! Rapid and Effective Data Collection for Semantic Parsing JonathanHerzig JonathanBerant - 3808–3818 + 3810–3820 A major hurdle on the road to conversational interfaces is the difficulty in collecting data that maps language utterances to logical forms. One prominent approach for data collection has been to automatically generate pseudo-language paired with logical forms, and paraphrase the pseudo-language to natural language through crowdsourcing (Wang et al., 2015). 
However, this data collection procedure often leads to low performance on real data, due to a mismatch between the true distribution of examples and the distribution induced by the data collection procedure. In this paper, we thoroughly analyze two sources of mismatch in this process: the mismatch in logical form distribution and the mismatch in language distribution between the true and induced distributions. We quantify the effects of these mismatches, and propose a new data collection approach that mitigates them. Assuming access to unlabeled utterances from the true distribution, we combine crowdsourcing with a paraphrase model to detect correct logical forms for the unlabeled utterances. On two datasets, our method leads to 70.6 accuracy on average on the true distribution, compared to 51.3 in paraphrasing-based data collection. D19-1394 10.18653/v1/D19-1394 @@ -4581,7 +4581,7 @@ LiqiangNie WeiliGuan ChengYang - 3819–3827 + 3821–3829 Distantly-supervised relation extraction has proven to be effective at finding relational facts from texts. However, the existing approaches treat labels as independent and meaningless one-hot vectors, which causes a loss of potential label information for selecting valid instances. In this paper, we propose a novel multi-layer attention-based model to improve relation extraction with joint label embedding. The model makes full use of both structural information from Knowledge Graphs and textual information from entity descriptions to learn label embeddings through gating integration while avoiding the imposed noise with an attention mechanism. Then the learned label embeddings are used as another attention over the instances (whose embeddings are also enhanced with the entity descriptions) for improving relation extraction. Extensive experiments demonstrate that our model significantly outperforms state-of-the-art methods. D19-1395 10.18653/v1/D19-1395 @@ -4593,7 +4593,7 @@ KangLiu JunZhao ShengpingLiu - 3828–3838 + 3830–3840 The lack of word boundary information has been seen as one of the main obstacles to developing a high-performance Chinese named entity recognition (NER) system. Fortunately, the automatically constructed lexicon contains rich word boundary information and word semantic information. However, integrating lexical knowledge in Chinese NER tasks still faces challenges when it comes to self-matched lexical words as well as the nearest contextual lexical words. We present a Collaborative Graph Network to solve these challenges. Experiments on various datasets show that our model not only outperforms the state-of-the-art (SOTA) results, but also achieves a speed that is six to fifteen times faster than that of the SOTA model. D19-1396 10.18653/v1/D19-1396 @@ -4604,7 +4604,7 @@ LiyuanLiu MaosenZhang XiangRen - 3839–3848 + 3841–3850 In recent years there has been a surge of interest in applying distant supervision (DS) to automatically generate training data for relation extraction (RE). In this paper, we study the problem of what limits the performance of DS-trained neural models, conduct thorough analyses, and identify a factor that can influence the performance greatly: shifted label distribution. Specifically, we found this problem commonly exists in real-world DS datasets, and without special handling, typical DS-RE models cannot automatically adapt to this shift, thus achieving deteriorated performance.
To further validate our intuition, we develop a simple yet effective adaptation method for DS-trained models, bias adjustment, which updates models learned over the source domain (i.e., DS training set) with a label distribution estimated on the target domain (i.e., test set). Experiments demonstrate that bias adjustment achieves consistent performance gains on DS-trained models, especially on neural models, with an up to 23% relative F1 improvement, which verifies our assumptions. Our code and data can be found at https://github.com/INK-USC/shifted-label-distribution. D19-1397 D19-1397.Attachment.pdf @@ -4616,7 +4616,7 @@ GangWang YansongFeng JinpengHuai - 3849–3859 + 3851–3861 Many existing relation extraction (RE) models make decisions globally using integer linear programming (ILP). However, it is nontrivial to make use of integer linear programming as a black-box solver for RE. Its time and memory costs may become unacceptable as the data scale increases, and redundant information needs to be encoded cautiously for ILP. In this paper, we propose an easy-first approach for relation extraction with information redundancies, embedded in the results produced by local sentence-level extractors, during which conflicting decisions are resolved with domain and uniqueness constraints. Information redundancies are leveraged to support both easy-first collective inference for easy decisions in the first stage and ILP for hard decisions in a subsequent stage. Experimental study shows that our approach improves the efficiency and accuracy of RE, and outperforms both ILP and neural network-based methods. D19-1398 10.18653/v1/D19-1398 @@ -4625,7 +4625,7 @@ Dependency-Guided <fixed-case>LSTM</fixed-case>-<fixed-case>CRF</fixed-case> for Named Entity Recognition ZhanmingJie WeiLu - 3860–3870 + 3862–3872 Dependency tree structures capture long-distance and syntactic relationships between words in a sentence. The syntactic relations (e.g., nominal subject, object) can potentially indicate the existence of certain named entities. In addition, the performance of a named entity recognizer could benefit from the long-distance dependencies between the words in dependency trees. In this work, we propose a simple yet effective dependency-guided LSTM-CRF model to encode the complete dependency trees and capture the above properties for the task of named entity recognition (NER). The data statistics show strong correlations between the entity types and dependency relations. We conduct extensive experiments on several standard datasets and demonstrate the effectiveness of the proposed model in improving NER and achieving state-of-the-art performance. Our analysis reveals that the significant improvements mainly result from the dependency relations and long-distance interactions provided by dependency trees. D19-1399 D19-1399.Attachment.zip @@ -4637,7 +4637,7 @@ GalLavee IdoGuy KiraRadinsky - 3871–3881 + 3873–3883 Large training datasets are required to achieve competitive performance in most natural language tasks. The acquisition process for these datasets is labor-intensive, expensive, and time-consuming. This process is also prone to human error. In this work, we show that cross-cultural differences can be harnessed for natural language text classification. We present a transfer-learning framework that leverages widely-available unaligned bilingual corpora for classification tasks, using no task-specific data.
Our empirical evaluation on two tasks – formality classification and sarcasm detection – shows that the cross-cultural difference between German and American English, as manifested in product review text, can be applied to achieve good performance for formality classification, while the difference between Japanese and American English can be applied to achieve good performance for sarcasm detection – both without any task-specific labeled data. D19-1400 10.18653/v1/D19-1400 @@ -4647,7 +4647,7 @@ OrenMelamud MihaelaBornea KenBarker - 3882–3891 + 3884–3893 Supervised learning models often perform poorly at low-shot tasks, i.e., tasks for which little labeled data is available for training. One prominent approach for improving low-shot learning is to use unsupervised pre-trained neural models. Another approach is to obtain richer supervision by collecting annotator rationales (explanations supporting label annotations). In this work, we combine these two approaches to improve low-shot text classification with two novel methods: a simple bag-of-words embedding approach, and a more complex context-aware method based on the BERT model. In experiments with two English text classification datasets, we demonstrate substantial performance gains from combining pre-training with rationales. Furthermore, our investigation of a range of train-set sizes reveals that the simple bag-of-words approach is the clear top performer when there are only a few dozen training instances or fewer, while more complex models, such as BERT or CNN, require more training data to shine. D19-1401 10.18653/v1/D19-1401 @@ -4656,7 +4656,7 @@ <fixed-case>P</fixed-case>ro<fixed-case>S</fixed-case>eqo: Projection Sequence Networks for On-Device Text Classification ZornitsaKozareva SujithRavi - 3892–3901 + 3894–3903 We propose a novel on-device sequence model for text classification using recurrent projections. Our model ProSeqo uses dynamic recurrent projections without the need to store or look up any pre-trained embeddings. This results in fast and compact neural networks that can perform on-device inference for complex short and long text classification tasks. We conducted an exhaustive evaluation on multiple text classification tasks. Results show that ProSeqo outperformed state-of-the-art neural and on-device approaches for short text classification tasks such as dialog act and intent prediction. To the best of our knowledge, ProSeqo is the first on-device long text classification neural model. It achieved comparable results to previous neural approaches for news article, answers and product categorization, while preserving a small memory footprint and maintaining high accuracy. D19-1402 10.18653/v1/D19-1402 @@ -4669,7 +4669,7 @@ XiaodanZhu PingJian JianSun - 3902–3911 + 3904–3913 Text classification tends to struggle when data is deficient or when it needs to adapt to unseen classes. In such challenging scenarios, recent studies have used meta-learning to simulate the few-shot task, in which new queries are compared to a small support set at the sample-wise level. However, this sample-wise comparison may be severely disturbed by the various expressions in the same class. Therefore, we should be able to learn a general representation of each class in the support set and then compare it to new queries. In this paper, we propose a novel Induction Network to learn such a generalized class-wise representation, by innovatively leveraging the dynamic routing algorithm in meta-learning.
In this way, we find the model is able to induce and generalize better. We evaluate the proposed model on a well-studied sentiment classification dataset (English) and a real-world dialogue intent classification dataset (Chinese). Experimental results show that on both datasets, the proposed model significantly outperforms the existing state-of-the-art approaches, proving the effectiveness of class-wise generalization in few-shot text classification. D19-1403 10.18653/v1/D19-1403 @@ -4679,7 +4679,7 @@ WenpengYin JamaalHay DanRoth - 3912–3921 + 3914–3923 Zero-shot text classification (0Shot-TC) is a challenging NLU problem to which little attention has been paid by the research community. 0Shot-TC aims to associate an appropriate label with a piece of text, irrespective of the text domain and the aspect (e.g., topic, emotion, event, etc.) described by the label. Moreover, there are only a few articles studying 0Shot-TC, all focusing only on topical categorization, which, we argue, is just the tip of the iceberg in 0Shot-TC. In addition, the inconsistent experimental setups in the literature permit no uniform comparison, which obscures progress. This work benchmarks the 0Shot-TC problem by providing unified datasets, standardized evaluations, and state-of-the-art baselines. Our contributions include: i) The datasets we provide facilitate studying 0Shot-TC relative to conceptually different and diverse aspects: the “topic” aspect includes “sports” and “politics” as labels; the “emotion” aspect includes “joy” and “anger”; the “situation” aspect includes “medical assistance” and “water shortage”. ii) We extend the existing evaluation setup (label-partially-unseen) – given a dataset, train on some labels, test on all labels – to include a more challenging yet realistic evaluation, label-fully-unseen 0Shot-TC (Chang et al., 2008), aiming at classifying text snippets without seeing task-specific training data at all. iii) We unify the 0Shot-TC of diverse aspects within a textual entailment formulation and study it this way. D19-1404 10.18653/v1/D19-1404 @@ -4690,7 +4690,7 @@ VivekGupta MaitreyMehta VivekSrikumar - 3922–3933 + 3924–3935 While neural models show remarkable accuracy on individual predictions, their internal beliefs can be inconsistent across examples. In this paper, we formalize such inconsistency as a generalization of prediction error. We propose a learning framework for constraining models using logic rules to regularize them away from inconsistency. Our framework can leverage both labeled and unlabeled examples and is directly compatible with off-the-shelf learning schemes without model redesign. We instantiate our framework on natural language inference, where experiments show that enforcing invariants stated in logic can help make the predictions of neural models both accurate and consistent. D19-1405 10.18653/v1/D19-1405 @@ -4702,7 +4702,7 @@ AleksanderNagaev AigulNugmanova Ivan P.Yamshchikov - 3934–3943 + 3936–3945 This paper shows that standard assessment methodology for style transfer has several significant problems. First, the standard metrics for style accuracy and semantics preservation vary significantly on different re-runs. Therefore one has to report error margins for the obtained results. Second, starting with certain values of bilingual evaluation understudy (BLEU) between input and output and accuracy of the sentiment transfer, the optimization of these two standard metrics diverges from the intuitive goal of the style transfer task.
Finally, due to the nature of the task itself, there is a specific dependence between these two metrics that could be easily manipulated. Under these circumstances, we suggest taking BLEU between input and human-written reformulations into consideration for benchmarks. We also propose three new architectures that outperform the state of the art in terms of this metric. D19-1406 10.18653/v1/D19-1406 @@ -4714,7 +4714,7 @@ JianfengGao WenDong ChangyouChen - 3944–3954 + 3946–3956 Deep latent variable models (LVMs) such as the variational auto-encoder (VAE) have recently played an important role in text generation. One key factor is the exploitation of smooth latent structures to guide the generation. However, the representation power of VAEs is limited for two reasons: (1) the Gaussian assumption is often made on the variational posteriors; and meanwhile (2) a notorious “posterior collapse” issue occurs. In this paper, we advocate sample-based representations of variational distributions for natural language, leading to implicit latent features, which can provide flexible representation power compared with Gaussian-based posteriors. We further develop an LVM to directly match the aggregated posterior to the prior. It can be viewed as a natural extension of VAEs with a regularization of maximizing mutual information, mitigating the “posterior collapse” issue. We demonstrate the effectiveness and versatility of our models in various text generation scenarios, including language modeling, unaligned style transfer, and dialog response generation. The source code to reproduce our experimental results is available on GitHub. D19-1407 D19-1407.Attachment.zip @@ -4724,7 +4724,7 @@ Text Emotion Distribution Learning from Small Sample: A Meta-Learning Approach ZhenjieZhao XiaojuanMa - 3955–3965 + 3957–3967 Text emotion distribution learning (EDL) aims to develop models that can predict the intensity values of a sentence across a set of emotion categories. Existing methods based on supervised learning require a large amount of well-labelled training data, which is difficult to obtain due to inconsistent perception of fine-grained emotion intensity. In this paper, we propose a meta-learning approach to learn text emotion distributions from a small sample. Specifically, we propose to learn low-rank sentence embeddings by tensor decomposition to capture their contextual semantic similarity, and use K-nearest neighbors (KNNs) of each sentence in the embedding space to generate sample clusters. We then train a meta-learner that can adapt to new data with only a few training samples on the clusters, and further fit the meta-learner on KNNs of a testing sample for EDL. In this way, we effectively augment the learning ability of a model on the small sample. To demonstrate the performance, we compare the proposed approach with state-of-the-art EDL methods on a widely used EDL dataset: SemEval 2007 Task 14 (Strapparava and Mihalcea, 2007). Results show the superiority of our method on small-sample emotion distribution learning. D19-1408 10.18653/v1/D19-1408 @@ -4735,7 +4735,7 @@ SamuelCarton ShiyanYan QiaozhuMei - 3966–3979 + 3968–3981 We conduct a large-scale, systematic study to evaluate the existing evaluation methods for natural language generation in the context of generating online product reviews.
We compare human-based evaluators with a variety of automated evaluation procedures, including discriminative evaluators that measure how well machine-generated text can be distinguished from human-written text, as well as word overlap metrics that assess how similar the generated text is to human-written references. We determine to what extent these different evaluators agree on the ranking of a dozen state-of-the-art generators for online product reviews. We find that human evaluators do not correlate well with discriminative evaluators, leaving open a bigger question of whether adversarial accuracy is the correct objective for natural language generation. In general, distinguishing machine-generated text is challenging even for human evaluators, and human decisions correlate better with lexical overlaps. We find lexical diversity an intriguing metric that is indicative of the assessments of different evaluators. A post-experiment survey of participants provides insights into how to evaluate and improve the quality of natural language generation systems. D19-1409 D19-1409.Attachment.pdf @@ -4745,7 +4745,7 @@ Sentence-<fixed-case>BERT</fixed-case>: Sentence Embeddings using <fixed-case>S</fixed-case>iamese <fixed-case>BERT</fixed-case>-Networks NilsReimers IrynaGurevych - 3980–3990 + 3982–3992 BERT (Devlin et al., 2018) and RoBERTa (Liu et al., 2019) have set a new state-of-the-art performance on sentence-pair regression tasks like semantic textual similarity (STS). However, they require that both sentences be fed into the network, which causes a massive computational overhead: Finding the most similar pair in a collection of 10,000 sentences requires about 50 million inference computations (~65 hours) with BERT. The construction of BERT makes it unsuitable for semantic similarity search as well as for unsupervised tasks like clustering. In this publication, we present Sentence-BERT (SBERT), a modification of the pretrained BERT network that uses siamese and triplet network structures to derive semantically meaningful sentence embeddings that can be compared using cosine-similarity. This reduces the effort for finding the most similar pair from 65 hours with BERT / RoBERTa to about 5 seconds with SBERT, while maintaining the accuracy from BERT. We evaluate SBERT and SRoBERTa on common STS tasks and transfer learning tasks, where it outperforms other state-of-the-art sentence embedding methods. D19-1410 10.18653/v1/D19-1410 @@ -4757,7 +4757,7 @@ YipingJin DittayaWanvarie MasashiSugiyama - 3991–4000 + 3993–4002 We consider a document classification problem where document labels are absent but only relevant keywords of a target class and unlabeled documents are given. Although heuristic methods based on pseudo-labeling have been considered, theoretical understanding of this problem is still limited. Moreover, previous methods cannot easily incorporate well-developed techniques in supervised text classification. In this paper, we propose a theoretically guaranteed learning framework that is simple to implement and has flexible choices of models, e.g., linear models or neural networks. We demonstrate how to optimize the area under the receiver operating characteristic curve (AUC) effectively and also discuss how to adjust it to optimize other well-known evaluation metrics such as the accuracy and F1-measure. Finally, we show the effectiveness of our framework using benchmark datasets.
D19-1411 D19-1411.Attachment.rar @@ -4770,7 +4770,7 @@ RuoyuJia SujianLi JingmingLiu - 4001–4013 + 4003–4015 This paper presents a new sequence-to-sequence (seq2seq) pre-training method PoDA (Pre-training of Denoising Autoencoders), which learns representations suitable for text generation tasks. Unlike encoder-only (e.g., BERT) or decoder-only (e.g., OpenAI GPT) pre-training approaches, PoDA jointly pre-trains both the encoder and decoder by denoising the noise-corrupted text, and it also has the advantage of keeping the network architecture unchanged in the subsequent fine-tuning stage. Meanwhile, we design a hybrid model of Transformer and pointer-generator networks as the backbone architecture for PoDA. We conduct experiments on two text generation tasks: abstractive summarization and grammatical error correction. Results on four datasets show that PoDA can improve model performance over strong baselines without using any task-specific techniques and significantly speed up convergence. D19-1412 10.18653/v1/D19-1412 @@ -4779,7 +4779,7 @@ Dialog Intent Induction with Deep Multi-View Clustering HughPerkins YiYang - 4014–4023 + 4016–4025 We introduce the dialog intent induction task and present a novel deep multi-view clustering approach to tackle the problem. Dialog intent induction aims at discovering user intents from user query utterances in human-human conversations such as dialogs between customer support agents and customers. Motivated by the intuition that a dialog intent is not only expressed in the user query utterance but also captured in the rest of the dialog, we split a conversation into two independent views and exploit multi-view clustering techniques for inducing the dialog intent. In particular, we propose alternating-view k-means (AV-KMEANS) for joint multi-view representation learning and clustering analysis. The key innovation is that the instance-view representations are updated iteratively by predicting the cluster assignment obtained from the alternative view, so that the multi-view representations of the instances lead to similar cluster assignments. Experiments on two public datasets show that AV-KMEANS can induce better dialog intent clusters than state-of-the-art unsupervised representation learning methods and standard multi-view clustering approaches. D19-1413 10.18653/v1/D19-1413 @@ -4790,7 +4790,7 @@ AramGalstyan GregVer Steeg GuillermoCecchi - 4024–4034 + 4026–4036 Recently, kernelized locality sensitive hashcodes have been successfully employed as representations of natural language text, especially showing high relevance to biomedical relation extraction tasks. In this paper, we propose to optimize the hashcode representations in a nearly unsupervised manner, in which we only use data points, but not their class labels, for learning. The optimized hashcode representations are then fed to a supervised classifier following the prior work. This nearly unsupervised approach allows fine-grained optimization of each hash function, which is particularly suitable for building hashcode representations generalizing from a training set to a test set. We empirically evaluate the proposed approach for biomedical relation extraction tasks, obtaining significant accuracy improvements w.r.t. state-of-the-art supervised and semi-supervised approaches. D19-1414 10.18653/v1/D19-1414 @@ -4800,7 +4800,7 @@ DaniloCroce DanieleRossini RobertoBasili - 4035–4044 + 4037–4046 As NLP systems become more pervasive, their accountability gains value as a focal point of effort.
The epistemological opaqueness of nonlinear learning methods, such as deep learning models, can be a major drawback for their adoption. In this paper, we discuss the application of Layerwise Relevance Propagation over a linguistically motivated neural architecture, the Kernel-based Deep Architecture, in order to trace back connections between linguistic properties of input instances and system decisions. Such connections then guide the construction of argumentations on the network’s inferences, i.e., explanations based on real examples, semantically related to the input. We propose here a methodology to evaluate the transparency and coherence of analogy-based explanations, modeling an audit stage for the system. Quantitative analysis on two semantic tasks, i.e., question classification and semantic role labeling, shows that the explanatory capabilities (native in KDAs) are effective and they pave the way to more complex argumentation methods. D19-1415 10.18653/v1/D19-1415 @@ -4809,7 +4809,7 @@ Enhancing Variational Autoencoders with Mutual Information Neural Estimation for Text Generation DongQian William K.Cheung - 4045–4055 + 4047–4057 While broadly applicable to many natural language processing (NLP) tasks, variational autoencoders (VAEs) are hard to train due to the posterior collapse issue where the latent variable fails to encode the input data effectively. Various approaches have been proposed to alleviate this problem to improve the capability of the VAE. In this paper, we propose to introduce a mutual information (MI) term between the input and its latent variable to regularize the objective of the VAE. Since estimating the MI in the high-dimensional space is intractable, we employ neural networks for the estimation of the MI and provide a training algorithm based on the convex duality approach. Our experimental results on three benchmark datasets demonstrate that the proposed model, compared to the state-of-the-art baselines, exhibits less posterior collapse and has comparable or better performance in language modeling and text generation. We also qualitatively evaluate the inferred latent space and show that the proposed model can generate more reasonable and diverse sentences via linear interpolation in the latent space. D19-1416 D19-1416.Attachment.pdf @@ -4820,7 +4820,7 @@ AmeyaPrabhu CharlesDognin ManeeshSingh - 4056–4066 + 4058–4068 The exploding cost and time needed for data labeling and model training are bottlenecks for training DNN models on large datasets. Identifying smaller representative data samples with strategies like active learning can help mitigate such bottlenecks. Previous works on active learning in NLP identify the problem of sampling bias in the samples acquired by uncertainty-based querying and develop costly approaches to address it. Using a large empirical study, we demonstrate that active set selection using the posterior entropy of deep models like FastText.zip (FTZ) is robust to sampling biases and to various algorithmic choices (query size and strategies), unlike what the traditional literature suggests. We also show that the FTZ-based query strategy produces sample sets similar to those from more sophisticated approaches (e.g., ensemble networks). Finally, we show the effectiveness of the selected samples by creating tiny high-quality datasets, and utilizing them for fast and cheap training of large models. Based on the above, we propose a simple baseline for deep active text classification that outperforms the state of the art.
We expect the presented work to be useful and informative for dataset compression and for problems involving active, semi-supervised or online learning scenarios. Code and models are available at: https://github.com/drimpossible/Sampling-Bias-Active-Learning. D19-1417 D19-1417.Attachment.pdf @@ -4831,7 +4831,7 @@ ChristopherClark MarkYatskar LukeZettlemoyer - 4067–4080 + 4069–4082 State-of-the-art models often make use of superficial patterns in the data that do not generalize well to out-of-domain or adversarial settings. For example, textual entailment models often learn that particular key words imply entailment, irrespective of context, and visual question answering models learn to predict prototypical answers, without considering evidence in the image. In this paper, we show that if we have prior knowledge of such biases, we can train a model to be more robust to domain shift. Our method has two stages: we (1) train a naive model that makes predictions exclusively based on dataset biases, and (2) train a robust model as part of an ensemble with the naive one in order to encourage it to focus on other patterns in the data that are more likely to generalize. Experiments on five datasets with out-of-domain test sets show significantly improved robustness in all settings, including a 12 point gain on a changing priors visual question answering dataset and a 9 point gain on an adversarial question answering test set. D19-1418 10.18653/v1/D19-1418 @@ -4846,7 +4846,7 @@ SvenGowal KrishnamurthyDvijotham PushmeetKohli - 4081–4091 + 4083–4093 Neural networks are part of many contemporary NLP systems, yet their empirical successes come at the price of vulnerability to adversarial attacks. Previous work has used adversarial training and data augmentation to partially mitigate such brittleness, but these are unlikely to find worst-case adversaries due to the complexity of the search space arising from discrete text perturbations. In this work, we approach the problem from the opposite direction: to formally verify a system’s robustness against a predefined class of adversarial attacks. We study text classification under synonym replacements or character flip perturbations. We propose modeling these input perturbations as a simplex and then using Interval Bound Propagation – a formal model verification method. We modify the conventional log-likelihood training objective to train models that can be efficiently verified, which would otherwise come with exponential search complexity. The resulting models show little difference in terms of nominal accuracy, but have much improved verified accuracy under perturbations and come with an efficiently computable formal guarantee on worst-case adversaries. D19-1419 D19-1419.Attachment.zip @@ -4858,7 +4858,7 @@ ShiyuChang YangZhang TommiJaakkola - 4092–4101 + 4094–4103 Selective rationalization has become a common mechanism to ensure that predictive models reveal how they use any available features. The selection may be soft or hard, and identifies a subset of input features relevant for prediction. The setup can be viewed as a co-operative game between the selector (aka rationale generator) and the predictor making use of only the selected features. The co-operative setting may, however, be compromised for two reasons. First, the generator typically has no direct access to the outcome it aims to justify, resulting in poor performance. Second, there is typically no control exerted on the information left outside the selection.
We revise the overall co-operative framework to address these challenges. We introduce an introspective model which explicitly predicts and incorporates the outcome into the selection process. Moreover, we explicitly control the rationale complement via an adversary so as not to leave any useful information out of the selection. We show that the two complementary mechanisms both maintain high predictive accuracy and lead to comprehensive rationales. D19-1420 D19-1420.Attachment.zip @@ -4868,7 +4868,7 @@ Experimenting with Power Divergences for Language Modeling MatthieuLabeau Shay B.Cohen - 4102–4112 + 4104–4114 Neural language models are usually trained using Maximum-Likelihood Estimation (MLE). The corresponding objective function for MLE is derived from the Kullback-Leibler (KL) divergence between the empirical probability distribution representing the data and the parametric probability distribution output by the model. However, the word frequency discrepancies in natural language make performance extremely uneven: while the perplexity is usually very low for frequent words, it is especially difficult to predict rare words. In this paper, we experiment with several families (alpha, beta and gamma) of power divergences, generalized from the KL divergence, for learning language models with an objective different from standard MLE. Intuitively, these divergences should affect the way the probability mass is spread during learning, notably by prioritizing performance on high- or low-frequency words. In addition, we implement and experiment with various sampling-based objectives, where the computation of the output layer is only done on a small subset of the vocabulary. They are derived as power generalizations of a softmax approximated via Importance Sampling, and Noise Contrastive Estimation, for accelerated learning. Our experiments on the Penn Treebank and Wikitext-2 show that these power divergences can indeed be used to prioritize learning on frequent or rare words, and lead to general performance improvements in the case of sampling-based learning. D19-1421 D19-1421.Attachment.pdf @@ -4878,7 +4878,7 @@ Hierarchically-Refined Label Attention Network for Sequence Labeling LeyangCui YueZhang - 4113–4126 + 4115–4128 CRF has been used as a powerful model for statistical sequence labeling. For neural sequence labeling, however, BiLSTM-CRF does not always lead to better results compared with BiLSTM-softmax local classification. This can be because the simple Markov label transition model of CRF does not give much information gain over strong neural encoding. For better representing label sequences, we investigate a hierarchically-refined label attention network, which explicitly leverages label embeddings and captures potential long-term label dependency by giving each word incrementally refined label distributions with hierarchical attention. Results on POS tagging, NER and CCG supertagging show that the proposed model not only improves the overall tagging accuracy with a similar number of parameters, but also significantly speeds up the training and testing compared to BiLSTM-CRF. D19-1422 D19-1422.Attachment.rar @@ -4890,7 +4890,7 @@ AditiRaghunathan KeremGöksel PercyLiang - 4127–4140 + 4129–4142 State-of-the-art NLP models can often be fooled by adversaries that apply seemingly innocuous label-preserving transformations (e.g., paraphrasing) to input text.
The number of possible transformations scales exponentially with text length, so data augmentation cannot cover all transformations of an input. This paper considers one exponentially large family of label-preserving transformations, in which every word in the input can be replaced with a similar word. We train the first models that are provably robust to all word substitutions in this family. Our training procedure uses Interval Bound Propagation (IBP) to minimize an upper bound on the worst-case loss that any combination of word substitutions can induce. To evaluate models’ robustness to these transformations, we measure accuracy on adversarially chosen word substitutions applied to test examples. Our IBP-trained models attain 75% adversarial accuracy on both sentiment analysis on IMDB and natural language inference on SNLI; in comparison, on IMDB, models trained normally and ones trained with data augmentation achieve adversarial accuracy of only 12% and 41%, respectively. D19-1423 D19-1423.Attachment.zip @@ -4902,7 +4902,7 @@ LiDong FuruWei KeXu - 4141–4150 + 4143–4152 Language model pre-training, such as BERT, has achieved remarkable results in many NLP tasks. However, it is unclear why the pre-training-then-fine-tuning paradigm can improve performance and generalization capability across different tasks. In this paper, we propose to visualize loss landscapes and optimization trajectories of fine-tuning BERT on specific datasets. First, we find that pre-training reaches a good initial point across downstream tasks, which leads to wider optima and easier optimization compared with training from scratch. We also demonstrate that the fine-tuning procedure is robust to overfitting, even though BERT is highly over-parameterized for downstream tasks. Second, the visualization results indicate that fine-tuning BERT tends to generalize better because of the flat and wide optima, and the consistency between the training loss surface and the generalization error surface. Third, the lower layers of BERT are more invariant during fine-tuning, which suggests that the layers that are close to the input learn more transferable representations of language. D19-1424 10.18653/v1/D19-1424 @@ -4913,9 +4913,8 @@ ShulyWintner Noah A.Smith YuliaTsvetkov - 4151–4161 - Despite impressive performance on many text classification tasks, deep neural networks tend to learn frequent superficial patterns that are specific to the training data and do not always generalize well. In this work, we observe this limitation with respect to the task of native language identification. We find that standard text classifiers which perform well on the test set end up learning topical features which are confounds of the prediction task (e.g., if the input text mentions Sweden, the classifier predicts that the author’s native language is Swedish). We propose a method that represents the latent topical confounds and a model which “unlearns” confounding features by predicting both the label of the input text and the confound; but we train the two predictors adversarially in an alternating fashion to learn a text representation that predicts the correct label but is less prone to using information about the confound.
We show that this model generalizes better and learns features that are indicative of the writing style rather than the content.The code is available at: https://github.com/Sachin19/adversarial-classify - + 4153–4163 + Despite impressive performance on many text classification tasks, deep neural networks tend to learn frequent superficial patterns that are specific to the training data and do not always generalize well. In this work, we observe this limitation with respect to the task of native language identification. We find that standard text classifiers which perform well on the test set end up learning topical features which are confounds of the prediction task (e.g., if the input text mentions Sweden, the classifier predicts that the author’s native language is Swedish). We propose a method that represents the latent topical confounds and a model which “unlearns” confounding features by predicting both the label of the input text and the confound; but we train the two predictors adversarially in an alternating fashion to learn a text representation that predicts the correct label but is less prone to using information about the confound. We show that this model generalizes better and learns features that are indicative of the writing style rather than the content. D19-1425 10.18653/v1/D19-1425 @@ -4924,7 +4923,7 @@ ShashankSrivastava IgorLabutov TomMitchell - 4162–4172 + 4164–4174 Natural language has recently been explored as a new medium of supervision for training machine learning models. Here, we explore learning classification tasks using language in a conversational setting – where the automated learner does not simply receive language input from a teacher, but can proactively engage the teacher by asking questions. We present a reinforcement learning framework, where the learner’s actions correspond to question types and the reward for asking a question is based on how the teacher’s response changes performance of the resulting machine learning model on the learning task. In this framework, learning good question-asking strategies corresponds to asking sequences of questions that maximize the cumulative (discounted) reward, and hence quickly lead to effective classifiers. Empirical analysis across three domains shows that learned question-asking strategies expedite classifier training by asking appropriate questions at different points in the learning process. The approach allows learning classifiers from a blend of strategies, including learning from observations, explanations and clarifications. D19-1426 10.18653/v1/D19-1426 @@ -4933,7 +4932,7 @@ Language Modeling for Code-Switching: Evaluation, Integration of Monolingual Data, and Discriminative Training HilaGonen YoavGoldberg - 4173–4183 + 4175–4185 We focus on the problem of language modeling for code-switched language, in the context of automatic speech recognition (ASR). Language modeling for code-switched language is challenging for (at least) three reasons: (1) lack of available large-scale code-switched data for training; (2) lack of a replicable evaluation setup that is ASR directed yet isolates language modeling performance from the other intricacies of the ASR system; and (3) the reliance on generative modeling. We tackle these three issues: we propose an ASR-motivated evaluation setup which is decoupled from an ASR system and the choice of vocabulary, and provide an evaluation dataset for English-Spanish code-switching. 
This setup lends itself to a discriminative training approach, which we demonstrate to work better than generative language modeling. Finally, we explore a variety of training protocols and verify the effectiveness of training with large amounts of monolingual data followed by fine-tuning with small amounts of code-switched data, for both the generative and discriminative cases. D19-1427 D19-1427.Attachment.zip @@ -4945,7 +4944,7 @@ ClaireGardent ChloéBraud AntoineBordes - 4184–4194 + 4186–4196 Query-based open-domain NLP tasks require information synthesis from long and diverse web results. Current approaches extractively select portions of web text as input to Sequence-to-Sequence models using methods such as TF-IDF ranking. We propose constructing a local graph structured knowledge base for each query, which compresses the web search information and reduces redundancy. We show that by linearizing the graph into a structured input sequence, models can encode the graph representations within a standard Sequence-to-Sequence setting. For two generative tasks with very long text input, long-form question answering and multi-document summarization, feeding graph representations as input can achieve better performance than using retrieved text portions. D19-1428 D19-1428.Attachment.zip @@ -4957,7 +4956,7 @@ ShujianHuang Xin-YuDai JiajunChen - 4195–4204 + 4197–4206 In sequence labeling, previous domain adaptation methods focus on the adaptation from the source domain to the entire target domain without considering the diversity of individual target domain samples, which may lead to negative transfer results for certain samples. Besides, an important characteristic of sequence labeling tasks is that different elements within a given sample may also have diverse domain relevance, which requires further consideration. To take the multi-level domain relevance discrepancy into account, in this paper, we propose a fine-grained knowledge fusion model with the domain relevance modeling scheme to control the balance between learning from the target domain data and learning from the source domain model. Experiments on three sequence labeling tasks show that our fine-grained knowledge fusion model outperforms strong baselines and other state-of-the-art sequence labeling domain adaptation methods. D19-1429 10.18653/v1/D19-1429 @@ -4970,7 +4969,7 @@ TaoQin JianhuangLai Tie-YanLiu - 4205–4215 + 4207–4216 While target-side monolingual data has been proven to be very useful for improving neural machine translation (briefly, NMT) through back translation, source-side monolingual data is not well investigated. In this work, we study how to use both the source-side and target-side monolingual data for NMT, and propose an effective strategy leveraging both of them. First, we generate synthetic bitext by translating monolingual data from the two domains into the other domain using the models pretrained on genuine bitext. Next, a model is trained on a noised version of the concatenated synthetic bitext where each source sequence is randomly corrupted. Finally, the model is fine-tuned on the genuine bitext and a clean version of a subset of the synthetic bitext without adding any noise. Our approach achieves state-of-the-art results on WMT16, WMT17, WMT18 English↔German translations and WMT19 German→French translations, which demonstrate the effectiveness of our method. We also conduct a comprehensive study on how each part in the pipeline works.
D19-1430 D19-1430.Attachment.zip @@ -4983,7 +4982,7 @@ WeiZhang QiangChen HuajunChen - 4216–4225 + 4217–4226 Link prediction is an important way to complete knowledge graphs (KGs), while embedding-based methods, effective for link prediction in KGs, perform poorly on relations that only have a few associative triples. In this work, we propose a Meta Relational Learning (MetaR) framework to tackle the common but challenging task of few-shot link prediction in KGs, namely predicting new triples about a relation by only observing a few associative triples. We solve few-shot link prediction by focusing on transferring relation-specific meta information to make the model learn the most important knowledge and learn faster, corresponding to relation meta and gradient meta respectively in MetaR. Empirically, our model achieves state-of-the-art results on few-shot link prediction KG benchmarks. D19-1431 10.18653/v1/D19-1431 @@ -4994,7 +4993,7 @@ ShioriSagawa TatsunoriHashimoto PercyLiang - 4226–4236 + 4227–4237 Language models are generally trained on data spanning a wide range of topics (e.g., news, reviews, fiction), but they might be applied to an a priori unknown target distribution (e.g., restaurant reviews). In this paper, we first show that training on text outside the test distribution can degrade test performance when using standard maximum likelihood (MLE) training. To remedy this without knowledge of the test distribution, we propose an approach which trains a model that performs well over a wide range of potential test distributions. In particular, we derive a new distributionally robust optimization (DRO) procedure which minimizes the loss of the model over the worst-case mixture of topics with sufficient overlap with the training distribution. Our approach, called topic conditional value at risk (topic CVaR), obtains a 5.5 point perplexity reduction over MLE when the language models are trained on a mixture of Yelp reviews and news and tested only on reviews. D19-1432 10.18653/v1/D19-1432 @@ -5003,7 +5002,7 @@ Unsupervised Domain Adaptation of Contextualized Embeddings for Sequence Labeling XiaochuangHan JacobEisenstein - 4237–4247 + 4238–4248 Contextualized word embeddings such as ELMo and BERT provide a foundation for strong performance across a wide range of natural language processing tasks by pretraining on large corpora of unlabeled text. However, the applicability of this approach is unknown when the target domain varies substantially from the pretraining corpus. We are specifically interested in the scenario in which labeled data is available in only a canonical source domain such as news text, and the target domain is distinct from both the labeled and pretraining texts. To address this scenario, we propose domain-adaptive fine-tuning, in which the contextualized embeddings are adapted by masked language modeling on text from the target domain. We test this approach on sequence labeling in two challenging domains: Early Modern English and Twitter. Both domains differ substantially from existing pretraining corpora, and domain-adaptive fine-tuning yields substantial improvements over strong BERT baselines, with particularly impressive results on out-of-vocabulary words. We conclude that domain-adaptive fine-tuning offers a simple and effective approach for the unsupervised adaptation of sequence labeling to difficult new domains.
D19-1433 D19-1433.Attachment.zip @@ -5014,7 +5013,7 @@ John P.Lalor HaoWu HongYu - 4248–4258 + 4249–4259 Incorporating Item Response Theory (IRT) into NLP tasks can provide valuable information about model performance and behavior. Traditionally, IRT models are learned using human response pattern (RP) data, presenting a significant bottleneck for large data sets like those required for training deep neural networks (DNNs). In this work, we propose learning IRT models using RPs generated from artificial crowds of DNN models. We demonstrate the effectiveness of learning IRT models using DNN-generated data through quantitative and qualitative analyses for two NLP tasks. Parameters learned from human and machine RPs for natural language inference and sentiment analysis exhibit medium to large positive correlations. We demonstrate a use-case for latent difficulty item parameters, namely training set filtering, and show that using difficulty to sample training data outperforms baseline methods. Finally, we highlight cases where human expectation about item difficulty does not match difficulty as estimated from the machine RPs. D19-1434 D19-1434.Attachment.zip @@ -5027,7 +5026,7 @@ RasnaGoyal SabyasachiGhosh VihariPiratla - 4259–4269 + 4260–4270 We present a Parallel Iterative Edit (PIE) model for the problem of local sequence transduction arising in tasks like grammatical error correction (GEC). Recent approaches are based on the popular encoder-decoder (ED) model for sequence-to-sequence learning. The ED model auto-regressively captures full dependency among output tokens but is slow due to sequential decoding. The PIE model does parallel decoding, giving up the advantage of modeling full dependency in the output, yet it achieves accuracy competitive with the ED model for four reasons: 1. predicting edits instead of tokens, 2. labeling sequences instead of generating sequences, 3. iteratively refining predictions to capture dependencies, and 4. factorizing logits over edits and their token argument to harness pre-trained language models like BERT. Experiments on tasks spanning GEC, OCR correction and spell correction demonstrate that the PIE model is an accurate and significantly faster alternative for local sequence transduction. D19-1435 D19-1435.Attachment.pdf @@ -5039,7 +5038,7 @@ FeiHuang MinlieHuang XiaoyanZhu - 4270–4280 + 4271–4281 Most of the existing generative adversarial networks (GANs) for text generation suffer from the instability of reinforcement learning training algorithms such as policy gradient, leading to unstable performance. To tackle this problem, we propose a novel framework called Adversarial Reward Augmented Maximum Likelihood (ARAML). During adversarial training, the discriminator assigns rewards to samples which are acquired from a stationary distribution near the data rather than the generator’s distribution. The generator is optimized with maximum likelihood estimation augmented by the discriminator’s rewards instead of policy gradient. Experiments show that our model can outperform state-of-the-art text GANs with a more stable training process. D19-1436 10.18653/v1/D19-1436 @@ -5051,7 +5050,7 @@ XianLi GrahamNeubig EduardHovy - 4281–4291 + 4282–4292 Most sequence-to-sequence (seq2seq) models are autoregressive; they generate each token by conditioning on previously generated tokens. In contrast, non-autoregressive seq2seq models generate all tokens in one pass, which leads to increased efficiency through parallel processing on hardware such as GPUs.
However, directly modeling the joint distribution of all tokens simultaneously is challenging, and even with increasingly complex model structures, accuracy lags significantly behind autoregressive models. In this paper, we propose a simple, efficient, and effective model for non-autoregressive sequence generation using latent variable models. Specifically, we turn to generative flow, an elegant technique to model complex distributions using neural networks, and design several layers of flow tailored for modeling the conditional density of sequential latent variables. We evaluate this model on three neural machine translation (NMT) benchmark datasets, achieving comparable performance with state-of-the-art non-autoregressive NMT models and almost constant decoding time w.r.t. the sequence length. D19-1437 D19-1437.Attachment.zip @@ -5063,7 +5062,7 @@ LiangZhao JianyuWang JoelHestness - 4292–4301 + 4293–4302 Compositional generalization is a basic mechanism in human language learning, but current neural networks lack this ability. In this paper, we conduct fundamental research for encoding compositionality in neural networks. Conventional methods use a single representation for the input sentence, making it hard to apply prior knowledge of compositionality. In contrast, our approach leverages such knowledge with two representations, one generating attention maps, and the other mapping attended input words to output symbols. We reduce the entropy in each representation to improve generalization. Our experiments demonstrate significant improvements over the conventional methods in five NLP tasks, including instruction learning and machine translation. In the SCAN domain, it boosts accuracies from 14.0% to 98.8% in the Jump task, and from 92.0% to 99.7% in the TurnLeft task. It also beats human performance on a few-shot learning task. We hope the proposed approach can help ease future research towards human-level compositional language learning. D19-1438 D19-1438.Attachment.zip @@ -5077,7 +5076,7 @@ YordanYordanov PhilBlunsom ThomasLukasiewicz - 4302–4311 + 4303–4312 Pronoun resolution is a major area of natural language understanding. However, large-scale training sets are still scarce, since manually labelling data is costly. In this work, we introduce WikiCREM (Wikipedia CoREferences Masked), a large-scale, yet accurate dataset of pronoun disambiguation instances. We use a language-model-based approach for pronoun resolution in combination with our WikiCREM dataset. We compare a series of models on a collection of diverse and challenging coreference resolution problems, where we match or outperform previous state-of-the-art approaches on 6 out of 7 datasets, such as GAP, DPR, WNLI, PDP, WinoBias, and WinoGender. We release our model to be used off-the-shelf for solving pronoun disambiguation. D19-1439 D19-1439.Attachment.zip @@ -5087,7 +5086,7 @@ Identifying and Explaining Discriminative Attributes ArminsStepanjans AndréFreitas - 4312–4321 + 4313–4322 Identifying what is at the center of the meaning of a word and what discriminates it from other words is a fundamental natural language inference task. This paper describes an explicit word vector representation model (WVM) to support the identification of discriminative attributes.
A core contribution of the paper is a quantitative and qualitative comparative analysis of different types of data sources and Knowledge Bases in the construction of explainable and explicit WVMs: (i) knowledge graphs built from dictionary definitions, (ii) entity-attribute-relationships graphs derived from images and (iii) commonsense knowledge graphs. Using a detailed quantitative and qualitative analysis, we demonstrate that these data sources have complementary semantic aspects, supporting the creation of explicit semantic vector spaces. The explicit vector spaces are evaluated using the task of discriminative attribute identification, showing comparable performance to the state-of-the-art systems in the task (F1-score = 0.69), while delivering full model transparency and explainability. D19-1440 10.18653/v1/D19-1440 @@ -5098,7 +5097,7 @@ YuCheng ZheGan JingjingLiu - 4322–4331 + 4323–4332 Pre-trained language models such as BERT have proven to be highly effective for natural language processing (NLP) tasks. However, the high demand for computing resources in training such models hinders their application in practice. In order to alleviate this resource hunger in large-scale model training, we propose a Patient Knowledge Distillation approach to compress an original large model (teacher) into an equally-effective lightweight shallow network (student). Different from previous knowledge distillation methods, which only use the output from the last layer of the teacher network for distillation, our student model patiently learns from multiple intermediate layers of the teacher model for incremental knowledge extraction, following two strategies: (i) PKD-Last: learning from the last k layers; and (ii) PKD-Skip: learning from every k layers. These two patient distillation schemes enable the exploitation of rich information in the teacher’s hidden layers, and encourage the student model to patiently learn from and imitate the teacher through a multi-layer distillation process. Empirically, this translates into improved results on multiple NLP tasks with a significant gain in training efficiency, without sacrificing model accuracy. D19-1441 10.18653/v1/D19-1441 @@ -5107,7 +5106,7 @@ Neural <fixed-case>G</fixed-case>aussian Copula for Variational Autoencoder Prince ZizhuangWang William YangWang - 4332–4342 + 4333–4343 Variational language models seek to estimate the posterior of latent variables with an approximated variational posterior. The model often assumes the variational posterior to be factorized even when the true posterior is not. The learned variational posterior under this assumption does not capture the dependency relationships over latent variables. We argue that this would cause a typical training problem called posterior collapse observed in all other variational language models. We propose Gaussian Copula Variational Autoencoder (VAE) to avert this problem. Copula is widely used to model correlation and dependencies of high-dimensional random variables, and therefore it is helpful to maintain the dependency relationships that are lost in VAE. The empirical results show that by modeling the correlation of latent variables explicitly using a neural parametric copula, we can avert this training difficulty while getting competitive results among all other VAE approaches. 
D19-1442 10.18653/v1/D19-1442 @@ -5119,7 +5118,7 @@ MakotoYamada Louis-PhilippeMorency RuslanSalakhutdinov - 4343–4352 + 4344–4353 Transformer is a powerful architecture that achieves superior performance on various sequence learning tasks, including neural machine translation, language understanding, and sequence prediction. At the core of the Transformer is the attention mechanism, which concurrently processes all inputs in the streams. In this paper, we present a new formulation of attention via the lens of the kernel. To be more precise, we realize that the attention can be seen as applying kernel smoother over the inputs with the kernel scores being the similarities between inputs. This new formulation gives us a better way to understand individual components of the Transformer’s attention, such as the better way to integrate the positional embedding. Another important advantage of our kernel-based formulation is that it paves the way to a larger space of composing Transformer’s attention. As an example, we propose a new variant of Transformer’s attention which models the input as a product of symmetric kernels. This approach achieves competitive performance to the current state of the art model with less computation. In our experiments, we empirically study different kernel construction strategies on two widely used tasks: neural machine translation and sequence prediction. D19-1443 10.18653/v1/D19-1443 @@ -5129,7 +5128,7 @@ JiaweiWu WenhanXiong William YangWang - 4353–4363 + 4354–4364 Many tasks in natural language processing can be viewed as multi-label classification problems. However, most of the existing models are trained with the standard cross-entropy loss function and use a fixed prediction policy (e.g., a threshold of 0.5) for all the labels, which completely ignores the complexity and dependencies among different labels. In this paper, we propose a meta-learning method to capture these complex label dependencies. More specifically, our method utilizes a meta-learner to jointly learn the training policies and prediction policies for different labels. The training policies are then used to train the classifier with the cross-entropy loss function, and the prediction policies are further implemented for prediction. Experimental results on fine-grained entity typing and text classification demonstrate that our proposed method can obtain more accurate multi-label classification results. D19-1444 10.18653/v1/D19-1444 @@ -5140,7 +5139,7 @@ AlexeyRomanov AnnaRogers AnnaRumshisky - 4364–4373 + 4365–4374 BERT-based architectures currently give state-of-the-art performance on many NLP tasks, but little is known about the exact mechanisms that contribute to its success. In the current work, we focus on the interpretation of self-attention, which is one of the fundamental underlying components of BERT. Using a subset of GLUE tasks and a set of handcrafted features-of-interest, we propose the methodology and carry out a qualitative and quantitative analysis of the information encoded by the individual BERT’s heads. Our findings suggest that there is a limited set of attention patterns that are repeated across different heads, indicating the overall model overparametrization. While different heads consistently use the same attention patterns, they have varying impact on performance across different tasks. We show that manually disabling attention in certain heads leads to a performance improvement over the regular fine-tuned BERT models. 
D19-1445 D19-1445.Attachment.zip @@ -5155,7 +5154,7 @@ TaoQin JianhuangLai Tie-YanLiu - 4374–4383 + 4375–4384 Neural machine translation, which achieves near human-level performance in some languages, strongly relies on the large amounts of parallel sentences, which hinders its applicability to low-resource language pairs. Recent works explore the possibility of unsupervised machine translation with monolingual data only, leading to much lower accuracy compared with the supervised one. Observing that weakly paired bilingual documents are much easier to collect than bilingual sentences, e.g., from Wikipedia, news websites or books, in this paper, we investigate training translation models with weakly paired bilingual documents. Our approach contains two components. 1) We provide a simple approach to mine implicitly bilingual sentence pairs from document pairs which can then be used as supervised training signals. 2) We leverage the topic consistency of two weakly paired documents and learn the sentence translation model by constraining the word distribution-level alignments. We evaluate our method on weakly paired documents from Wikipedia on six tasks, the widely used WMT16 German↔English, WMT13 Spanish↔English and WMT16 Romanian↔English translation tasks. We obtain 24.1/30.3, 28.1/27.6 and 30.1/27.6 BLEU points separately, outperforming previous results by more than 5 BLEU points in each direction and reducing the gap between unsupervised translation and supervised translation up to 50%. D19-1446 10.18653/v1/D19-1446 @@ -5165,7 +5164,7 @@ JasonLee KyunghyunCho DouweKiela - 4384–4394 + 4385–4395 Emergent multi-agent communication protocols are very different from natural language and not easily interpretable by humans. We find that agents that were initially pretrained to produce natural language can also experience detrimental language drift: when a non-linguistic reward is used in a goal-based task, e.g. some scalar success metric, the communication protocol may easily and radically diverge from natural language. We recast translation as a multi-agent communication game and examine auxiliary training constraints for their effectiveness in mitigating language drift. We show that a combination of syntactic (language model likelihood) and semantic (visual grounding) constraints gives the best communication performance, allowing pre-trained agents to retain English syntax while learning to accurately convey the intended meaning. D19-1447 10.18653/v1/D19-1447 @@ -5175,7 +5174,7 @@ ElenaVoita RicoSennrich IvanTitov - 4395–4405 + 4396–4406 We seek to understand how the representations of individual tokens and the structure of the learned feature space evolve between layers in deep neural networks under different learning objectives. We chose the Transformers for our analysis as they have been shown effective with various tasks, including machine translation (MT), standard left-to-right language models (LM) and masked language modeling (MLM). Previous work used black-box probing tasks to show that the representations learned by the Transformer differ significantly depending on the objective. In this work, we use canonical correlation analysis and mutual information estimators to study how information flows across Transformer layers and observe that the choice of the objective determines this process. &#13;
For example, as you go from bottom to top layers, information about the past in left-to-right language models vanishes and predictions about the future get formed. In contrast, for MLM, representations initially acquire information about the context around the token, partially forgetting the token identity and producing a more generalized token representation. The token identity then gets recreated at the top MLM layers. D19-1448 D19-1448.Attachment.zip @@ -5187,7 +5186,7 @@ GoranGlavaš RoiReichart AnnaKorhonen - 4406–4417 + 4407–4418 Recent efforts in cross-lingual word embedding (CLWE) learning have predominantly focused on fully unsupervised approaches that project monolingual embeddings into a shared cross-lingual space without any cross-lingual signal. The lack of any supervision makes such approaches conceptually attractive. Yet, their only core difference from (weakly) supervised projection-based CLWE methods is in the way they obtain a seed dictionary used to initialize an iterative self-learning procedure. The fully unsupervised methods have arguably become more robust, and their primary use case is CLWE induction for pairs of resource-poor and distant languages. In this paper, we question the ability of even the most robust unsupervised CLWE approaches to induce meaningful CLWEs in these more challenging settings. A series of bilingual lexicon induction (BLI) experiments with 15 diverse languages (210 language pairs) show that fully unsupervised CLWE methods still fail for a large number of language pairs (e.g., they yield zero BLI performance for 87/210 pairs). Even when they succeed, they never surpass the performance of weakly supervised methods (seeded with 500-1,000 translation pairs) using the same self-learning procedure in any BLI setup, and the gaps are often substantial. These findings call for revisiting the main motivations behind fully unsupervised CLWE methods. D19-1449 D19-1449.Attachment.zip @@ -5198,7 +5197,7 @@ HaozhouWang JamesHenderson PaolaMerlo - 4418–4429 + 4419–4430 Distributed representations of words which map each word to a continuous vector have proven useful in capturing important linguistic information not only in a single language but also across different languages. Current unsupervised adversarial approaches show that it is possible to build a mapping matrix that aligns two sets of monolingual word embeddings without high quality parallel data, such as a dictionary or a sentence-aligned corpus. However, without an additional step of refinement, the preliminary mapping learnt by these methods is unsatisfactory, leading to poor performance for typologically distant languages. In this paper, we propose a weakly-supervised adversarial training method to overcome this limitation, based on the intuition that mapping across languages is better done at the concept level than at the word level. We propose a concept-based adversarial training method which improves the performance of previous unsupervised adversarial methods for most languages, and especially for typologically distant language pairs. D19-1450 10.18653/v1/D19-1450 @@ -5211,7 +5210,7 @@ WeiLu JimmyLin XuSun - 4430–4440 + 4431–4441 Multilingual knowledge graphs (KGs), such as YAGO and DBpedia, represent entities in different languages. The task of cross-lingual entity alignment is to match entities in a source language with their counterparts in target languages. &#13;
In this work, we investigate embedding-based approaches to encode entities from multilingual KGs into the same vector space, where equivalent entities are close to each other. Specifically, we apply graph convolutional networks (GCNs) to combine multi-aspect information of entities, including topological connections, relations, and attributes of entities, to learn entity embeddings. To exploit the literal descriptions of entities expressed in different languages, we propose two uses of a pretrained multilingual BERT model to bridge cross-lingual gaps. We further propose two strategies to integrate GCN-based and BERT-based modules to boost performance. Extensive experiments on two benchmark datasets demonstrate that our method significantly outperforms existing systems. D19-1451 10.18653/v1/D19-1451 @@ -5221,7 +5220,7 @@ MitraMohtarami JamesGlass PreslavNakov - 4441–4451 + 4442–4452 We study cross-lingual stance detection, which aims to leverage labeled data in one language to identify the relative perspective (or stance) of a given document with respect to a claim in a different target language. In particular, we introduce a novel contrastive language adaptation approach applied to memory networks, which ensures accurate alignment of stances in the source and target languages, and can effectively deal with the challenge of limited labeled data in the target language. The evaluation results on public benchmark datasets and comparison against current state-of-the-art approaches demonstrate the effectiveness of our approach. D19-1452 10.18653/v1/D19-1452 @@ -5232,7 +5231,7 @@ StephanPeitz UdhyakumarNallasamy MatthiasPaulik - 4452–4461 + 4453–4462 The state of the art in machine translation (MT) is governed by neural approaches, which typically provide superior translation accuracy over statistical approaches. However, on the closely related task of word alignment, traditional statistical word alignment models often remain the go-to solution. In this paper, we present an approach to train a Transformer model to produce both accurate translations and alignments. We extract discrete alignments from the attention probabilities learnt during regular neural machine translation model training and leverage them in a multi-task framework to optimize towards translation and alignment objectives. We demonstrate that our approach produces competitive results compared to GIZA++ trained IBM alignment models without sacrificing translation accuracy and outperforms previous attempts on Transformer model based word alignment. Finally, by incorporating IBM model alignments into our multi-task training, we report significantly better alignment accuracies compared to GIZA++ on three publicly available data sets. D19-1453 D19-1453.Attachment.zip @@ -5245,7 +5244,7 @@ DerekChen RonanLe Bras YejinChoi - 4462–4472 + 4463–4473 We introduce Social IQa, the first large-scale benchmark for commonsense reasoning about social situations. Social IQa contains 38,000 multiple choice questions for probing emotional and social intelligence in a variety of everyday situations (e.g., Q: “Jordan wanted to tell Tracy a secret, so Jordan leaned towards Tracy. Why did Jordan do this?” A: “Make sure no one else could hear”). Through crowdsourcing, we collect commonsense questions along with correct and incorrect answers about social interactions, using a new framework that mitigates stylistic artifacts in incorrect answers by asking workers to provide the right answer to a different but related question. 
Empirical results show that our benchmark is challenging for existing question-answering models based on pretrained language models, compared to human performance (>20% gap). Notably, we further establish Social IQa as a resource for transfer learning of commonsense knowledge, achieving state-of-the-art performance on multiple commonsense reasoning tasks (Winograd Schemas, COPA). D19-1454 10.18653/v1/D19-1454 @@ -5254,7 +5253,7 @@ Self-Assembling Modular Networks for Interpretable Multi-Hop Reasoning YichenJiang MohitBansal - 4473–4483 + 4474–4484 Multi-hop QA requires a model to connect multiple pieces of evidence scattered in a long context to answer the question. The recently proposed HotpotQA (Yang et al., 2018) dataset is comprised of questions embodying four different multi-hop reasoning paradigms (two bridge entity setups, checking multiple properties, and comparing two entities), making it challenging for a single neural network to handle all four. In this work, we present an interpretable, controller-based Self-Assembling Neural Modular Network (Hu et al., 2017, 2018) for multi-hop reasoning, where we design four novel modules (Find, Relocate, Compare, NoOp) to perform unique types of language reasoning. Based on a question, our layout controller RNN dynamically infers a series of reasoning modules to construct the entire network. Empirically, we show that our dynamic, multi-hop modular network achieves significant improvements over the static, single-hop baseline (on both regular and adversarial evaluation). We further demonstrate the interpretability of our model via three analyses. First, the controller can softly decompose the multi-hop question into multiple single-hop sub-questions to promote compositional reasoning behavior of the main network. Second, the controller can predict layouts that conform to the layouts designed by human experts. Finally, the intermediate module can infer the entity that connects two distantly-located supporting facts by addressing the sub-question from the controller. D19-1455 10.18653/v1/D19-1455 @@ -5265,7 +5264,7 @@ IgnacioCases LauriKarttunen ChristopherPotts - 4484–4494 + 4485–4495 Deep learning models for semantics are generally evaluated using naturalistic corpora. Adversarial testing methods, in which models are evaluated on new examples with known semantic properties, have begun to reveal that good performance at these naturalistic tasks can hide serious shortcomings. However, we should insist that these evaluations be fair – that the models are given data sufficient to support the requisite kinds of generalization. In this paper, we define and motivate a formal notion of fairness in this sense. We then apply these ideas to natural language inference by constructing very challenging but provably fair artificial datasets and showing that standard neural models fail to generalize in the required ways; only task-specific models that jointly compose the premise and hypothesis are able to achieve high performance, and even these models do not solve the task perfectly. D19-1456 D19-1456.Attachment.zip @@ -5278,7 +5277,7 @@ AntoineBosselut Wen-tauYih PeterClark - 4495–4504 + 4496–4505 Our goal is to better comprehend procedural text, e.g., a paragraph about photosynthesis, by not only predicting what happens, but *why* some actions need to happen before others. Our approach builds on a prior process comprehension framework for predicting actions’ effects, to also identify subsequent steps that those effects enable. 
We present our new model (XPAD) that biases effect predictions towards those that (1) explain more of the actions in the paragraph and (2) are more plausible with respect to background knowledge. We also extend an existing benchmark dataset for procedural text comprehension, ProPara, by adding the new task of explaining actions by predicting their dependencies. We find that XPAD significantly outperforms prior systems on this task, while maintaining the performance on the original task in ProPara. The dataset is available at http://data.allenai.org/propara D19-1457 10.18653/v1/D19-1457 @@ -5290,7 +5289,7 @@ JinDong JoellePineau William L.Hamilton - 4505–4514 + 4506–4515 The recent success of natural language understanding (NLU) systems has been troubled by results highlighting the failure of these models to generalize in a systematic and robust way. In this work, we introduce a diagnostic benchmark suite, named CLUTRR, to clarify some key issues related to the robustness and systematicity of NLU systems. Motivated by the classic work on inductive logic programming, CLUTRR requires that an NLU system infer kinship relations between characters in short stories. Successful performance on this task requires both extracting relationships between entities, as well as inferring the logical rules governing these relationships. CLUTRR allows us to precisely measure a model’s ability for systematic generalization by evaluating on held-out combinations of logical rules, and allows us to evaluate a model’s robustness by adding curated noise facts. Our empirical results highlight a substantial performance gap between state-of-the-art NLU models (e.g., BERT and MAC) and a graph neural network model that works directly with symbolic inputs—with the graph-based model exhibiting both stronger generalization and greater robustness. D19-1458 D19-1458.Attachment.zip @@ -5308,7 +5307,7 @@ AmitDubey Kyu-YoungKim AndyCedilnik - 4515–4524 + 4516–4525 A significant barrier to progress in data-driven approaches to building dialog systems is the lack of high quality, goal-oriented conversational data. To help satisfy this elementary requirement, we introduce the initial release of the Taskmaster-1 dataset which includes 13,215 task-based dialogs comprising six domains. Two procedures were used to create this collection, each with unique advantages. The first involves a two-person, spoken “Wizard of Oz” (WOz) approach in which trained agents and crowdsourced workers interact to complete the task while the second is “self-dialog” in which crowdsourced workers write the entire dialog themselves. We do not restrict the workers to detailed scripts or to a small knowledge base and hence we observe that our dataset contains more realistic and diverse conversations in comparison to existing datasets. We offer several baseline models including state of the art neural seq2seq architectures with benchmark performance as well as qualitative human evaluations. Dialogs are labeled with API calls and arguments, a simple and cost effective approach which avoids the requirement of complex annotation schema. The layer of abstraction between the dialog model and the service provider API allows for a given model to interact with multiple services that provide similar functionality. Finally, the dataset will evoke interest in written vs. spoken language, discourse patterns, error handling and other linguistic phenomena related to dialog system research, development and design. &#13;
D19-1459 10.18653/v1/D19-1459 @@ -5322,7 +5321,7 @@ YiZhang AdelYoussef MonaDiab - 4525–4535 + 4526–4536 The need for high-quality, large-scale, goal-oriented dialogue datasets continues to grow as virtual assistants become increasingly wide-spread. However, publicly available datasets useful for this area are limited either in their size, linguistic diversity, domain coverage, or annotation granularity. In this paper, we present strategies toward curating and annotating large scale goal oriented dialogue data. We introduce the MultiDoGO dataset to overcome these limitations. With a total of over 81K dialogues harvested across six domains, MultiDoGO is over 8 times the size of MultiWOZ, the other largest comparable dialogue dataset currently available to the public. Over 54K of these harvested conversations are annotated for intent classes and slot labels. We adopt a Wizard-of-Oz approach wherein a crowd-sourced worker (the “customer”) is paired with a trained annotator (the “agent”). The data curation process was controlled via biases to ensure a diversity in dialogue flows following variable dialogue policies. We provide distinct class label tags for agents vs. customer utterances, along with applicable slot labels. We also compare and contrast our strategies on annotation granularity, i.e. turn vs. sentence level. Furthermore, we compare and contrast annotations curated by leveraging professional annotators vs the crowd. We believe our strategies for eliciting and annotating such a dialogue dataset scales across modalities and domains and potentially languages in the future. To demonstrate the efficacy of our devised strategies we establish neural baselines for classification on the agent and customer utterances as well as slot labeling for each domain. D19-1460 D19-1460.Attachment.pdf @@ -5334,7 +5333,7 @@ SamuelHumeau BharathChintagunta JasonWeston - 4536–4545 + 4537–4546 The detection of offensive language in the context of a dialogue has become an increasingly important application of natural language processing. The detection of trolls in public forums (Galán-García et al., 2016), and the deployment of chatbots in the public domain (Wolf et al., 2017) are two examples that show the necessity of guarding against adversarially offensive behavior on the part of humans. In this work, we develop a training scheme for a model to become robust to such human attacks by an iterative build it, break it, fix it scheme with humans and models in the loop. In detailed experiments we show this approach is considerably more robust than previous systems. Further, we show that offensive language used within a conversation critically depends on the dialogue context, and cannot be viewed as a single sentence offensive detection task as in most previous work. Our newly collected tasks and methods are all made open source and publicly available. D19-1461 D19-1461.Attachment.zip @@ -5346,7 +5345,7 @@ DeyiXiong BonnieWebber ChangjianHu - 4546–4556 + 4547–4557 Ellipsis and co-reference are common and ubiquitous especially in multi-turn dialogues. In this paper, we treat the resolution of ellipsis and co-reference in dialogue as a problem of generating omitted or referred expressions from the dialogue context. We therefore propose a unified end-to-end Generative Ellipsis and CO-reference Resolution model (GECOR) in the context of dialogue. The model can generate a new pragmatically complete user utterance by alternating the generation and copy mode for each user utterance. 
A multi-task learning framework is further proposed to integrate the GECOR into an end-to-end task-oriented dialogue. In order to train both the GECOR and the multi-task learning framework, we manually construct a new dataset on the basis of the public dataset CamRest676 with both ellipsis and co-reference annotation. On this dataset, intrinsic evaluations on the resolution of ellipsis and co-reference show that the GECOR model significantly outperforms the sequence-to-sequence (seq2seq) baseline model in terms of EM, BLEU and F1 while extrinsic evaluations on the downstream dialogue task demonstrate that our multi-task learning framework with GECOR achieves a higher success rate of task completion than TSCP, a state-of-the-art end-to-end task-oriented dialogue model. D19-1462 10.18653/v1/D19-1462 @@ -5358,7 +5357,7 @@ FengJi HaiqingChen YinZhang - 4557–4566 + 4558–4567 How to incorporate external knowledge into a neural dialogue model is critically important for dialogue systems to behave like real humans. To handle this problem, memory networks are usually a great choice and a promising way. However, existing memory networks do not perform well when leveraging heterogeneous information from different sources. In this paper, we propose novel and versatile external memory networks called Heterogeneous Memory Networks (HMNs), to simultaneously utilize user utterances, dialogue history and background knowledge tuples. In our method, historical sequential dialogues are encoded and stored into the context-aware memory enhanced by gating mechanism while grounding knowledge tuples are encoded and stored into the context-free memory. During decoding, the decoder augmented with HMNs recurrently selects each word in one response utterance from these two memories and a general vocabulary. Experimental results on multiple real-world datasets show that HMNs significantly outperform the state-of-the-art data-driven task-oriented dialogue models in most domains. D19-1463 10.18653/v1/D19-1463 @@ -5368,8 +5367,8 @@ ChenZhang QiuchiLi DaweiSong - 4567–4577 - Due to their inherent capability in semantic alignment of aspects and their context words, attention mechanism and Convolutional Neural Networks (CNNs) are widely applied for aspect-based sentiment classification. However, these models lack a mechanism to account for relevant syntactical constraints and long-range word dependencies, and hence may mistakenly recognize syntactically irrelevant contextual words as clues for judging aspect sentiment. To tackle this problem, we propose to build a Graph Convolutional Network (GCN) over the dependency tree of a sentence to exploit syntactical information and word dependencies. Based on it, a novel aspect-specific sentiment classification framework is raised. Experiments on three benchmarking collections illustrate that our proposed model has comparable effectiveness to a range of state-of-the-art modelsCode and preprocessed datasets are available at https://github.com/GeneZC/ASGCNhttps://github.com/GeneZC/ASGCN., and further demonstrate that both syntactical information and long-range word dependencies are properly captured by the graph convolution structure. + 4568–4578 + Due to their inherent capability in semantic alignment of aspects and their context words, attention mechanism and Convolutional Neural Networks (CNNs) are widely applied for aspect-based sentiment classification. &#13;
However, these models lack a mechanism to account for relevant syntactical constraints and long-range word dependencies, and hence may mistakenly recognize syntactically irrelevant contextual words as clues for judging aspect sentiment. To tackle this problem, we propose to build a Graph Convolutional Network (GCN) over the dependency tree of a sentence to exploit syntactical information and word dependencies. Based on it, a novel aspect-specific sentiment classification framework is raised. Experiments on three benchmarking collections illustrate that our proposed model has comparable effectiveness to a range of state-of-the-art models, and further demonstrate that both syntactical information and long-range word dependencies are properly captured by the graph convolution structure. D19-1464 10.18653/v1/D19-1464 @@ -5381,7 +5380,7 @@ LingzhiWang XixinWu Kam-FaiWong - 4578–4588 + 4579–4589 Aspect words, indicating opinion targets, are essential in expressing and understanding human opinions. To identify aspects, most previous efforts focus on using sequence tagging models trained on human-annotated data. This work studies unsupervised aspect extraction and explores how words appear in global context (on sentence level) and local context (conveyed by neighboring words). We propose a novel neural model, capable of coupling global and local representation to discover aspect words. Experimental results on two benchmarks, laptop and restaurant reviews, show that our model significantly outperforms the state-of-the-art models from previous studies evaluated with varying metrics. Analysis on model output shows our ability to learn meaningful and coherent aspect representations. We further investigate how words distribute in global and local context, and find that aspect and non-aspect words do exhibit different context, interpreting our superiority in unsupervised aspect extraction. D19-1465 10.18653/v1/D19-1465 @@ -5394,7 +5393,7 @@ LidongBing YuZhang QiangYang - 4589–4599 + 4590–4600 Joint extraction of aspects and sentiments can be effectively formulated as a sequence labeling problem. However, such formulation hinders the effectiveness of supervised methods due to the lack of annotated sequence data in many domains. To address this issue, we firstly explore an unsupervised domain adaptation setting for this task. Prior work can only use common syntactic relations between aspect and opinion words to bridge the domain gaps, which highly relies on external linguistic resources. To resolve it, we propose a novel Selective Adversarial Learning (SAL) method to align the inferred correlation vectors that automatically capture their latent relations. The SAL method can dynamically learn an alignment weight for each word such that more important words can possess higher alignment weights to achieve fine-grained (word-level) adaptation. Empirically, extensive experiments demonstrate the effectiveness of the proposed SAL method. D19-1466 D19-1466.Attachment.pdf @@ -5409,7 +5408,7 @@ ZhongSu RenhongCheng XiaoweiShen - 4600–4609 + 4601–4610 Aspect level sentiment classification is a fine-grained sentiment analysis task. To detect the sentiment towards a particular aspect in a sentence, previous studies have developed various attention-based methods for generating aspect-specific sentence representations. However, the attention may inherently introduce noise and downgrade the performance. &#13;
In this paper, we propose constrained attention networks (CAN), a simple yet effective solution, to regularize the attention for multi-aspect sentiment analysis, which alleviates the drawback of the attention mechanism. Specifically, we introduce orthogonal regularization on multiple aspects and sparse regularization on each single aspect. Experimental results on two public datasets demonstrate the effectiveness of our approach. We further extend our approach to multi-task settings and outperform the state-of-the-art methods. D19-1467 D19-1467.Attachment.zip @@ -5420,7 +5419,7 @@ GiannisKaramanolakis DanielHsu LuisGravano - 4610–4620 + 4611–4621 User-generated reviews can be decomposed into fine-grained segments (e.g., sentences, clauses), each evaluating a different aspect of the principal entity (e.g., price, quality, appearance). Automatically detecting these aspects can be useful for both users and downstream opinion mining applications. Current supervised approaches for learning aspect classifiers require many fine-grained aspect labels, which are labor-intensive to obtain. And, unfortunately, unsupervised topic models often fail to capture the aspects of interest. In this work, we consider weakly supervised approaches for training aspect classifiers that only require the user to provide a small set of seed words (i.e., weakly positive indicators) for the aspects of interest. First, we show that current weakly supervised approaches fail to leverage the predictive power of seed words for aspect detection. Next, we propose a student-teacher approach that effectively leverages seed words in a bag-of-words classifier (teacher); in turn, we use the teacher to train a second model (student) that is potentially more powerful (e.g., a neural network that uses pre-trained word embeddings). Finally, we show that iterative co-training can be used to cope with noisy seed words, leading to both improved teacher and student models. Our proposed approach consistently outperforms previous weakly supervised approaches (by 14.1 absolute F1 points on average) in six different domains of product reviews and six multilingual datasets of restaurant reviews. D19-1468 D19-1468.Attachment.zip @@ -5434,7 +5433,7 @@ XiaoLin DanJurafsky AjayDivakaran - 4621–4631 + 4622–4632 Computing author intent from multimodal data like Instagram posts requires modeling a complex relationship between text and image. For example, a caption might evoke an ironic contrast with the image, so neither caption nor image is a mere transcript of the other. Instead they combine—via what has been called meaning multiplication (Bateman et al.)—to create a new meaning that has a more complex relation to the literal meanings of text and image. Here we introduce a multimodal dataset of 1299 Instagram posts labeled for three orthogonal taxonomies: the authorial intent behind the image-caption pair, the contextual relationship between the literal meanings of the image and caption, and the semiotic relationship between the signified meanings of the image and caption. We build a baseline deep multimodal classifier to validate the taxonomy, showing that employing both text and image improves intent detection by 9.6 compared to using only the image modality, demonstrating the commonality of non-intersective meaning multiplication. The gain with multimodality is greatest when the image and caption diverge semiotically. Our dataset offers a new resource for the study of the rich meanings that result from pairing text and image. &#13;
D19-1469 10.18653/v1/D19-1469 @@ -5445,7 +5444,7 @@ JingLi LuWang Kam-FaiWong - 4632–4642 + 4633–4643 The prevalent use of social media leads to a vast amount of online conversations being produced on a daily basis. It presents a concrete challenge for individuals to better discover and engage in social media discussions. In this paper, we present a novel framework to automatically recommend conversations to users based on their prior conversation behaviors. Built on neural collaborative filtering, our model explores deep semantic features that measure how a user’s preferences match an ongoing conversation’s context. Furthermore, to identify salient characteristics from interleaving user interactions, our model incorporates graph-structured networks, where both replying relations and temporal features are encoded as conversation context. Experimental results on two large-scale datasets collected from Twitter and Reddit show that our model yields better performance than previous state-of-the-art models, which only utilize lexical features and ignore past user interactions in the conversations. D19-1470 10.18653/v1/D19-1470 @@ -5457,7 +5456,7 @@ HaolinJin AmbreenNazir LingSun - 4643–4652 + 4644–4653 Recently, neural networks based on multi-task learning have achieved promising performance on fake news detection, which focuses on learning shared features among tasks as complementarity features to serve different tasks. However, in most of the existing approaches, the shared features are completely assigned to different tasks without selection, which may lead to some useless and even adverse features integrated into specific tasks. In this paper, we design a sifted multi-task learning method with a selected sharing layer for fake news detection. The selected sharing layer adopts gate mechanism and attention mechanism to filter and select shared feature flows between tasks. Experiments on two public and widely used competition datasets, i.e. RumourEval and PHEME, demonstrate that our proposed method achieves the state-of-the-art performance and boosts the F1-score by more than 0.87% and 1.31%, respectively. D19-1471 10.18653/v1/D19-1471 @@ -5468,7 +5467,7 @@ RenatoFerreira Pinto Junior GraemeHirst YangXu - 4653–4662 + 4654–4663 We present a text-based framework for investigating moral sentiment change of the public via longitudinal corpora. Our framework is based on the premise that language use can inform people’s moral perception toward right or wrong, and we build our methodology by exploring moral biases learned from diachronic word embeddings. We demonstrate how a parameter-free model supports inference of historical shifts in moral sentiment toward concepts such as slavery and democracy over centuries at three incremental levels: moral relevance, moral polarity, and fine-grained moral dimensions. We apply this methodology to visualizing moral time courses of individual concepts and analyzing the relations between psycholinguistic variables and rates of moral sentiment change at scale. Our work offers opportunities for applying natural language processing toward characterizing moral sentiment change in society. D19-1472 D19-1472.Attachment.zip @@ -5479,7 +5478,7 @@ BeiYu YingyaLi JunWang - 4663–4673 + 4664–4674 Causal interpretation of correlational findings from observational studies has been a major type of misinformation in science communication. &#13;
Prior studies on identifying inappropriate use of causal language relied on manual content analysis, which is not scalable for examining a large volume of science publications. In this study, we first annotated a corpus of over 3,000 PubMed research conclusion sentences, then developed a BERT-based prediction model that classifies conclusion sentences into “no relationship”, “correlational”, “conditional causal”, and “direct causal” categories, achieving an accuracy of 0.90 and a macro-F1 of 0.88. We then applied the prediction model to measure the causal language use in the research conclusions of about 38,000 observational studies in PubMed. The prediction result shows that 21.7% of studies used direct causal language exclusively in their conclusions, and 32.4% used some direct causal language. We also found that the ratio of causal language use differs among authors from different countries, challenging the notion of a shared consensus on causal language use in the global science community. Our prediction model could also be used to help identify the inappropriate use of causal language in science publications. D19-1473 10.18653/v1/D19-1473 @@ -5491,7 +5490,7 @@ HongmingZhang YangqiuSong Dit-YanYeung - 4674–4683 + 4675–4684 Current research on hate speech analysis is typically oriented towards monolingual and single classification tasks. In this paper, we present a new multilingual multi-aspect hate speech analysis dataset and use it to test the current state-of-the-art multilingual multitask learning approaches. We evaluate our dataset in various classification settings, then we discuss how to leverage our annotations in order to improve hate speech detection and classification in general. D19-1474 10.18653/v1/D19-1474 @@ -5505,7 +5504,7 @@ CasperHansen ChristianHansen Jakob GrueSimonsen - 4684–4696 + 4685–4697 We contribute the largest publicly available dataset of naturally occurring factual claims for the purpose of automatic claim verification. It is collected from 26 fact checking websites in English, paired with textual sources and rich metadata, and labelled for veracity by human expert journalists. We present an in-depth analysis of the dataset, highlighting characteristics and challenges. Further, we present results for automatic veracity prediction, both with established baselines and with a novel method for joint ranking of evidence pages and predicting veracity that outperforms all baselines. Significant performance increases are achieved by encoding evidence, and by modelling metadata. Our best-performing model achieves a Macro F1 of 49.2%, showing that this is a challenging testbed for claim veracity prediction. D19-1475 10.18653/v1/D19-1475 @@ -5516,7 +5515,7 @@ QinliangSu XiaojunQuan WeijiaZhang - 4697–4705 + 4698–4706 Textual network embeddings aim to learn a low-dimensional representation for every node in the network so that both the structural and textual information from the networks can be well preserved in the representations. Traditionally, the structural and textual embeddings were learned by models that rarely take the mutual influences between them into account. In this paper, a deep neural architecture is proposed to effectively fuse the two kinds of information into one representation. The novelties of the proposed architecture are manifested in the aspects of a newly defined objective function, the complementary information fusion method for structural and textual features, and the mutual gate mechanism for textual feature extraction. &#13;
Experimental results show that the proposed model outperforms the compared methods on all three datasets. D19-1476 10.18653/v1/D19-1476 @@ -5527,7 +5526,7 @@ DiegoMarcheggiani SabineSchulte im Walde RaquelFernández - 4706–4716 + 4707–4717 Information about individuals can help to better understand what they say, particularly in social media where texts are short. Current approaches to modelling social media users pay attention to their social connections, but exploit this information in a static way, treating all connections uniformly. This ignores the fact, well known in sociolinguistics, that an individual may be part of several communities which are not equally relevant in all communicative situations. We present a model based on Graph Attention Networks that captures this observation. It dynamically explores the social graph of a user, computes a user representation given the most relevant connections for a target task, and combines it with linguistic information to make a prediction. We apply our model to three different tasks, evaluate it against alternative models, and analyse the results extensively, showing that it significantly outperforms other current methods. D19-1477 D19-1477.Attachment.pdf @@ -5539,7 +5538,7 @@ BareaSinno AlexRosenfeld Junyi JessyLi - 4717–4729 + 4718–4730 Insightful findings in political science often require researchers to analyze documents of a certain subject or type, yet these documents are usually contained in large corpora that do not distinguish between pertinent and non-pertinent documents. In contrast, we can find corpora that label relevant documents but have limitations (e.g., from a single source or era), preventing their use for political science research. To bridge this gap, we present adaptive ensembling, an unsupervised domain adaptation framework, equipped with a novel text classification model and time-aware training to ensure our methods work well with diachronic corpora. Experiments on an expert-annotated dataset show that our framework outperforms strong benchmarks. Further analysis indicates that our methods are more stable, learn better representations, and extract cleaner corpora for fine-grained analysis. D19-1478 D19-1478.Attachment.pdf @@ -5549,7 +5548,7 @@ A Hierarchical Location Prediction Neural Network for Twitter User Geolocation BinxuanHuang KathleenCarley - 4731–4741 + 4732–4742 Accurate estimation of user location is important for many online services. Previous neural network based methods largely ignore the hierarchical structure among locations. In this paper, we propose a hierarchical location prediction neural network for Twitter user geolocation. Our model first predicts the home country for a user, then uses the country result to guide the city-level prediction. In addition, we employ a character-aware word embedding layer to overcome the noisy information in tweets. With the feature fusion layer, our model can accommodate various feature combinations and achieves state-of-the-art results over three commonly used benchmarks under different feature settings. It not only improves the prediction accuracy but also greatly reduces the mean error distance. D19-1480 10.18653/v1/D19-1480 @@ -5558,7 +5557,7 @@ Trouble on the Horizon: Forecasting the Derailment of Online Conversations as they Develop Jonathan P.Chang CristianDanescu-Niculescu-Mizil - 4742–4753 + 4743–4754 Online discussions often derail into toxic exchanges between participants. &#13;
Recent efforts mostly focused on detecting antisocial behavior after the fact, by analyzing single comments in isolation. To provide more timely notice to human moderators, a system needs to preemptively detect that a conversation is heading towards derailment before it actually turns toxic. This means modeling derailment as an emerging property of a conversation rather than as an isolated utterance-level event. Forecasting emerging conversational properties, however, poses several inherent modeling challenges. First, since conversations are dynamic, a forecasting model needs to capture the flow of the discussion, rather than properties of individual comments. Second, real conversations have an unknown horizon: they can end or derail at any time; thus a practical forecasting model needs to assess the risk in an online fashion, as the conversation develops. In this work we introduce a conversational forecasting model that learns an unsupervised representation of conversational dynamics and exploits it to predict future derailment as the conversation develops. By applying this model to two new diverse datasets of online conversations with labels for antisocial events, we show that it outperforms state-of-the-art systems at forecasting derailment. D19-1481 10.18653/v1/D19-1481 @@ -5570,7 +5569,7 @@ YinyinLiu ElizabethBelding William YangWang - 4754–4763 + 4755–4764 Countering online hate speech is a critical yet challenging task, but one which can be aided by the use of Natural Language Processing (NLP) techniques. Previous research has primarily focused on the development of NLP methods to automatically and effectively detect online hate speech while disregarding further action needed to calm and discourage individuals from using hate speech in the future. In addition, most existing hate speech datasets treat each post as an isolated instance, ignoring the conversational context. In this paper, we propose a novel task of generative hate speech intervention, where the goal is to automatically generate responses to intervene during online conversations that contain hate speech. As a part of this work, we introduce two fully-labeled large-scale hate speech intervention datasets collected from Gab and Reddit. These datasets provide conversation segments, hate speech labels, as well as intervention responses written by Mechanical Turk Workers. In this paper, we also analyze the datasets to understand the common intervention strategies and explore the performance of common automatic response generation methods on these new datasets to provide a benchmark for future research. D19-1482 10.18653/v1/D19-1482 @@ -5582,7 +5581,7 @@ DesmondPatton CharlotteSelous KathyMcKeown - 4764–4774 + 4765–4775 Gang-involved youth in cities such as Chicago sometimes post on social media to express their aggression towards rival gangs and previous research has demonstrated that a deep learning approach can predict aggression and loss in posts. To address the possibility of bias in this sensitive application, we developed an approach to systematically interpret the state of the art model. We found, surprisingly, that it frequently bases its predictions on stop words such as “a” or “on”, an approach that could harm social media users who have no aggressive intentions. To tackle this bias, domain experts annotated the rationales, highlighting words that explain why a tweet is labeled as “aggression”. 
These new annotations enable us to quantitatively measure how justified the model predictions are, and build models that drastically reduce bias. Our study shows that in high stake scenarios, accuracy alone cannot guarantee a good system and we need new evaluation methods. D19-1483 D19-1483.Attachment.pdf @@ -5593,7 +5592,7 @@ EllaRabinovich MasihSultani SuzanneStevenson - 4775–4785 + 4776–4786 In contrast to many decades of research on oral code-switching, the study of written multilingual productions has only recently enjoyed a surge of interest. Many open questions remain regarding the sociolinguistic underpinnings of written code-switching, and progress has been limited by a lack of suitable resources. We introduce a novel, large, and diverse dataset of written code-switched productions, curated from topical threads of multiple bilingual communities on the Reddit discussion platform, and explore questions that were mainly addressed in the context of spoken language thus far. We investigate whether findings in oral code-switching concerning content and style, as well as speaker proficiency, are carried over into written code-switching in discussion forums. The released dataset can further facilitate a range of research and practical activities. D19-1484 D19-1484.Attachment.zip @@ -5604,7 +5603,7 @@ PenghuiWei NanXu WenjiMao - 4786–4797 + 4787–4798 Automatically verifying rumorous information has become an important and challenging task in natural language processing and social media analytics. Previous studies reveal that people’s stances towards rumorous messages can provide indicative clues for identifying the veracity of rumors, and thus determining the stances of public reactions is a crucial preceding step for rumor veracity prediction. In this paper, we propose a hierarchical multi-task learning framework for jointly predicting rumor stance and veracity on Twitter, which consists of two components. The bottom component of our framework classifies the stances of tweets in a conversation discussing a rumor via modeling the structural property based on a novel graph convolutional network. The top component predicts the rumor veracity by exploiting the temporal dynamics of stance evolution. Experimental results on two benchmark datasets show that our method outperforms previous methods in both rumor stance classification and veracity prediction. D19-1485 10.18653/v1/D19-1485 @@ -5618,7 +5617,7 @@ QimaiLi Xiao-MingWu Albert Y.S.Lam - 4798–4808 + 4799–4809 Intent classification is an important building block of dialogue systems. With the burgeoning of conversational AI, existing systems are not capable of handling numerous fast-emerging intents, which motivates zero-shot intent classification. Nevertheless, research on this problem is still in the incipient stage and few methods are available. A recently proposed zero-shot intent classification method, IntentCapsNet, has been shown to achieve state-of-the-art performance. However, it has two unaddressed limitations: (1) it cannot deal with polysemy when extracting semantic capsules; (2) it hardly recognizes the utterances of unseen intents in the generalized zero-shot intent classification setting. To overcome these limitations, we propose to reconstruct capsule networks for zero-shot intent classification. First, we introduce a dimensional attention mechanism to fight against polysemy. 
Second, we reconstruct the transformation matrices for unseen intents by utilizing abundant latent information of the labeled utterances, which significantly improves the model generalization ability. Experimental results on two task-oriented dialogue datasets in different languages show that our proposed method outperforms IntentCapsNet and other strong baselines. D19-1486 10.18653/v1/D19-1486 @@ -5630,7 +5629,7 @@ YangSong TaoZhang Ji-RongWen - 4809–4819 + 4810–4820 Person-job fit has been an important task which aims to automatically match job positions with suitable candidates. Previous methods mainly focus on solving the match task in single-domain setting, which may not work well when labeled data is limited. We study the domain adaptation problem for person-job fit. We first propose a deep global match network for capturing the global semantic interactions between two sentences from a job posting and a candidate resume respectively. Furthermore, we extend the match network and implement domain adaptation in three levels, sentence-level representation, sentence-level match, and global match. Extensive experiment results on a large real-world dataset consisting of six domains have demonstrated the effectiveness of the proposed model, especially when there is not sufficient labeled data. D19-1487 10.18653/v1/D19-1487 @@ -5642,7 +5641,7 @@ ChuanShi HouyeJi XiaoliLi - 4820–4829 + 4821–4830 Short text classification has found rich and critical applications in news and tweet tagging to help users find relevant information. Due to lack of labeled training data in many practical use cases, there is a pressing need for studying semi-supervised short text classification. Most existing studies focus on long texts and achieve unsatisfactory performance on short texts due to the sparsity and limited labeled data. In this paper, we propose a novel heterogeneous graph neural network based method for semi-supervised short text classification, leveraging full advantage of few labeled data and large unlabeled data through information propagation along the graph. In particular, we first present a flexible HIN (heterogeneous information network) framework for modeling the short texts, which can integrate any type of additional information as well as capture their relations to address the semantic sparsity. Then, we propose Heterogeneous Graph ATtention networks (HGAT) to embed the HIN for short text classification based on a dual-level attention mechanism, including node-level and type-level attentions. The attention mechanism can learn the importance of different neighboring nodes as well as the importance of different node (information) types to a current node. Extensive experimental results have demonstrated that our proposed model outperforms state-of-the-art methods across six benchmark datasets significantly. D19-1488 10.18653/v1/D19-1488 @@ -5662,7 +5661,7 @@ SeanKross MichelleMazurek HalDaumé III - 4830–4841 + 4831–4842 The readability of a digital text can influence people’s ability to learn new things about a range of topics from digital resources (e.g., Wikipedia, WebMD). Readability also impacts search rankings, and is used to evaluate the performance of NLP systems. Despite this, we lack a thorough understanding of how to validly measure readability at scale, especially for domain-specific texts. In this work, we present a comparison of the validity of well-known readability measures and introduce a novel approach, Smart Cloze, which is designed to address shortcomings of existing measures. &#13;
We compare these approaches across four different corpora: crowdworker-generated stories, Wikipedia articles, security and privacy advice, and health information. On these corpora, we evaluate the convergent and content validity of each measure, and detail tradeoffs in score precision, domain-specificity, and participant burden. These results provide a foundation for more accurate readability measurements and better evaluation of new natural-language-processing systems and tools. D19-1489 10.18653/v1/D19-1489 @@ -5673,7 +5672,7 @@ LuZong YikangYang JionglongSu - 4842–4851 + 4843–4852 With the development of NLP technologies, news can be automatically categorized and labeled according to a variety of characteristics, at the same time be represented as low dimensional embeddings. However, it lacks a systematic approach that effectively integrates the inherited features and inter-textual knowledge of news to represent the collective information with a dense vector. With the aim of filling this gap, the News2vec model is proposed to allow the distributed representation of news taking into account its associated features. To describe the cross-document linkages between news, a network consisting of news and its attributes is constructed. Moreover, the News2vec model treats the news node as a bag of features by developing the Subnode model. Based on the biased random walk and the skip-gram model, each news feature is mapped to a vector, and the news is thus represented as the sum of its features. This approach offers an easy solution to create embeddings for unseen news nodes based on its attributes. To evaluate our model, dimension reduction plots and correlation heat-maps are created to visualize the news vectors, together with the application of two downstream tasks, the stock movement prediction and news recommendation. By comparing with other established text/sentence embedding models, we show that News2vec achieves state-of-the-art performance on these news-related tasks. D19-1490 10.18653/v1/D19-1490 @@ -5682,7 +5681,7 @@ Recursive Context-Aware Lexical Simplification SianGooding EkaterinaKochmar - 4852–4862 + 4853–4863 This paper presents a novel architecture for recursive context-aware lexical simplification, REC-LS, that is capable of (1) making use of the wider context when detecting the words in need of simplification and suggesting alternatives, and (2) taking previous simplification steps into account. We show that our system outputs lexical simplifications that are grammatically correct and semantically appropriate, and outperforms the current state-of-the-art systems in lexical simplification. D19-1491 D19-1491.Attachment.pdf @@ -5693,7 +5692,7 @@ SaraRosenthal KenBarker ZhichengLiang - 4863–4872 + 4864–4873 Electronic Health Records (EHRs) contain both structured content and unstructured (text) content about a patient’s medical history. In the unstructured text parts, there are common sections such as Assessment and Plan, Social History, and Medications. These sections help physicians find information easily and can be used by an information retrieval system to return specific information sought by a user. However, it is common that the exact format of sections in a particular EHR does not adhere to known patterns. Therefore, being able to predict sections and headers in EHRs automatically is beneficial to physicians. Prior approaches in EHR section prediction have only used text data from EHRs and have required significant manual annotation. 
We propose using sections from medical literature (e.g., textbooks, journals, web content) that contain content similar to that found in EHR sections. Our approach uses data from a different kind of source where labels are provided without the need of a time-consuming annotation effort. We use this data to train two models: an RNN and a BERT-based model. We apply the learned models along with source data via transfer learning to predict sections in EHRs. Our results show that medical literature can provide helpful supervision signal for this classification task. D19-1492 10.18653/v1/D19-1492 @@ -5707,7 +5706,7 @@ JianqiangHuang YongfengHuang XingXie - 4873–4882 + 4874–4883 News recommendation is important for online news platforms to help users find interested news and alleviate information overload. Existing news recommendation methods usually rely on the news click history to model user interest. However, these methods may suffer from the data sparsity problem, since the news click behaviors of many users in online news platforms are usually very limited. Fortunately, some other kinds of user behaviors such as webpage browsing and search queries can also provide useful clues of users’ news reading interest. In this paper, we propose a neural news recommendation approach which can exploit heterogeneous user behaviors. Our approach contains two major modules, i.e., news representation and user representation. In the news representation module, we learn representations of news from their titles via CNN networks, and apply attention networks to select important words. In the user representation module, we propose an attentive multi-view learning framework to learn unified representations of users from their heterogeneous behaviors such as search queries, clicked news and browsed webpages. In addition, we use word- and record-level attentions to select informative words and behavior records. Experiments on a real-world dataset validate the effectiveness of our approach. D19-1493 10.18653/v1/D19-1493 @@ -5720,7 +5719,7 @@ SuyuGe YongfengHuang XingXie - 4883–4892 + 4884–4893 User and item representation learning is critical for recommendation. Many of existing recommendation methods learn representations of users and items based on their ratings and reviews. However, the user-user and item-item relatedness are usually not considered in these methods, which may be insufficient. In this paper, we propose a neural recommendation approach which can utilize useful information from both review content and user-item graphs. Since reviews and graphs have different characteristics, we propose to use a multi-view learning framework to incorporate them as different views. In the review content-view, we propose to use a hierarchical model to first learn sentence representations from words, then learn review representations from sentences, and finally learn user/item representations from reviews. In addition, we propose to incorporate a three-level attention network into this view to select important words, sentences and reviews for learning informative user and item representations. In the graph-view, we propose a hierarchical graph neural network to jointly model the user-item, user-user and item-item relatedness by capturing the first- and second-order interactions between users and items in the user-item graph. In addition, we apply attention mechanism to model the importance of these interactions to learn informative user and item representations. 
Extensive experiments on four benchmark datasets validate the effectiveness of our approach. D19-1494 10.18653/v1/D19-1494 @@ -5732,7 +5731,7 @@ TingLiu ZhongyangLi JunwenDuan - 4893–4902 + 4894–4903 Prior work has proposed effective methods to learn event representations that can capture syntactic and semantic information over text corpora, demonstrating their effectiveness for downstream tasks such as script event prediction. On the other hand, events extracted from raw texts lack commonsense knowledge, such as the intents and emotions of the event participants, which are useful for distinguishing event pairs when there are only subtle differences in their surface realizations. To address this issue, this paper proposes to leverage external commonsense knowledge about the intent and sentiment of the event. Experiments on three event-related tasks, i.e., event similarity, script event prediction and stock market prediction, show that our model obtains much better event embeddings for the tasks, achieving 78% improvement on the hard similarity task, yielding more precise inferences on subsequent events under given contexts, and better accuracies in predicting the volatilities of the stock market. D19-1495 10.18653/v1/D19-1495 @@ -5743,7 +5742,7 @@ Jyun-YuJiang Kai-WeiChang WeiWang - 4903–4912 + 4904–4913 Adversarial attacks against machine learning models have threatened various real-world applications such as spam filtering and sentiment analysis. In this paper, we propose a novel framework, learning to discriminate perturbations (DISP), to identify and adjust malicious perturbations, thereby blocking adversarial attacks for text classification models. To identify adversarial attacks, a perturbation discriminator validates how likely a token in the text is perturbed and provides a set of potential perturbations. For each potential perturbation, an embedding estimator learns to restore the embedding of the original word based on the context, and a replacement token is chosen based on approximate kNN search. DISP can block adversarial attacks for any NLP model without modifying the model structure or training procedure. Extensive experiments on two benchmark datasets demonstrate that DISP significantly outperforms baseline methods in blocking adversarial attacks for text classification. In addition, in-depth analysis shows the robustness of DISP across different situations. D19-1496 10.18653/v1/D19-1496 @@ -5754,7 +5753,7 @@ Wayne XinZhao Eddy JingYin Ji-RongWen - 4913–4923 + 4914–4924 Citation count prediction (CCP) has been an important research task for automatically estimating the future impact of a scholarly paper. Previous studies mainly focus on extracting or mining useful features from the paper itself or the associated authors. An important kind of data signal, peer review text, has not been utilized for the CCP task. In this paper, we take the initiative to utilize peer review data for the CCP task with a neural prediction model. Our focus is to learn a comprehensive semantic representation of peer review text to improve the prediction performance. To achieve this goal, we incorporate the abstract-review match mechanism and the cross-review match mechanism to learn deep features from peer review text. We also consider integrating hand-crafted features via a wide component. The deep and wide components jointly make the prediction. Extensive experiments have demonstrated the usefulness of the peer review data and the effectiveness of the proposed model.
Our dataset has been released online. D19-1497 10.18653/v1/D19-1497 @@ -5764,7 +5763,7 @@ FeniaChristopoulou MakotoMiwa SophiaAnaniadou - 4924–4935 + 4925–4936 Document-level relation extraction is a complex human process that requires logical inference to extract relationships between named entities in text. Existing approaches use graph-based neural models with words as nodes and edges as relations between them, to encode relations across sentences. These models are node-based, i.e., they form pair representations based solely on the two target node representations. However, entity relations can be better expressed through unique edge representations formed as paths between nodes. We thus propose an edge-oriented graph neural model for document-level relation extraction. The model utilises different types of nodes and edges to create a document-level graph. An inference mechanism on the graph edges enables learning intra- and inter-sentence relations using multi-instance learning internally. Experiments on two document-level biomedical datasets for chemical-disease and gene-disease associations show the usefulness of the proposed edge-oriented approach. D19-1498 10.18653/v1/D19-1498 @@ -5778,7 +5777,7 @@ DongyanZhao ShumingShi RuiYan - 4936–4945 + 4937–4946 The text style transfer task requires the model to transfer a sentence of one style to another style while retaining its original content meaning, a challenging problem that has long suffered from the shortage of parallel data. In this paper, we first propose a semi-supervised text style transfer model that combines small-scale parallel data with large-scale nonparallel data. With these two types of training data, we introduce a projection function between the latent spaces of different styles and design two constraints to train it. We also introduce two other simple but effective semi-supervised methods for comparison. To evaluate the performance of the proposed methods, we build and release a novel style transfer dataset that alters sentences between the style of ancient Chinese poems and modern Chinese. D19-1499 10.18653/v1/D19-1499 @@ -5790,7 +5789,7 @@ ShomirWilson ThomasNorton NormanSadeh - 4946–4957 + 4947–4958 Privacy policies are long and complex documents that are difficult for users to read and understand. Yet, they have legal effects on how user data can be collected, managed and used. Ideally, we would like to empower users to inform themselves about the issues that matter to them, and enable them to selectively explore these issues. We present PrivacyQA, a corpus consisting of 1750 questions about the privacy policies of mobile applications, and over 3500 expert annotations of relevant answers. We observe that a strong neural baseline underperforms human performance by almost 0.3 F1 on PrivacyQA, suggesting considerable room for improvement for future systems. Further, we use this dataset to categorically identify challenges to question answerability, with domain-general implications for any question answering system. PrivacyQA offers a challenging corpus for question answering, with genuine real-world utility. D19-1500 10.18653/v1/D19-1500 @@ -5805,7 +5804,7 @@ KunGai DongyanZhao RuiYan - 4958–4967 + 4959–4968 Different from other text generation tasks, in product description generation, it is of vital importance to generate faithful descriptions that stick to the product attribute information. However, little attention has been paid to this problem.
To bridge this gap, we propose a model named Fidelity-oriented Product Description Generator (FPDG). FPDG takes the entity label of each word into account, since the product attribute information is always conveyed by entity words. Specifically, we first propose a Recurrent Neural Network (RNN) decoder based on the Entity-label-guided Long Short-Term Memory (ELSTM) cell, taking both the embedding and the entity label of each word as input. Second, we establish a keyword memory that stores the entity labels as keys and keywords as values, and FPDG attends to keywords through attending to their entity labels. Experiments conducted on a large-scale real-world product description dataset show that our model achieves state-of-the-art performance in terms of both traditional generation metrics and human evaluations. Specifically, FPDG increases the fidelity of the generated descriptions by 25%. D19-1501 10.18653/v1/D19-1501 @@ -5816,7 +5815,7 @@ LeiHou JuanziLi TiansiDong - 4968–4977 + 4969–4978 This paper addresses the problem of inferring the fine-grained type of an entity from a knowledge base. We convert this problem into the task of graph-based semi-supervised classification, and propose Hierarchical Multi Graph Convolutional Network (HMGCN), a novel Deep Learning architecture to tackle this problem. We construct three kinds of connectivity matrices to capture different kinds of semantic correlations between entities. A recursive regularization is proposed to model the subClassOf relations between types in a given type hierarchy. Extensive experiments with two large-scale public datasets show that our proposed method significantly outperforms four state-of-the-art methods. D19-1502 10.18653/v1/D19-1502 @@ -5828,7 +5827,7 @@ LinhTran GangLee IzhakShafran - 4978–4989 + 4979–4990 Recently we proposed the Span Attribute Tagging (SAT) Model to infer clinical entities (e.g., symptoms) and their properties (e.g., duration). It tackles the challenge of a large label space and limited training data using a hierarchical two-stage approach that identifies the span of interest in a tagging step and assigns labels to the span in a classification step. We extend the SAT model to jointly infer not only entities and their properties but also relations between them. Most relation extraction models restrict inferring relations to tokens within a few neighboring sentences, mainly to avoid high computational complexity. In contrast, our proposed Relation-SAT (R-SAT) model is computationally efficient and can infer relations over the entire conversation, spanning an average duration of 10 minutes. We evaluate our model on a corpus of clinical conversations. When the entities are given, R-SAT outperforms baselines in identifying relations between symptoms and their properties by about 32% (0.82 vs 0.62 F-score) and by about 50% (0.60 vs 0.41 F-score) on medications and their properties. On the more difficult task of jointly inferring entities and relations, the R-SAT model achieves a performance of 0.34 and 0.45 for symptoms and medications respectively, which is significantly better than 0.18 and 0.35 for the baseline model. The contributions of different components of the model are quantified using ablation analysis.
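As an aside for readers of the R-SAT entry above, the hierarchical two-stage tag-then-classify idea fits in a few lines of Python. This is a minimal sketch under my own naming; extract_spans, tag_then_classify, and the toy tagger/classifier are hypothetical stand-ins, not the authors' code:

# Minimal sketch of a two-stage span pipeline: stage 1 tags spans of
# interest, stage 2 assigns a label to each proposed span.
from typing import Callable, List, Tuple

def extract_spans(bio_tags: List[str]) -> List[Tuple[int, int]]:
    # Collect (start, end) spans from BIO tags produced by the tagging stage.
    spans, start = [], None
    for i, tag in enumerate(bio_tags):
        if tag == "B":
            if start is not None:
                spans.append((start, i))
            start = i
        elif tag == "O" and start is not None:
            spans.append((start, i))
            start = None
    if start is not None:
        spans.append((start, len(bio_tags)))
    return spans

def tag_then_classify(tokens: List[str],
                      tagger: Callable[[List[str]], List[str]],
                      classifier: Callable[[List[str]], str]):
    # Stage 1 proposes spans; stage 2 labels each span independently.
    return [(s, e, classifier(tokens[s:e]))
            for s, e in extract_spans(tagger(tokens))]

# Toy usage with stand-in models:
print(tag_then_classify(
    "cough for three days".split(),
    tagger=lambda toks: ["B", "O", "B", "I"],
    classifier=lambda span: "SYMPTOM" if span == ["cough"] else "PROPERTY"))

Keeping the two stages separate is what lets the tagging step stay small while the classification step handles the large label space, which is the trade-off the abstract describes.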
D19-1503 10.18653/v1/D19-1503 @@ -5839,7 +5838,7 @@ SungjunCho DavidBindel DavidMimno - 4990–5000 + 4991–5001 Despite great scalability on large data and their ability to understand correlations between topics, spectral topic models have not been widely used due to the absence of reliability in real data and lack of practical implementations. This paper aims to solidify the foundations of spectral topic inference and provide a practical implementation for anchor-based topic modeling. Beginning with vocabulary curation, we scrutinize every single inference step with other viable options. We also evaluate our matrix-based approach against popular alternatives including a tensor-based spectral method as well as probabilistic algorithms. Our quantitative and qualitative experiments demonstrate the power of Rectified Anchor Word algorithm in various real datasets, providing a complete guide to practical correlated topic modeling. D19-1504 D19-1504.Attachment.zip @@ -5852,7 +5851,7 @@ MichaelGamon Sujay KumarJauhar ChangTienLu - 5001–5010 + 5002–5011 Management of collaborative documents can be difficult, given the profusion of edits and comments that multiple authors make during a document’s evolution. Reliably modeling the relationship between edits and comments is a crucial step towards helping the user keep track of a document in flux. A number of authoring tasks, such as categorizing and summarizing edits, detecting completed to-dos, and visually rearranging comments could benefit from such a contribution. Thus, in this paper we explore the relationship between comments and edits by defining two novel, related tasks: Comment Ranking and Edit Anchoring. We begin by collecting a dataset with more than half a million comment-edit pairs based on Wikipedia revision histories. We then propose a hierarchical multi-layer deep neural-network to model the relationship between edits and comments. Our architecture tackles both Comment Ranking and Edit Anchoring tasks by encoding specific edit actions such as additions and deletions, while also accounting for document context. In a number of evaluation settings, our experimental results show that our approach outperforms several strong baselines significantly. We are able to achieve a precision@1 of 71.0% and a precision@3 of 94.4% for Comment Ranking, while we achieve 74.4% accuracy on Edit Anchoring. D19-1505 10.18653/v1/D19-1505 @@ -5862,7 +5861,7 @@ PrabhuKaliamoorthi SujithRavi ZornitsaKozareva - 5011–5020 + 5012–5021 Recently, there has been a great interest in the development of small and accurate neural networks that run entirely on devices such as mobile phones, smart watches and IoT. This enables user privacy, consistent user experience and low latency. Although a wide range of applications have been targeted from wake word detection to short text classification, yet there are no on-device networks for long text classification. We propose a novel projection attention neural network PRADO that combines trainable projections with attention and convolutions. We evaluate our approach on multiple large document text classification tasks. Our results show the effectiveness of the trainable projection model in finding semantically similar phrases and reaching high performance while maintaining compact size. Using this approach, we train tiny neural networks just 200 Kilobytes in size that improve over prior CNN and LSTM models and achieve near state of the art performance on multiple long document classification tasks. 
We also apply our model to transfer learning, showing its robustness and ability to further improve performance in limited-data scenarios. D19-1506 10.18653/v1/D19-1506 @@ -5870,7 +5869,7 @@ Subword Language Model for Query Auto-Completion GyuwanKim - 5021–5031 + 5022–5032 Current neural query auto-completion (QAC) systems rely on character-level language models, but they slow down when queries are long. We show how to utilize subword language models for the fast and accurate generation of query completion candidates. Representing queries with subwords shortens the decoding length significantly. To deal with the issues that come with introducing a subword language model, we develop a retrace algorithm and a reranking method by approximate marginalization. As a result, our model is up to 2.5 times faster while maintaining a similar quality of generated results compared to the character-level baseline. Also, we propose a new evaluation metric, mean recoverable length (MRL), measuring how many upcoming characters the model could complete correctly. It provides a more explicit meaning and eliminates the need for the prefix length sampling required by existing rank-based metrics. Moreover, we performed a comprehensive analysis with an ablation study to figure out the importance of each component. D19-1507 10.18653/v1/D19-1507 @@ -5883,7 +5882,7 @@ HuaixiaoTou ZhongyuWei TingChen - 5032–5041 + 5033–5042 Symptom diagnosis is a challenging yet profound problem in natural language processing. Most previous research focuses on investigating standard electronic medical records for symptom diagnosis, while the dialogues between doctors and patients, which contain richer information, are not well studied. In this paper, we first construct a dialogue symptom diagnosis dataset based on an online medical forum with a large number of dialogues between patients and doctors. Then, we provide some benchmark models on this dataset to boost the research of dialogue symptom diagnosis. In order to further enhance the performance of symptom diagnosis over dialogues, we propose a global attention mechanism to capture more symptom-related information, and build a symptom graph to model the associations between symptoms rather than treating each symptom independently. Experimental results show that both the global attention and the symptom graph are effective in boosting dialogue symptom diagnosis. In particular, our proposed model achieves state-of-the-art performance on the constructed dataset. D19-1508 10.18653/v1/D19-1508 @@ -5896,7 +5895,7 @@ ChandraBhagavatula ElizabethClark YejinChoi - 5042–5052 + 5043–5053 Counterfactual reasoning requires predicting how alternative events, contrary to what actually happened, might have resulted in different outcomes. Despite being considered a necessary component of AI-complete systems, few resources have been developed for evaluating counterfactual reasoning in narratives. In this paper, we propose Counterfactual Story Rewriting: given an original story and an intervening counterfactual event, the task is to minimally revise the story to make it compatible with the given counterfactual event. Solving this task will require deep understanding of causal narrative chains and counterfactual invariance, and integration of such story reasoning capabilities into conditional language generation models.
We present TIMETRAVEL, a new dataset of 29,849 counterfactual rewritings, each with the original story, a counterfactual event, and a human-generated revision of the original story compatible with the counterfactual event. Additionally, we include 81,407 counterfactual “branches” without a rewritten storyline to support future work on semi- or un-supervised approaches to counterfactual story rewriting. Finally, we evaluate the counterfactual rewriting capacities of several competitive baselines based on pretrained language models, and assess whether common overlap and model-based automatic metrics for text generation correlate well with human scores for counterfactual rewriting. D19-1509 D19-1509.Attachment.pdf @@ -5909,7 +5908,7 @@ SaschaRothe DaniilMirylenka AliakseiSeveryn - 5053–5064 + 5054–5065 We propose LaserTagger, a sequence tagging approach that casts text generation as a text editing task. Target texts are reconstructed from the inputs using three main edit operations: keeping a token, deleting it, and adding a phrase before the token. To predict the edit operations, we propose a novel model, which combines a BERT encoder with an autoregressive Transformer decoder. This approach is evaluated on English text on four tasks: sentence fusion, sentence splitting, abstractive summarization, and grammar correction. LaserTagger achieves new state-of-the-art results on three of these tasks, performs comparably to a set of strong seq2seq baselines with a large number of training examples, and outperforms them when the number of examples is limited. Furthermore, we show that at inference time tagging can be more than two orders of magnitude faster than comparable seq2seq models, making it more attractive for running in a live environment. D19-1510 D19-1510.Attachment.zip @@ -5921,7 +5920,7 @@ ShiFeng DalingWang YifeiZhang - 5065–5075 + 5066–5076 Generating intriguing questions is a key step towards building human-like open-domain chatbots. Although some recent works have focused on this task, compared with questions raised by humans, significant gaps remain in maintaining semantic coherence with the post, which may result in generating dull or deviated questions. We observe that the answer has strong semantic coherence to its question and post, which can be used to guide question generation. Thus, we devise two methods to further enhance semantic coherence between post and question under the guidance of the answer. First, the coherence score between the generated question and the answer is used as the reward function in a reinforcement learning framework, to encourage the cases that are semantically consistent with the answer. Second, we incorporate adversarial training to explicitly control question generation in the direction of question-answer coherence. Extensive experiments show that our two methods outperform state-of-the-art baseline algorithms by large margins in raising semantically coherent questions. D19-1511 10.18653/v1/D19-1511 @@ -5932,7 +5931,7 @@ CanXu WeiWu ZhoujunLi - 5076–5088 + 5077–5089 Automatic news comment generation is beneficial for real applications but has not attracted enough attention from the research community. In this paper, we propose a “read-attend-comment” procedure for news comment generation and formalize the procedure with a reading network and a generation network. The reading network comprehends a news article and distills some important points from it, then the generation network creates a comment by attending to the extracted discrete points and the news title.
We optimize the model in an end-to-end manner by maximizing a variational lower bound of the true objective using the back-propagation algorithm. Experimental results on two public datasets indicate that our model can significantly outperform existing methods in terms of both automatic evaluation and human judgment. D19-1512 10.18653/v1/D19-1512 @@ -5942,7 +5941,7 @@ HongyinTang MiaoLi BeihongJin - 5089–5098 + 5090–5099 Text generation is among the most fundamental tasks in natural language processing. In this paper, we propose a text generation model that learns semantics and structural features simultaneously. This model captures structural features with a sequential variational autoencoder component and leverages a topic modeling component based on a Gaussian distribution to enhance the recognition of text semantics. To make the reconstructed text more coherent with the topics, the model further adapts the encoder of the topic modeling component for use as a discriminator. The results of experiments over several datasets demonstrate that our model outperforms several state-of-the-art models in terms of text perplexity and topic coherence. Moreover, the latent representations learned by our model are superior to others in a text classification task. Finally, given the input texts, our model can generate meaningful texts that share similar structures but address different topics. D19-1513 D19-1513.Attachment.zip @@ -5952,7 +5951,7 @@ <fixed-case>LXMERT</fixed-case>: Learning Cross-Modality Encoder Representations from Transformers HaoTan MohitBansal - 5099–5110 + 5100–5111 Vision-and-language reasoning requires an understanding of visual concepts, language semantics, and, most importantly, the alignment and relationships between these two modalities. We thus propose the LXMERT (Learning Cross-Modality Encoder Representations from Transformers) framework to learn these vision-and-language connections. In LXMERT, we build a large-scale Transformer model that consists of three encoders: an object relationship encoder, a language encoder, and a cross-modality encoder. Next, to endow our model with the capability of connecting vision and language semantics, we pre-train the model with large amounts of image-and-sentence pairs, via five diverse representative pre-training tasks: masked language modeling, masked object prediction (feature regression and label classification), cross-modality matching, and image question answering. These tasks help in learning both intra-modality and cross-modality relationships. After fine-tuning from our pre-trained parameters, our model achieves state-of-the-art results on two visual question answering datasets (i.e., VQA and GQA). We also show the generalizability of our pre-trained cross-modality model by adapting it to a challenging visual-reasoning task, NLVR2, and improve the previous best result by 22% absolute (54% to 76%). Lastly, we demonstrate detailed ablation studies to prove that both our novel model components and pre-training strategies significantly contribute to our strong results. Code and pre-trained models publicly available at: https://github.com/airsplay/lxmert D19-1514 10.18653/v1/D19-1514 @@ -5961,7 +5960,7 @@ Phrase Grounding by Soft-Label Chain Conditional Random Field JiachengLiu JuliaHockenmaier - 5111–5121 + 5112–5122 The phrase grounding task aims to ground each entity mention in a given caption of an image to a corresponding region in that image.
Although there are clear dependencies between how different mentions of the same caption should be grounded, previous structured prediction methods that aim to capture such dependencies need to resort to approximate inference or non-differentiable losses. In this paper, we formulate phrase grounding as a sequence labeling task where we treat candidate regions as potential labels, and use neural chain Conditional Random Fields (CRFs) to model dependencies among regions for adjacent mentions. In contrast to standard sequence labeling tasks, the phrase grounding task is defined such that there may be multiple correct candidate regions. To address this multiplicity of gold labels, we define so-called Soft-Label Chain CRFs, and present an algorithm that enables convenient end-to-end training. Our method establishes a new state-of-the-art on phrase grounding on the Flickr30k Entities dataset. Analysis shows that our model benefits both from the entity dependencies captured by the CRF and from the soft-label training regime. Our code is available at github.com/liujch1998/SoftLabelCCRF D19-1515 @@ -5975,7 +5974,7 @@ YangqiuSong YanSong ChangshuiZhang - 5122–5131 + 5123–5132 Grounding a pronoun to a visual object it refers to requires complex reasoning from various information sources, especially in conversational scenarios. For example, when people in a conversation talk about something all speakers can see, they often directly use pronouns (e.g., it) to refer to it without previous introduction. This poses a huge challenge for modern natural language understanding systems, particularly conventional context-based pronoun coreference models. To tackle this challenge, in this paper, we formally define the task of visual-aware pronoun coreference resolution (PCR) and introduce VisPro, a large-scale dialogue PCR dataset, to investigate whether and how the visual information can help resolve pronouns in dialogues. We then propose a novel visual-aware PCR model, VisCoref, for this task and conduct comprehensive experiments and case studies on our dataset. Results demonstrate the importance of the visual information in this PCR case and show the effectiveness of the proposed model. D19-1516 10.18653/v1/D19-1516 @@ -5986,7 +5985,7 @@ YongchengWang ShizheChen QinJin - 5132–5142 + 5133–5143 Multimodal semantic comprehension, such as visual question answering and caption generation, has recently attracted increasing research interest. However, due to data limitations, fine-grained semantic comprehension, which requires capturing the semantic details of multimodal content, has not been well investigated. In this work, we introduce “YouMakeup”, a large-scale multimodal instructional video dataset to support fine-grained semantic comprehension research in a specific domain. YouMakeup contains 2,800 videos from YouTube, spanning more than 420 hours in total. Each video is annotated with a sequence of natural language descriptions for instructional steps, grounded in temporal video range and spatial facial areas. The annotated steps in a video involve subtle differences in actions, products and regions, which require fine-grained understanding and reasoning both temporally and spatially. In order to evaluate models’ ability for fine-grained comprehension, we further propose two groups of tasks, including generation tasks and visual question answering, from different aspects. We also establish a baseline of step caption generation for future comparison. The dataset will be publicly available at https://github.com/AIM3-RUC/YouMakeup to support research investigation in fine-grained semantic comprehension. D19-1517 10.18653/v1/D19-1517 @@ -5998,7 +5997,7 @@ ChilieTan XiaolinLi JunXiao - 5143–5152 + 5144–5153 In this paper, we focus on natural language video localization: localizing (i.e., grounding) a natural language description in a long and untrimmed video sequence. All currently published models for addressing this problem can be categorized into two types: (i) top-down approach: it does classification and regression for a set of pre-cut video segment candidates; (ii) bottom-up approach: it directly predicts probabilities for each video frame as the temporal boundaries (i.e., start and end time points). However, both approaches suffer from several limitations: the former is computation-intensive for densely placed candidates, while the latter has trailed the performance of the top-down counterpart thus far. To this end, we propose a novel dense bottom-up framework: DEnse Bottom-Up Grounding (DEBUG). DEBUG regards all frames falling in the ground truth segment as foreground, and each foreground frame regresses the unique distances from its location to bi-directional ground truth boundaries. Extensive experiments on three challenging benchmarks (TACoS, Charades-STA, and ActivityNet Captions) show that DEBUG is able to match the speed of bottom-up models while surpassing the performance of the state-of-the-art top-down models. D19-1518 10.18653/v1/D19-1518 @@ -6011,7 +6010,7 @@ LihaoLu JiachengLiu JiaweiHan - 5153–5162 + 5154–5163 Everyone makes mistakes. So do human annotators when curating labels for named entity recognition (NER). Such label mistakes might hurt model training and interfere with model comparison. In this study, we dive deep into one of the widely-adopted NER benchmark datasets, CoNLL03 NER. We are able to identify label mistakes in about 5.38% of test sentences, which is a significant ratio considering that the state-of-the-art test F1 score is already around 93%. Therefore, we manually correct these label mistakes and form a cleaner test set. Our re-evaluation of popular models on this corrected test set leads to more accurate assessments, compared to those on the original test set. More importantly, we propose a simple yet effective framework, CrossWeigh, to handle label mistakes during NER model training. Specifically, it partitions the training data into several folds and trains independent NER models to identify potential mistakes in each fold. Then it adjusts the weights of the training data accordingly to train the final NER model. Extensive experiments demonstrate significant improvements from plugging various NER models into our proposed framework on three datasets. All implementations and the corrected test set are available at our Github repo https://github.com/ZihanWangKi/CrossWeigh. D19-1519 10.18653/v1/D19-1519 @@ -6023,7 +6022,7 @@ ZaidSheikh GrahamNeubig JaimeCarbonell - 5163–5173 + 5164–5174 Most state-of-the-art models for named entity recognition (NER) rely on the availability of large amounts of labeled data, making them challenging to extend to new, lower-resourced languages. However, there are now many proposed solutions to this problem involving either cross-lingual transfer learning, which learns from other highly resourced languages, or active learning, which efficiently selects effective training data based on model predictions.
In this paper, we ask the question: given this recent progress, and some amount of human annotation, what is the most effective method for efficiently creating high-quality entity recognizers in under-resourced languages? Based on extensive experimentation using both simulated and real human annotation, we settle on a recipe of starting with a cross-lingual transferred model, then performing targeted annotation of only uncertain entity spans in the target language, minimizing annotator effort. Results demonstrate that cross-lingual transfer is a powerful tool when very little data can be annotated, but an entity-targeted annotation strategy can achieve competitive accuracy quickly, with just one-tenth of training data. D19-1520 D19-1520.Attachment.pdf @@ -6036,7 +6035,7 @@ ChenyanXiong DanielCampos ArnoldOverwijk - 5174–5183 + 5175–5184 This paper studies keyphrase extraction in real-world scenarios where documents are from diverse domains and have variant content quality. We curate and release OpenKP, a large scale open domain keyphrase extraction dataset with near one hundred thousand web documents and expert keyphrase annotations. To handle the variations of domain and content quality, we develop BLING-KPE, a neural keyphrase extraction model that goes beyond language understanding using visual presentations of documents and weak supervision from search queries. Experimental results on OpenKP confirm the effectiveness of BLING-KPE and the contributions of its neural architecture, visual features, and search log weak supervision. Zero-shot evaluations on DUC-2001 demonstrate the improved generalization ability of learning from the open domain data compared to a specific domain. D19-1521 D19-1521.Attachment.zip @@ -6047,7 +6046,7 @@ IvanaBalazevic CarlAllen TimothyHospedales - 5184–5193 + 5185–5194 Knowledge graphs are structured representations of real world facts. However, they typically contain only a small subset of all possible facts. Link prediction is a task of inferring missing facts based on existing ones. We propose TuckER, a relatively straightforward but powerful linear model based on Tucker decomposition of the binary tensor representation of knowledge graph triples. TuckER outperforms previous state-of-the-art models across standard link prediction datasets, acting as a strong baseline for more elaborate models. We show that TuckER is a fully expressive model, derive sufficient bounds on its embedding dimensionalities and demonstrate that several previously introduced linear models can be viewed as special cases of TuckER. D19-1522 D19-1522.Attachment.zip @@ -6057,7 +6056,7 @@ Human-grounded Evaluations of Explanation Methods for Text Classification PiyawatLertvittayakumjorn FrancescaToni - 5194–5204 + 5195–5205 Due to the black-box nature of deep learning models, methods for explaining the models’ results are crucial to gain trust from humans and support collaboration between AIs and humans. In this paper, we consider several model-agnostic and model-specific explanation methods for CNNs for text classification and conduct three human-grounded evaluations, focusing on different purposes of explanations: (1) revealing model behavior, (2) justifying model predictions, and (3) helping humans investigate uncertain predictions. The results highlight dissimilar qualities of the various explanation methods we consider and show the degree to which these methods could serve for each purpose. 
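Since the TuckER entry above pins down its scoring function precisely (a Tucker decomposition of the binary triple tensor), a small sketch may help readers place it. Dimensions and variable names here are arbitrary illustrations of the published formulation, not the released implementation:

# Sketch of TuckER-style scoring: the score of a triple (s, r, o) is the
# shared core tensor W contracted with subject, relation, and object
# embeddings along its three modes.
import torch

de, dr = 200, 30                 # entity / relation embedding sizes (arbitrary)
W = torch.randn(de, dr, de)      # shared core tensor of the Tucker decomposition
e_s = torch.randn(de)            # subject entity embedding
w_r = torch.randn(dr)            # relation embedding
e_o = torch.randn(de)            # candidate object entity embedding

score = torch.einsum('i,ijk,j,k->', e_s, W, w_r, e_o)
prob = torch.sigmoid(score)      # probability that the triple is a true fact

Because the core tensor is shared across all relations, relation-specific knowledge is carried by the small w_r vectors, which is what makes the model a strong yet relatively compact baseline.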
D19-1523 D19-1523.Attachment.zip @@ -6070,7 +6069,7 @@ ChongFeng AnqingZheng XiaopengLiu - 5205–5214 + 5206–5215 We introduce a new task of modeling the role and function for on-line resource citations in scientific literature. By categorizing the on-line resources and analyzing the purpose of resource citations in scientific texts, it can greatly help resource search and recommendation systems to better understand and manage the scientific resources. For this novel task, we are the first to create an annotation scheme, which models the different granularity of information from a hierarchical perspective. And we construct a dataset SciRes, which includes 3,088 manually annotated resource contexts. In this paper, we propose a possible solution by using a multi-task framework to build the scientific resource classifier (SciResCLF) for jointly recognizing the role and function types. Then we use the classification results to help a scientific resource recommendation (SciResREC) task. Experiments show that our model achieves the best results on both the classification task and the recommendation task. The SciRes dataset is released for future research. D19-1524 D19-1524.Attachment.pdf @@ -6082,7 +6081,7 @@ ShehzeenHussain ShlomoDubnov FarinazKoushanfar - 5215–5224 + 5216–5225 In this work, we develop methods to repurpose text classification neural networks for alternate tasks without modifying the network architecture or parameters. We propose a context based vocabulary remapping method that performs a computationally inexpensive input transformation to reprogram a victim classification model for a new set of sequences. We propose algorithms for training such an input transformation in both white box and black box settings where the adversary may or may not have access to the victim model’s architecture and parameters. We demonstrate the application of our model and the vulnerability of neural networks by adversarially repurposing various text-classification models including LSTM, bi-directional LSTM and CNN for alternate classification tasks. D19-1525 D19-1525.Attachment.pdf @@ -6094,7 +6093,7 @@ QinliangSu DinghanShen ChangyouChen - 5225–5234 + 5226–5235 Hashing is promising for large-scale information retrieval tasks thanks to the efficiency of distance evaluation between binary codes. Generative hashing is often used to generate hashing codes in an unsupervised way. However, existing generative hashing methods only considered the use of simple priors, like Gaussian and Bernoulli priors, which limits these methods to further improve their performance. In this paper, two mixture-prior generative models are proposed, under the objective to produce high-quality hashing codes for documents. Specifically, a Gaussian mixture prior is first imposed onto the variational auto-encoder (VAE), followed by a separate step to cast the continuous latent representation of VAE into binary code. To avoid the performance loss caused by the separate casting, a model using a Bernoulli mixture prior is further developed, in which an end-to-end training is admitted by resorting to the straight-through (ST) discrete gradient estimator. Experimental results on several benchmark datasets demonstrate that the proposed methods, especially the one using Bernoulli mixture priors, consistently outperform existing ones by a substantial margin. 
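The straight-through (ST) gradient estimator mentioned at the end of the hashing entry above is compact enough to show in full. A minimal sketch, with my own variable names rather than the paper's code:

# Straight-through estimator: the forward pass emits hard {0, 1} hashing
# codes, while the backward pass treats the thresholding as the identity
# so gradients can flow to the encoder that produced the probabilities.
import torch

def st_binarize(p: torch.Tensor) -> torch.Tensor:
    hard = (p > 0.5).float()
    return p + (hard - p).detach()

logits = torch.randn(8, requires_grad=True)
codes = st_binarize(torch.sigmoid(logits))  # binary code usable downstream
codes.sum().backward()
print(logits.grad)  # non-zero: the hard threshold did not block the gradient

This is the trick that lets the Bernoulli-mixture variant train end-to-end instead of requiring a separate casting step after training.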
D19-1526 10.18653/v1/D19-1526 @@ -6105,7 +6104,7 @@ ZhixinZhou ZhaozhuoXu PingLi - 5235–5245 + 5236–5246 Retrieval of relevant vectors produced by representation learning critically influences the efficiency in natural language processing (NLP) tasks. In this paper, we demonstrate an efficient method for searching vectors via a typical non-metric matching function: inner product. Our method, which constructs an approximate Inner Product Delaunay Graph (IPDG) for top-1 Maximum Inner Product Search (MIPS), transforms retrieving the most suitable latent vectors into a graph search problem with great benefits of efficiency. Experiments on data representations learned for different machine learning tasks verify the outperforming effectiveness and efficiency of the proposed IPDG. D19-1527 10.18653/v1/D19-1527 @@ -6120,7 +6119,7 @@ YangqiuSong WilfredNg DongYu - 5246–5255 + 5247–5256 Conventional word embeddings represent words with fixed vectors, which are usually trained based on co-occurrence patterns among words. In doing so, however, the power of such representations is limited, where the same word might be functionalized separately under different syntactic relations. To address this limitation, one solution is to incorporate relational dependencies of different words into their embeddings. Therefore, in this paper, we propose a multiplex word embedding model, which can be easily extended according to various relations among words. As a result, each word has a center embedding to represent its overall semantics, and several relational embeddings to represent its relational dependencies. Compared to existing models, our model can effectively distinguish words with respect to different relations without introducing unnecessary sparseness. Moreover, to accommodate various relations, we use a small dimension for relational embeddings and our model is able to keep their effectiveness. Experiments on selectional preference acquisition and word similarity demonstrate the effectiveness of the proposed model, and a further study of scalability also proves that our embeddings only need 1/20 of the original embedding size to achieve better performance. D19-1528 10.18653/v1/D19-1528 @@ -6130,7 +6129,7 @@ YukunMa Patrick H.Chen Cho-JuiHsieh - 5256–5265 + 5257–5266 It is challenging to deploy deep neural nets on memory-constrained devices due to the explosion of numbers of parameters. Especially, the input embedding layer and Softmax layer usually dominate the memory usage in an RNN-based language model. For example, input embedding and Softmax matrices in IWSLT-2014 German-to-English data set account for more than 80% of the total model parameters. To compress these embedding layers, we propose MulCode, a novel multi-way multiplicative neural compressor. MulCode learns an adaptively created matrix and its multiplicative compositions. Together with a prior weighted loss, Multicode is more effective than the state-of-the-art compression methods. On the IWSLT-2014 machine translation data set, MulCode achieved 17 times compression rate for the embedding and Softmax matrices, and when combined with quantization technique, our method can achieve 41.38 times compression rate with very little loss in performance. D19-1529 10.18653/v1/D19-1529 @@ -6141,7 +6140,7 @@ HilaGonen RyanCotterell SimoneTeufel - 5266–5274 + 5267–5275 This paper treats gender bias latent in word embeddings. Previous mitigation attempts rely on the operationalisation of gender bias as a projection over a linear subspace. 
An alternative approach is Counterfactual Data Augmentation (CDA), in which a corpus is duplicated and augmented to remove bias, e.g. by swapping all inherently-gendered words in the copy. We perform an empirical comparison of these approaches on the English Gigaword and Wikipedia, and find that whilst both successfully reduce direct bias and perform well in tasks which quantify embedding quality, CDA variants outperform projection-based methods at the task of drawing non-biased gender analogies by an average of 19% across both corpora. We propose two improvements to CDA: Counterfactual Data Substitution (CDS), a variant of CDA in which potentially biased text is randomly substituted to avoid duplication, and the Names Intervention, a novel name-pairing technique that vastly increases the number of words being treated. CDA/S with the Names Intervention is the only approach which is able to mitigate indirect gender bias: following debiasing, previously biased words are significantly less clustered according to gender (cluster purity is reduced by 49%), thus improving on the state-of-the-art for bias mitigation. D19-1530 D19-1530.Attachment.zip @@ -6156,7 +6155,7 @@ MuhaoChen RyanCotterell Kai-WeiChang - 5275–5283 + 5276–5284 Recent studies have shown that word embeddings exhibit gender bias inherited from the training corpora. However, most studies to date have focused on quantifying and mitigating such bias only in English. These analyses cannot be directly extended to languages that exhibit morphological agreement on gender, such as Spanish and French. In this paper, we propose new metrics for evaluating gender bias in word embeddings of these languages and further demonstrate evidence of gender bias in bilingual embeddings which align these languages with English. Finally, we extend an existing approach to mitigate gender bias in word embedding of these languages under both monolingual and bilingual settings. Experiments on modified Word Embedding Association Test, word similarity, word translation, and word pair translation tasks show that the proposed approaches can effectively reduce the gender bias while preserving the utility of the original embeddings. D19-1531 10.18653/v1/D19-1531 @@ -6165,7 +6164,7 @@ Weakly Supervised Cross-lingual Semantic Relation Classification via Knowledge Distillation YogarshiVyas MarineCarpuat - 5284–5295 + 5285–5296 Words in different languages rarely cover the exact same semantic space. This work characterizes differences in meaning between words across languages using semantic relations that have been used to relate the meaning of English words. However, because of translation ambiguity, semantic relations are not always preserved by translation. We introduce a cross-lingual relation classifier trained only with English examples and a bilingual dictionary. Our classifier relies on a novel attention-based distillation approach to account for translation ambiguity when transferring knowledge from English to cross-lingual settings. On new English-Chinese and English-Hindi test sets, the resulting models largely outperform baselines that more naively rely on bilingual embeddings or dictionaries for cross-lingual transfer, and approach the performance of fully supervised systems on English tasks. 
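As a toy illustration of the Counterfactual Data Augmentation idea in the bias-mitigation entry above: duplicate the corpus with inherently-gendered words swapped. The pair list below is a tiny stand-in for the curated lists such papers use, and it sidesteps the her/his ambiguity that motivates their more careful interventions:

# Toy Counterfactual Data Augmentation: augment a corpus with copies in
# which gendered words are swapped (naive word list; real systems use
# curated pairs and handle ambiguous forms like 'her' -> 'his'/'him').
GENDER_PAIRS = {
    "he": "she", "she": "he",
    "man": "woman", "woman": "man",
    "father": "mother", "mother": "father",
}

def swap_gender(sentence: str) -> str:
    return " ".join(GENDER_PAIRS.get(tok, tok) for tok in sentence.split())

corpus = ["he is a doctor", "the woman thanked the man"]
augmented = corpus + [swap_gender(s) for s in corpus]
print(augmented[2:])  # ['she is a doctor', 'the man thanked the woman']

Counterfactual Data Substitution, described above, differs mainly in substituting rather than duplicating, to avoid the statistical artifacts of a doubled corpus.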
D19-1532 D19-1532.Attachment.pdf @@ -6176,7 +6175,7 @@ ChristianHadiwinoto Hwee TouNg Wee ChungGan - 5296–5305 + 5297–5306 Contextualized word representations are able to give different representations for the same word in different contexts, and they have been shown to be effective in downstream natural language processing tasks, such as question answering, named entity recognition, and sentiment analysis. However, evaluation on word sense disambiguation (WSD) in prior work shows that using contextualized word representations does not outperform the state-of-the-art approach that makes use of non-contextualized word embeddings. In this paper, we explore different strategies of integrating pre-trained contextualized word representations and our best strategy achieves accuracies exceeding the best prior published accuracies by significant margins on multiple benchmark WSD datasets. D19-1533 10.18653/v1/D19-1533 @@ -6188,7 +6187,7 @@ SujianLi SameerSingh MattGardner - 5306–5314 + 5307–5315 The ability to understand and work with numbers (numeracy) is critical for many complex reasoning tasks. Currently, most NLP models treat numbers in text in the same way as other tokens—they embed them as distributed vectors. Is this enough to capture numeracy? We begin by investigating the numerical reasoning capabilities of a state-of-the-art question answering model on the DROP dataset. We find this model excels on questions that require numerical reasoning, i.e., it already captures numeracy. To understand how this capability emerges, we probe token embedding methods (e.g., BERT, GloVe) on synthetic list maximum, number decoding, and addition tasks. A surprising degree of numeracy is naturally present in standard embeddings. For example, GloVe and word2vec accurately encode magnitude for numbers up to 1,000. Furthermore, character-level embeddings are even more precise—ELMo captures numeracy the best for all pre-trained methods—but BERT, which uses sub-word units, is less exact. D19-1534 D19-1534.Attachment.zip @@ -6203,7 +6202,7 @@ LeiFang BinZhou DongmeiZhang - 5315–5325 + 5316–5326 Context-dependent semantic parsing has proven to be an important yet challenging task. To leverage the advances in context-independent semantic parsing, we propose to perform follow-up query analysis, aiming to restate context-dependent natural language queries with contextual information. To accomplish the task, we propose STAR, a novel approach with a well-designed two-phase process. It is parser-independent and able to handle multifarious follow-up scenarios in different domains. Experiments on the FollowUp dataset show that STAR outperforms the state-of-the-art baseline by a large margin of nearly 8%. The superiority on parsing results verifies the feasibility of follow-up query analysis. We also explore the extensibility of STAR on the SQA dataset, which is very promising. D19-1535 10.18653/v1/D19-1535 @@ -6212,7 +6211,7 @@ <fixed-case>T</fixed-case>ext2<fixed-case>M</fixed-case>ath: End-to-end Parsing Text into Math Expressions YanyanZou WeiLu - 5326–5336 + 5327–5337 We propose Text2Math, a model for semantically parsing text into math expressions. The model can be used to solve different math related problems including arithmetic word problems and equation parsing problems. 
Unlike previous approaches, we tackle the problem from an end-to-end structured prediction perspective: our algorithm predicts the complete math expression at once as a tree structure, with minimal manual effort involved in the process. Empirical results on benchmark datasets demonstrate the efficacy of our approach. D19-1536 D19-1536.Attachment.zip @@ -6230,7 +6229,7 @@ CaimingXiong RichardSocher DragomirRadev - 5337–5348 + 5338–5349 We focus on the cross-domain context-dependent text-to-SQL generation task. Based on the observation that adjacent natural language questions are often linguistically dependent and their corresponding SQL queries tend to overlap, we utilize the interaction history by editing the previously predicted query to improve the generation quality. Our editing mechanism views SQL as sequences and reuses generation results at the token level in a simple manner. It is flexible enough to change individual tokens and robust to error propagation. Furthermore, to deal with complex table structures in different domains, we employ an utterance-table encoder and a table-aware decoder to incorporate the context of the user utterance and the table schema. We evaluate our approach on the SParC dataset and demonstrate the benefit of editing compared with state-of-the-art baselines which generate SQL from scratch. Our code is available at https://github.com/ryanzhumich/sparc_atis_pytorch. D19-1537 10.18653/v1/D19-1537 @@ -6240,7 +6239,7 @@ ShexiaHe ZuchaoLi HaiZhao - 5349–5358 + 5350–5359 Recently, semantic role labeling (SRL) has seen a series of successes with ever higher performance improvements, which can be mainly attributed to syntactic integration and enhanced word representation. However, most of these efforts focus on English, while SRL for languages other than English has received relatively little attention and remains underdeveloped. This paper therefore aims to fill that gap for multilingual SRL, with a special focus on the impact of syntax and contextualized word representation. Unlike existing work, we propose a novel method guided by syntactic rules to prune arguments, which enables us to integrate syntax into a multilingual SRL model simply and effectively. We present a unified SRL model designed for multiple languages together with the proposed uniform syntax enhancement. Our model achieves new state-of-the-art results on the CoNLL-2009 benchmarks for all seven languages. Besides, we discuss the role of syntax across the different languages and verify the effectiveness of deep enhanced representation for multilingual SRL. D19-1538 10.18653/v1/D19-1538 @@ -6252,7 +6251,7 @@ YinhanLiu LukeZettlemoyer MichaelAuli - 5359–5368 + 5360–5369 We present a new approach for pretraining a bi-directional transformer model that provides significant performance gains across a variety of language understanding problems. Our model solves a cloze-style word reconstruction task, where each word is ablated and must be predicted given the rest of the text. Experiments demonstrate large performance gains on GLUE and new state-of-the-art results on NER as well as constituency parsing benchmarks, consistent with BERT. We also present a detailed analysis of a number of factors that contribute to effective pretraining, including data domain and size, model capacity, and variations on the cloze objective.
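The cloze-style pretraining entry above ablates each word in turn and predicts it from the rest of the text; generating such training pairs is nearly a one-liner, sketched here. This only illustrates the data side and is much simplified relative to the paper's bi-directional transformer:

# Toy cloze-objective data generation: each word is ablated in turn and
# must be predicted from the remaining text.
MASK = "<mask>"

def cloze_examples(tokens):
    for i, target in enumerate(tokens):
        yield tokens[:i] + [MASK] + tokens[i + 1:], target

for context, target in cloze_examples("the cat sat on the mat".split()):
    print(" ".join(context), "->", target)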
D19-1539 10.18653/v1/D19-1539 @@ -6265,7 +6264,7 @@ WeiYang PengShi JimmyLin - 5369–5380 + 5370–5381 A core problem of information retrieval (IR) is relevance matching, which is to rank documents by relevance to a user’s query. On the other hand, many NLP problems, such as question answering and paraphrase identification, can be considered variants of semantic matching, which is to measure the semantic distance between two pieces of short texts. While at a high level both relevance and semantic matching require modeling textual similarity, many existing techniques for one cannot be easily adapted to the other. To bridge this gap, we propose a novel model, HCAN (Hybrid Co-Attention Network), that comprises (1) a hybrid encoder module that includes ConvNet-based and LSTM-based encoders, (2) a relevance matching module that measures soft term matches with importance weighting at multiple granularities, and (3) a semantic matching module with co-attention mechanisms that capture context-aware semantic relatedness. Evaluations on multiple IR and NLP benchmarks demonstrate state-of-the-art effectiveness compared to approaches that do not exploit pretraining on external data. Extensive ablation studies suggest that relevance and semantic matching signals are complementary across many problem settings, regardless of the choice of underlying encoders. D19-1540 10.18653/v1/D19-1540 @@ -6275,7 +6274,7 @@ QingrongXia ZhenghuaLi MinZhang - 5381–5391 + 5382–5392 Semantic role labeling (SRL) aims to identify the predicate-argument structure of a sentence. Inspired by the strong correlation between syntax and semantics, previous works pay much attention to improving SRL performance by exploiting syntactic knowledge, achieving significant results. Pipeline methods based on automatic syntactic trees and multi-task learning (MTL) approaches using standard syntactic trees are two common research orientations. In this paper, we adopt a simple unified span-based model for both span-based and word-based Chinese SRL as a strong baseline. Besides, we present an MTL framework that includes the basic SRL module and a dependency parser module. Different from the commonly used hard parameter sharing strategy in MTL, the main idea is to extract implicit syntactic representations from the dependency parser as external inputs for the basic SRL model. Experiments on the benchmarks of Chinese Proposition Bank 1.0 and CoNLL-2009 Chinese datasets show that our proposed framework can effectively improve the performance over the strong baselines. With the external BERT representations, our framework achieves new state-of-the-art 87.54 and 88.5 F1 scores on the test sets of the two benchmarks, respectively. In-depth analyses are conducted to gain more insights on the proposed framework and the effectiveness of syntax. D19-1541 10.18653/v1/D19-1541 @@ -6284,7 +6283,7 @@ Transfer Fine-Tuning: A <fixed-case>BERT</fixed-case> Case Study YukiArase Jun’ichiTsujii - 5392–5403 + 5393–5404 A semantic equivalence assessment is defined as a task that assesses semantic equivalence in a sentence pair by binary judgment (i.e., paraphrase identification) or grading (i.e., semantic textual similarity measurement). It constitutes a set of tasks crucial for research on natural language understanding. Recently, BERT realized a breakthrough in sentence representation learning (Devlin et al., 2019), which is broadly transferable to various NLP tasks.
While BERT’s performance improves by increasing its model size, the required computational power is an obstacle preventing practical applications from adopting the technology. Herein, we propose to inject phrasal paraphrase relations into BERT in order to generate suitable representations for semantic equivalence assessment instead of increasing the model size. Experiments on standard natural language understanding tasks confirm that our method effectively improves a smaller BERT model while maintaining the model size. The generated model exhibits superior performance compared to a larger BERT model on semantic equivalence assessment tasks. Furthermore, it achieves larger performance gains on tasks with limited training datasets for fine-tuning, which is a property desirable for transfer learning. D19-1542 10.18653/v1/D19-1542 @@ -6296,7 +6295,7 @@ HongzhiLiu Jian-GuangLou DongmeiZhang - 5404–5413 + 5405–5414 On text-to-SQL generation, the input utterance usually contains lots of tokens that are related to column names or cells in the table, called table-related tokens. These table-related tokens are troublesome for the downstream neural semantic parser because they bring complex semantics and hinder sharing across the training examples. However, existing approaches either ignore handling these tokens before the semantic parser or simply use deterministic approaches based on string-match or word embedding similarity. In this work, we propose a more efficient approach to handle table-related tokens before the semantic parser. First, we formulate it as a sequential tagging problem and propose a two-stage anonymization model to learn the semantic relationship between tables and input utterances. Then, we leverage the implicit supervision from SQL queries by policy gradient to guide the training. Experiments demonstrate that our approach consistently improves performances of different neural semantic parsers and significantly outperforms deterministic approaches. D19-1543 10.18653/v1/D19-1543 @@ -6306,7 +6305,7 @@ XinchiChen ChunchuanLyu IvanTitov - 5414–5424 + 5415–5425 Semantic role labeling (SRL) involves extracting propositions (i.e. predicates and their typed arguments) from natural language sentences. State-of-the-art SRL models rely on powerful encoders (e.g., LSTMs) and do not model non-local interaction between arguments. We propose a new approach to modeling these interactions while maintaining efficient inference. Specifically, we use Capsule Networks (Sabour et al., 2017): each proposition is encoded as a tuple of capsules, one capsule per argument type (i.e. role). These tuples serve as embeddings of entire propositions. In every network layer, the capsules interact with each other and with representations of words in the sentence. Each iteration results in updated proposition embeddings and updated predictions about the SRL structure. Our model substantially outperforms the non-refinement baseline model on all 7 CoNLL-2009 languages and achieves state-of-the-art results on 5 languages (including English) for dependency SRL. We analyze the types of mistakes corrected by the refinement procedure. For example, each role is typically (but not always) filled with at most one argument. Whereas enforcing this approximate constraint is not useful with a modern SRL system, the iterative procedure corrects the mistakes by capturing this intuition in a flexible and context-sensitive way.
D19-1544 10.18653/v1/D19-1544 @@ -6316,7 +6315,7 @@ SrinivasanIyer AlvinCheung LukeZettlemoyer - 5425–5434 + 5426–5435 Programmers typically organize executable source code using high-level coding patterns or idiomatic structures such as nested loops, exception handlers and recursive blocks, rather than as individual code tokens. In contrast, state of the art (SOTA) semantic parsers still map natural language instructions to source code by building the code syntax tree one node at a time. In this paper, we introduce an iterative method to extract code idioms from large source code corpora by repeatedly collapsing most-frequent depth-2 subtrees of their syntax trees, and train semantic parsers to apply these idioms during decoding. Applying idiom-based decoding on a recent context-dependent semantic parsing task improves the SOTA by 2.2% BLEU score while reducing training time by more than 50%. This improved speed enables us to scale up the model by training on an extended training set that is 5× larger, to further move up the SOTA by an additional 2.3% BLEU and 0.9% exact match. Finally, idioms also significantly improve accuracy of semantic parsing to SQL on the ATIS-SQL dataset, when training data is limited. D19-1545 D19-1545.Attachment.zip @@ -6327,7 +6326,7 @@ RajasAgashe SrinivasanIyer LukeZettlemoyer - 5435–5445 + 5436–5446 Interactive programming with interleaved code snippet cells and natural language markdown is recently gaining popularity in the form of Jupyter notebooks, which accelerate prototyping and collaboration. To study code generation conditioned on a long context history, we present JuICe, a corpus of 1.5 million examples with a curated test set of 3.7K instances based on online programming assignments. Compared with existing contextual code generation datasets, JuICe provides refined human-curated data, open-domain code, and an order of magnitude more training data. Using JuICe, we train models for two tasks: (1) generation of the API call sequence in a code cell, and (2) full code cell generation, both conditioned on the NL-Code history up to a particular code cell. Experiments using current baseline code generation models show that both context and distant supervision aid in generation, and that the dataset is challenging for current systems. D19-1546 10.18653/v1/D19-1546 @@ -6338,7 +6337,7 @@ YuSu HuanSun Wen-tauYih - 5446–5457 + 5447–5458 As a promising paradigm, interactive semantic parsing has been shown to improve both semantic parsing accuracy and user confidence in the results. In this paper, we propose a new, unified formulation of the interactive semantic parsing problem, where the goal is to design a model-based intelligent agent. The agent maintains its own state as the current predicted semantic parse, decides whether and where human intervention is needed, and generates a clarification question in natural language. A key part of the agent is a world model: it takes a percept (either an initial question or subsequent feedback from the user) and transitions to a new state. We then propose a simple yet remarkably effective instantiation of our framework, demonstrated on two text-to-SQL datasets (WikiSQL and Spider) with different state-of-the-art base semantic parsers. Compared to an existing interactive semantic parsing approach that treats the base parser as a black box, our approach solicits less user feedback but yields higher run-time accuracy.
D19-1547 D19-1547.Attachment.pdf @@ -6352,7 +6351,7 @@ LonghuaQian MinZhang GuodongZhou - 5458–5467 + 5459–5468 Recent studies on AMR-to-text generation often formalize the task as a sequence-to-sequence (seq2seq) learning problem by converting an Abstract Meaning Representation (AMR) graph into a word sequence. Graph structures are further modeled into the seq2seq framework in order to utilize the structural information in the AMR graphs. However, previous approaches only consider the relations between directly connected concepts while ignoring the rich structure in AMR graphs. In this paper, we eliminate such a strong limitation and propose a novel structure-aware self-attention approach to better model the relations between indirectly connected concepts in the state-of-the-art seq2seq model, i.e. the Transformer. In particular, a few different methods are explored to learn structural representations between two concepts. Experimental results on English AMR benchmark datasets show that our approach significantly outperforms the state-of-the-art with 29.66 and 31.82 BLEU scores on LDC2015E86 and LDC2017T10, respectively. To the best of our knowledge, these are the best results achieved so far by supervised models on the benchmarks. D19-1548 10.18653/v1/D19-1548 @@ -6361,7 +6360,7 @@ Syntax-Aware Aspect Level Sentiment Classification with Graph Attention Networks BinxuanHuang KathleenCarley - 5468–5476 + 5469–5477 Aspect level sentiment classification aims to identify the sentiment expressed towards an aspect given a context sentence. Previous neural network based methods largely ignore the syntax structure in one sentence. In this paper, we propose a novel target-dependent graph attention network (TD-GAT) for aspect level sentiment classification, which explicitly utilizes the dependency relationship among words. Using the dependency graph, it propagates sentiment features directly from the syntactic context of an aspect target. In our experiments, we show our method outperforms multiple baselines with GloVe embeddings. We also demonstrate that using BERT representations further substantially boosts the performance. D19-1549 10.18653/v1/D19-1549 @@ -6370,7 +6369,7 @@ Learning Explicit and Implicit Structures for Targeted Sentiment Analysis HaoLi WeiLu - 5477–5487 + 5478–5488 Targeted sentiment analysis is the task of jointly predicting target entities and their associated sentiment information. Existing research efforts mostly regard this joint task as a sequence labeling problem, building models that can capture explicit structures in the output space. However, the importance of capturing implicit global structural information that resides in the input space is largely unexplored. In this work, we argue that both types of information (implicit and explicit structural information) are crucial for building a successful targeted sentiment analysis model. Our experimental results show that properly capturing both types of information leads to better performance than competitive existing approaches. We also conduct extensive experiments to investigate our model’s effectiveness and robustness. D19-1550 D19-1550.Attachment.pdf @@ -6385,7 +6384,7 @@ JianxinLiao TongXu MingLiu - 5488–5497 + 5489–5498 Aspect-level sentiment classification is a crucial task for sentiment analysis, which aims to identify the sentiment polarities of specific targets in their context.
The main challenge comes from multi-aspect sentences, which express multiple sentiment polarities towards different targets, resulting in overlapped feature representation. However, most existing neural models tend to utilize static pooling operations or attention mechanisms to identify sentimental words, which are therefore insufficient for dealing with overlapped features. To solve this problem, we propose to utilize a capsule network to construct vector-based feature representations and cluster features by an EM routing algorithm. Furthermore, an interactive attention mechanism is introduced in the capsule routing procedure to model the semantic relationship between aspect terms and context. The iterative routing also enables encoding the sentence from a global perspective. Experimental results on three datasets show that our proposed model achieves state-of-the-art performance. D19-1551 10.18653/v1/D19-1551 @@ -6397,7 +6396,7 @@ ShoushanLi GuodongZhou MinZhang - 5498–5506 + 5499–5507 There has been a recent line of work on automatically predicting the emotions of posts in social media. Existing approaches consider the posts individually and predict their emotions independently. Different from previous research, we explore the dependence among relevant posts via the authors’ backgrounds, since authors with similar backgrounds, e.g., gender, location, tend to express similar emotions. However, such personal attributes are not easy to obtain in most social media websites, and it is hard to capture attributes-aware words to connect similar people. Accordingly, we propose a Neural Personal Discrimination (NPD) approach to address the above challenges by determining personal attributes from posts, and connecting relevant posts with similar attributes to jointly learn their emotions. In particular, we employ adversarial discriminators to determine the personal attributes, with attention mechanisms to aggregate attributes-aware words. In this way, social correlations among different posts can be better addressed. Experimental results show the usefulness of personal attributes, and the effectiveness of our proposed NPD approach in capturing such personal attributes with significant gains over the state-of-the-art models. D19-1552 D19-1552.Attachment.zip @@ -6411,7 +6410,7 @@ JunXie QiSu XuSun - 5507–5516 + 5508–5517 The task of unsupervised sentiment modification aims to reverse the sentiment polarity of the input text while preserving its semantic content without any parallel data. Most previous work follows a two-step process: these methods first separate the content from the original sentiment, and then directly generate text with the target sentiment only based on the content produced by the first step. However, the second step bears both the target sentiment addition and content reconstruction, thus resulting in a lack of specific information like proper nouns in the generated text. To remedy this, we propose a specificity-driven cascading approach in this work, which can effectively increase the specificity of the generated text and further improve content preservation. In addition, we propose a more reasonable metric to evaluate sentiment modification. The experiments show that our approach outperforms competitive baselines by a large margin, which achieves 11% and 38% relative improvements of the overall metric on the Yelp and Amazon datasets, respectively.
D19-1553 10.18653/v1/D19-1553 @@ -6424,7 +6423,7 @@ QiZeng YunLiang XuSun - 5517–5526 + 5518–5527 Recent work has shown that current text classification models are fragile and sensitive to simple perturbations. In this work, we propose a novel adversarial training approach, LexicalAT, to improve the robustness of current classification models. The proposed approach consists of a generator and a classifier. The generator learns to generate examples to attack the classifier while the classifier learns to defend against these attacks. Considering the diversity of attacks, the generator uses a large-scale lexical knowledge base, WordNet, to generate attacking examples by replacing some words in training examples with their synonyms (e.g., sad and unhappy), neighbor words (e.g., fox and wolf), or super-superior words (e.g., chair and armchair). Due to the discrete generation step in the generator, we use policy gradient, a reinforcement learning approach, to train the two modules. Experiments show LexicalAT outperforms strong baselines and reduces test errors on various neural networks, including CNN, RNN, and BERT. D19-1554 10.18653/v1/D19-1554 @@ -6433,7 +6432,7 @@ Leveraging Structural and Semantic Correspondence for Attribute-Oriented Aspect Sentiment Discovery ZheZhang MunindarSingh - 5527–5537 + 5528–5538 Opinionated text often involves attributes such as authorship and location that influence the sentiments expressed for different aspects. We posit that structural and semantic correspondence is both prevalent in opinionated text, especially when associated with attributes, and crucial in accurately revealing its latent aspect and sentiment structure. However, it is not recognized by existing approaches. We propose Trait, an unsupervised probabilistic model that discovers aspects and sentiments from text and associates them with different attributes. To this end, Trait infers and leverages structural and semantic correspondence using a Markov Random Field. We show empirically that by incorporating attributes explicitly, Trait significantly outperforms state-of-the-art baselines both by generating attribute profiles that accord with our intuitions, as shown via visualization, and yielding topics of greater semantic cohesion. D19-1555 10.18653/v1/D19-1555 @@ -6445,7 +6444,7 @@ Florenced’Alché-Buc SlimEssid ChloéClavel - 5538–5547 + 5539–5548 The task of predicting fine grained user opinion based on spontaneous spoken language is a key problem arising in the development of Computational Agents as well as in the development of social network based opinion miners. Unfortunately, gathering reliable data on which a model can be trained is notoriously difficult and existing works rely only on coarsely labeled opinions. In this work we aim at bridging the gap separating fine grained opinion models already developed for written language and coarse grained models developed for spontaneous multimodal opinion mining. We take advantage of the implicit hierarchical structure of opinions to build a joint fine and coarse grained opinion model that exploits different views of the opinion expression. The resulting model shares some properties with attention-based models and is shown to provide competitive results on a recently released multimodal fine grained annotated corpus.
D19-1556 D19-1556.Attachment.pdf @@ -6456,7 +6455,7 @@ PrathushaK Sarma YingyuLiang WilliamSethares - 5548–5557 + 5549–5558 This paper proposes a way to improve the performance of existing algorithms for text classification in domains with strong language semantics. A proposed domain adaptation layer learns weights to combine a generic and a domain specific (DS) word embedding into a domain adapted (DA) embedding. The DA word embeddings are then used as inputs to a generic encoder + classifier framework to perform a downstream task such as classification. This adaptation layer is particularly suited to data sets that are modest in size, and which are, therefore, not ideal candidates for (re)training a deep neural network architecture. Results on binary and multi-class classification tasks using popular encoder architectures, including current state-of-the-art methods (with and without the shallow adaptation layer), show the effectiveness of the proposed approach. D19-1557 D19-1557.Attachment.zip @@ -6470,7 +6469,7 @@ HongleiGuo RenhongCheng ZhongSu - 5558–5567 + 5559–5568 Cross-domain sentiment classification has drawn much attention in recent years. Most existing approaches focus on learning domain-invariant representations in both the source and target domains, while few of them pay attention to the domain-specific information. Despite the non-transferability of the domain-specific information, simultaneously learning domain-dependent representations can facilitate the learning of domain-invariant representations. In this paper, we focus on aspect-level cross-domain sentiment classification, and propose to distill the domain-invariant sentiment features with the help of an orthogonal domain-dependent task, i.e. aspect detection, which is built on the aspects varying widely in different domains. We conduct extensive experiments on three public datasets and the experimental results demonstrate the effectiveness of our method. D19-1558 10.18653/v1/D19-1558 @@ -6483,7 +6482,7 @@ JinanXu YufengChen JieZhou - 5568–5579 + 5569–5580 Aspect based sentiment analysis (ABSA) aims to identify the sentiment polarity towards the given aspect in a sentence, while previous models typically exploit an aspect-independent (weakly associative) encoder for sentence representation generation. In this paper, we propose a novel Aspect-Guided Deep Transition model, named AGDT, which utilizes the given aspect to guide the sentence encoding from scratch with the specially-designed deep transition architecture. Furthermore, an aspect-oriented objective is designed to enforce AGDT to reconstruct the given aspect with the generated sentence representation. In doing so, our AGDT can accurately generate aspect-specific sentence representation, and thus conduct more accurate sentiment predictions. Experimental results on multiple SemEval datasets demonstrate the effectiveness of our proposed approach, which significantly outperforms the best reported results with the same setting. D19-1559 10.18653/v1/D19-1559 @@ -6498,7 +6497,7 @@ MinZhang XiaozhongLiu GuodongZhou - 5580–5589 + 5581–5590 Recently, neural networks have shown promising results on Document-level Aspect Sentiment Classification (DASC). However, these approaches often offer little transparency w.r.t. their inner working mechanisms and lack interpretability. In this paper, to simulate the steps by which human beings analyze aspect sentiment in a document, we propose a new Hierarchical Reinforcement Learning (HRL) approach to DASC.
This approach incorporates clause selection and word selection strategies to tackle the data noise problem in the task of DASC. First, a high-level policy is proposed to select aspect-relevant clauses and discard noisy clauses. Then, a low-level policy is proposed to select sentiment-relevant words and discard noisy words inside the selected clauses. Finally, a sentiment rating predictor is designed to provide reward signals to guide both clause and word selection. Experimental results demonstrate the impressive effectiveness of the proposed approach to DASC over the state-of-the-art baselines. D19-1560 10.18653/v1/D19-1560 @@ -6516,7 +6515,7 @@ MichalJacovi RanitAharonov NoamSlonim - 5590–5600 + 5591–5601 In Natural Language Understanding, the task of response generation is usually focused on responses to short texts, such as tweets or a turn in a dialog. Here we present a novel task of producing a critical response to a long argumentative text, and suggest a method based on general rebuttal arguments to address it. We do this in the context of the recently-suggested task of listening comprehension over argumentative content: given a speech on some specified topic, and a list of relevant arguments, the goal is to determine which of the arguments appear in the speech. The general rebuttals we describe here (in English) overcome the need for topic-specific arguments to be provided, by proving to be applicable for a large set of topics. This allows creating responses beyond the scope of topics for which specific arguments are available. All data collected during this work is freely available for research. D19-1561 D19-1561.Attachment.zip @@ -6525,7 +6524,7 @@ Rethinking Attribute Representation and Injection for Sentiment Classification Reinald KimAmplayo - 5601–5612 + 5602–5613 Text attributes, such as user and product information in product reviews, have been used to improve the performance of sentiment classification models. The de facto standard method is to incorporate them as additional biases in the attention mechanism, and more performance gains are achieved by extending the model architecture. In this paper, we show that the above method is the least effective way to represent and inject attributes. To demonstrate this hypothesis, unlike previous models with complicated architectures, we limit our base model to a simple BiLSTM with attention classifier, and instead focus on how and where the attributes should be incorporated in the model. We propose to represent attributes as chunk-wise importance weight matrices and consider four locations in the model (i.e., embedding, encoding, attention, classifier) to inject attributes. Experiments show that our proposed method achieves significant improvements over the standard approach and that attention mechanism is the worst location to inject attributes, contradicting prior work. We also outperform the state-of-the-art despite our use of a simple base model. Finally, we show that these representations transfer well to other tasks. Model implementation and datasets are released here: https://github.com/rktamplayo/CHIM. D19-1562 10.18653/v1/D19-1562 @@ -6540,7 +6539,7 @@ MinYang RuifengXu RuibinMao - 5613–5623 + 5614–5624 Emotion cause analysis, which aims to identify the reasons behind emotions, is a key topic in sentiment analysis. 
A variety of neural network models have been proposed recently; however, these previous models mostly focus on the learning architecture with local textual information, ignoring the discourse and prior knowledge, which play crucial roles in human text comprehension. In this paper, we propose a new method to extract emotion causes with a hierarchical neural model and knowledge-based regularizations, which aims to incorporate discourse context information and restrain the parameters by sentiment lexicon and common knowledge. The experimental results demonstrate that our proposed method achieves the state-of-the-art performance on two public datasets in different languages (Chinese and English), outperforming a number of competitive baselines by at least 2.08% in F-measure. D19-1563 10.18653/v1/D19-1563 @@ -6556,7 +6555,7 @@ MichalJacovi RanitAharonov NoamSlonim - 5624–5634 + 5625–5635 We explore the task of automatic assessment of argument quality. To that end, we actively collected 6.3k arguments, more than a factor of five compared to previously examined data. Each argument was explicitly and carefully annotated for its quality. In addition, 14k pairs of arguments were annotated independently, identifying the higher quality argument in each pair. In spite of the inherent subjective nature of the task, both annotation schemes led to surprisingly consistent results. We release the labeled datasets to the community. Furthermore, we suggest neural methods based on a recently released language model, for argument ranking as well as for argument-pair classification. In the former task, our results are comparable to state-of-the-art; in the latter task our results significantly outperform earlier methods. D19-1564 D19-1564.Attachment.zip @@ -6569,7 +6568,7 @@ AlbertoBarrón-Cedeño RostislavPetrov PreslavNakov - 5635–5645 + 5636–5646 Propaganda aims at influencing people’s mindset with the purpose of advancing a specific agenda. Previous work has addressed propaganda detection at document level, typically labelling all articles from a propagandistic news outlet as propaganda. Such noisy gold labels inevitably affect the quality of any learning system trained on them. A further issue with most existing systems is the lack of explainability. To overcome these limitations, we propose a novel task: performing fine-grained analysis of texts by detecting all fragments that contain propaganda techniques as well as their type. In particular, we create a corpus of news articles manually annotated at fragment level with eighteen propaganda techniques and propose a suitable evaluation measure. We further design a novel multi-granularity neural network, and we show that it outperforms several strong BERT-based baselines. D19-1565 D19-1565.Attachment.zip @@ -6581,7 +6580,7 @@ Md ShadAkhtar AsifEkbal PushpakBhattacharyya - 5646–5656 + 5647–5657 In recent times, multi-modal analysis has been an emerging and highly sought-after field at the intersection of natural language processing, computer vision, and speech processing. The prime objective of such studies is to leverage the diversified information (e.g., textual, acoustic and visual) for learning a model. The effective interaction among these modalities often leads to a better system in terms of performance. In this paper, we introduce a recurrent neural network based approach for multi-modal sentiment and emotion analysis. The proposed model learns the inter-modal interaction among the participating modalities through an auto-encoder mechanism.
We employ a context-aware attention module to exploit the correspondence among the neighboring utterances. We evaluate our proposed approach on five standard multi-modal affect analysis datasets. Experimental results suggest the efficacy of the proposed model for both sentiment and emotion analysis over various existing state-of-the-art systems. D19-1566 10.18653/v1/D19-1566 @@ -6590,7 +6589,7 @@ Sequential Learning of Convolutional Features for Effective Text Classification AvinashMadasu VijjiniAnvesh Rao - 5657–5666 + 5658–5667 Text classification has been one of the major problems in natural language processing. With the advent of deep learning, the convolutional neural network (CNN) has been a popular solution to this task. However, CNNs, which were first proposed for images, face many crucial challenges in the context of text processing, namely in their elementary blocks: convolution filters and max pooling. These challenges have largely been overlooked by most existing CNN models proposed for text classification. In this paper, we present an experimental study on the fundamental blocks of CNNs in text categorization. Based on this critique, we propose Sequential Convolutional Attentive Recurrent Network (SCARN). The proposed SCARN model utilizes both the advantages of recurrent and convolutional structures efficiently in comparison to previously proposed recurrent convolutional models. We test our model on different text classification datasets across tasks like sentiment analysis and question classification. Extensive experiments establish that SCARN outperforms other recurrent convolutional architectures with significantly fewer parameters. Furthermore, SCARN achieves better performance compared to various equally large deep CNN and LSTM architectures. D19-1567 10.18653/v1/D19-1567 @@ -6600,7 +6599,7 @@ EsinDurmus FaisalLadhak ClaireCardie - 5667–5677 + 5668–5678 Research in the social sciences and psychology has shown that the persuasiveness of an argument depends not only on the language employed, but also on attributes of the source/communicator, the audience, and the appropriateness and strength of the argument’s claims given the pragmatic and discourse context of the argument. Among these characteristics of persuasive arguments, prior work in NLP does not explicitly investigate the effect of the pragmatic and discourse context when determining argument quality. This paper presents a new dataset to initiate the study of this aspect of argumentation: it consists of a diverse collection of arguments covering 741 controversial topics and comprising over 47,000 claims. We further propose predictive models that incorporate the pragmatic and discourse context of argumentative claims and show that they outperform models that rely only on claim-specific linguistic features for predicting the perceived impact of individual claims within a particular line of argument. D19-1568 10.18653/v1/D19-1568 @@ -6612,7 +6611,7 @@ SamuelMensah YongyiMao XudongLiu - 5678–5687 + 5679–5688 We propose a method based on neural networks to identify the sentiment polarity of opinion words expressed on a specific aspect of a sentence. Although a large majority of works typically focus on leveraging the expressive power of neural networks in handling this task, we explore the possibility of integrating dependency trees with neural networks for representation learning.
To this end, we present a convolution over a dependency tree (CDT) model which exploits a Bi-directional Long Short Term Memory (Bi-LSTM) to learn representations for features of a sentence, and further enhance the embeddings with a graph convolutional network (GCN) which operates directly on the dependency tree of the sentence. Our approach propagates both contextual and dependency information from opinion words to aspect words, offering discriminative properties for supervision. Experimental results rank our approach as the new state-of-the-art in aspect-based sentiment classification. D19-1569 10.18653/v1/D19-1569 @@ -6624,7 +6623,7 @@ GuopingHuang ConghuiZhu TiejunZhao - 5688–5694 + 5689–5695 Many Data Augmentation (DA) methods have been proposed for neural machine translation. Existing works measure the superiority of DA methods in terms of their performance on a specific test set, but we find that some DA methods do not exhibit consistent improvements across translation tasks. Based on this observation, this paper makes an initial attempt to answer a fundamental question: what benefits, which are consistent across different methods and tasks, does DA in general obtain? Inspired by recent theoretic advances in deep learning, the paper understands DA from two perspectives towards the generalization ability of a model: input sensitivity and prediction margin, which are defined independently of a specific test set and thereby may lead to findings with relatively low variance. Extensive experiments show that relatively consistent benefits across five DA methods and four translation tasks are achieved regarding both perspectives. D19-1570 D19-1570.Attachment.pdf @@ -6635,7 +6634,7 @@ KyraYee YannDauphin MichaelAuli - 5695–5700 + 5696–5701 Previous work on neural noisy channel modeling relied on latent variable models that incrementally process the source and target sentence. This makes decoding decisions based on partial source prefixes even though the full source is available. We pursue an alternative approach based on standard sequence to sequence models which utilize the entire source. These models perform remarkably well as channel models, even though they have neither been trained on, nor designed to factor over incomplete target sentences. Experiments with neural language models trained on billions of words show that noisy channel models can outperform a direct model by up to 3.2 BLEU on WMT’17 German-English translation. We evaluate on four language pairs and our channel models consistently outperform strong alternatives such as right-to-left reranking models and ensembles of direct models. D19-1571 D19-1571.Attachment.pdf @@ -6649,7 +6648,7 @@ MarcinKadras SylvainGugger JeremyHoward - 5701–5706 + 5702–5707 Pretrained language models are promising particularly for low-resource languages as they only require unlabelled data. However, training existing models requires huge amounts of compute, while pretrained cross-lingual models often underperform on low-resource languages. We propose Multi-lingual language model Fine-Tuning (MultiFiT) to enable practitioners to train and fine-tune language models efficiently in their own language. In addition, we propose a zero-shot method using an existing pretrained cross-lingual model. We evaluate our methods on two widely used cross-lingual classification datasets where they outperform models pretrained on orders of magnitude more data and compute. We release all models and code.
D19-1572 D19-1572.Attachment.zip @@ -6664,7 +6663,7 @@ TaoQin LiweiWang Tie-YanLiu - 5707–5712 + 5708–5713 Due to the unparallelizable nature of the autoregressive factorization, AutoRegressive Translation (ART) models have to generate tokens sequentially during decoding and thus suffer from high inference latency. Non-AutoRegressive Translation (NART) models were proposed to reduce the inference time, but could only achieve inferior translation accuracy. In this paper, we propose a novel approach to leveraging the hints from hidden states and word alignments to help the training of NART models. The results achieve significant improvement over previous NART models for the WMT14 En-De and De-En datasets and are even comparable to a strong LSTM-based ART baseline but one order of magnitude faster in inference. D19-1573 D19-1573.Attachment.zip @@ -6675,7 +6674,7 @@ AdamFisch JiangGuo ReginaBarzilay - 5713–5719 + 5714–5720 This paper explores the task of leveraging typology in the context of cross-lingual dependency parsing. While this linguistic information has shown great promise in pre-neural parsing, results for neural architectures have been mixed. The aim of our investigation is to better understand this state of the art. Our main findings are as follows: 1) The benefit of typological information is derived from coarsely grouping languages into syntactically-homogeneous clusters rather than from learning to leverage variations along individual typological dimensions in a compositional manner; 2) Typology consistent with the actual corpus statistics yields better transfer performance; 3) Typological similarity is only a rough proxy of cross-lingual transferability with respect to parsing. D19-1574 D19-1574.Attachment.zip @@ -6688,7 +6687,7 @@ JiangGuo YijiaLiu TingLiu - 5720–5726 + 5721–5727 This paper investigates the problem of learning cross-lingual representations in a contextual space. We propose Cross-Lingual BERT Transformation (CLBT), a simple and efficient approach to generate cross-lingual contextualized word embeddings based on publicly available pre-trained BERT models (Devlin et al., 2018). In this approach, a linear transformation is learned from contextual word alignments to align the contextualized embeddings independently trained in different languages. We demonstrate the effectiveness of this approach on zero-shot cross-lingual transfer parsing. Experiments show that our embeddings substantially outperform the previous state-of-the-art that uses static embeddings. We further compare our approach with XLM (Lample and Conneau, 2019), a recently proposed cross-lingual language model trained with massive parallel data, and achieve highly competitive results. D19-1575 D19-1575.Attachment.pdf @@ -6700,7 +6699,7 @@ GeWang YongJiang KeweiTu - 5727–5732 + 5728–5733 The key to multilingual grammar induction is to couple grammar parameters of different languages together by exploiting the similarity between languages. Previous work relies on linguistic phylogenetic knowledge to specify similarity between languages. In this work, we propose a novel universal grammar induction approach that represents language identities with continuous vectors and employs a neural network to predict grammar parameters based on the representation. Without any prior linguistic phylogenetic knowledge, we automatically capture similarity between languages with the vector representations and softly tie the grammar parameters of different languages.
In our experiments, we apply our approach to 15 languages across 8 language families and subfamilies in the Universal Dependency Treebank dataset, and we observe substantial performance gain on average over monolingual and multilingual baselines. D19-1576 10.18653/v1/D19-1576 @@ -6712,7 +6711,7 @@ LawrenceWolf-Sonkin HannaWallach RyanCotterell - 5733–5738 + 5734–5739 Many of the world’s languages employ grammatical gender on the lexeme. For instance, in Spanish, house “casa” is feminine, whereas the word for paper “papel” is masculine. To a speaker of a genderless language, this categorization seems to exist with neither rhyme nor reason. But, is the association of nouns to gender classes truly arbitrary? In this work, we present the first large-scale investigation of the arbitrariness of gender assignment that uses canonical correlation analysis as a method for correlating the gender of inanimate nouns with their lexical semantic meaning. We find that the gender systems of 18 languages exhibit a significant correlation with an externally grounded definition of lexical semantics. D19-1577 10.18653/v1/D19-1577 @@ -6722,7 +6721,7 @@ VinodkumarPrabhakaran BenHutchinson MargaretMitchell - 5739–5744 + 5740–5745 Data-driven statistical Natural Language Processing (NLP) techniques leverage large amounts of language data to build models that can understand language. However, most language data reflect the public discourse at the time the data was produced, and hence NLP models are susceptible to learning incidental associations around named referents at a particular point in time, in addition to general linguistic meaning. An NLP system designed to model notions such as sentiment and toxicity should ideally produce scores that are independent of the identity of such entities mentioned in text and their social associations. For example, in a general purpose sentiment analysis system, a phrase such as I hate Katy Perry should be interpreted as having the same sentiment as I hate Taylor Swift. Based on this idea, we propose a generic evaluation framework, Perturbation Sensitivity Analysis, which detects unintended model biases related to named entities, and requires no new annotations or corpora. We demonstrate the utility of this analysis by employing it on two different NLP models — a sentiment model and a toxicity model — applied on online comments in English language from four different genres. D19-1578 10.18653/v1/D19-1578 @@ -6731,7 +6730,7 @@ Automatically Inferring Gender Associations from Language SerinaChang KathyMcKeown - 5745–5751 + 5746–5752 In this paper, we pose the question: do people talk about women and men in different ways? We introduce two datasets and a novel integration of approaches for automatically inferring gender associations from language, discovering coherent word clusters, and labeling the clusters for the semantic concepts they represent. The datasets allow us to compare how people write about women and men in two different settings – one set draws from celebrity news and the other from student reviews of computer science professors. We demonstrate that there are large-scale differences in the ways that people talk about women and men and that these differences vary across domains. Human evaluations show that our methods significantly outperform strong baselines. 
D19-1579 D19-1579.Attachment.pdf @@ -6750,7 +6749,7 @@ ArinehMirinjian XiangRen MortezaDehghani - 5752–5756 + 5753–5757 Official reports of hate crimes in the US are under-reported relative to the actual number of such incidents. Further, despite statistical approximations, there are no official reports from a large number of US cities regarding incidents of hate. Here, we first demonstrate that event extraction and multi-instance learning, applied to a corpus of local news articles, can be used to predict instances of hate crime. We then use the trained model to detect incidents of hate in cities for which the FBI lacks statistics. Lastly, we train models on predicting homicide and kidnapping, compare the predictions to FBI reports, and establish that incidents of hate are indeed under-reported, compared to other types of crimes, in local press. D19-1580 10.18653/v1/D19-1580 @@ -6760,7 +6759,7 @@ JunSaito YugoMurawaki SadaoKurohashi - 5757–5764 + 5758–5765 Recognizing affective events that trigger positive or negative sentiment has a wide range of natural language processing applications but remains a challenging problem mainly because the polarity of an event is not necessarily predictable from its constituent words. In this paper, we propose to propagate affective polarity using discourse relations. Our method is simple and only requires a very small seed lexicon and a large raw corpus. Our experiments using Japanese data show that our method learns affective events effectively without manually labeled data. It also improves supervised learning results when labeled data are small. D19-1581 10.18653/v1/D19-1581 @@ -6772,7 +6771,7 @@ XiangbinMeng JiafengGuo XueqiCheng - 5765–5769 + 5766–5770 Syntactic relations are broadly used in many NLP tasks. For event detection, syntactic relation representations based on dependency tree can better capture the interrelations between candidate trigger words and related entities than sentence representations. But, existing studies only use first-order syntactic relations (i.e., the arcs) in dependency trees to identify trigger words. For this reason, this paper proposes a new method for event detection, which uses a dependency tree based graph convolution network with aggregative attention to explicitly model and aggregate multi-order syntactic representations in sentences. Experimental comparison with state-of-the-art baselines shows the superiority of the proposed method. D19-1582 10.18653/v1/D19-1582 @@ -6783,7 +6782,7 @@ NitishaJain ParamitaMirza GerhardWeikum - 5770–5775 + 5771–5776 Scalar implicatures are language features that imply the negation of stronger statements, e.g., “She was married twice” typically implicates that she was not married thrice. In this paper we discuss the importance of scalar implicatures in the context of textual information extraction. We investigate how textual features can be used to predict whether a given text segment mentions all objects standing in a certain relationship with a certain subject. Preliminary results on Wikipedia indicate that this prediction is feasible, and yields informative assessments. D19-1583 10.18653/v1/D19-1583 @@ -6799,7 +6798,7 @@ MaosongSun JieZhou XiangRen - 5776–5782 + 5777–5783 Existing event extraction methods classify each argument role independently, ignoring the conceptual correlations between different argument roles. 
In this paper, we propose a Hierarchical Modular Event Argument Extraction (HMEAE) model, to provide effective inductive bias from the concept hierarchy of event argument roles. Specifically, we design a neural module network for each basic unit of the concept hierarchy, and then hierarchically compose relevant unit modules with logical operations into a role-oriented modular network to classify a specific argument role. As many argument roles share the same high-level unit module, their correlation can be utilized to extract specific event arguments better. Experiments on real-world datasets show that HMEAE can effectively leverage useful knowledge from the concept hierarchy and significantly outperform the state-of-the-art baselines. The source code can be obtained from https://github.com/thunlp/HMEAE. D19-1584 D19-1584.Attachment.zip @@ -6811,7 +6810,7 @@ UlmeWennberg YiLuan HannanehHajishirzi - 5783–5788 + 5784–5789 We examine the capabilities of a unified, multi-task framework for three information extraction tasks: named entity recognition, relation extraction, and event extraction. Our framework (called DyGIE++) accomplishes all tasks by enumerating, refining, and scoring text spans designed to capture local (within-sentence) and global (cross-sentence) context. Our framework achieves state-of-the-art results across all tasks, on four datasets from a variety of domains. We perform experiments comparing different techniques to construct span representations. Contextualized embeddings like BERT perform well at capturing relationships among entities in the same or adjacent sentences, while dynamic span graph updates model long-range cross-sentence relationships. For instance, propagating span representations via predicted coreference links can enable the model to disambiguate challenging entity mentions. Our code is publicly available at https://github.com/dwadden/dygiepp and can be easily adapted for new tasks or datasets. D19-1585 D19-1585.Attachment.pdf @@ -6821,7 +6820,7 @@ Next Sentence Prediction helps Implicit Discourse Relation Classification within and across Domains WeiShi VeraDemberg - 5789–5795 + 5790–5796 Implicit discourse relation classification is one of the most difficult tasks in discourse parsing. Previous studies have generally focused on extracting better representations of the relational arguments. In order to solve the task, it is however additionally necessary to capture what events are expected to cause or follow each other. Current discourse relation classifiers fall short in this respect. We here show that this shortcoming can be effectively addressed by using the bidirectional encoder representation from transformers (BERT) proposed by Devlin et al. (2019), which was trained on a next-sentence prediction task, and thus encodes a representation of likely next sentences. The BERT-based model outperforms the current state of the art in 11-way classification by 8 percentage points on the standard PDTB dataset. Our experiments also demonstrate that the model can be successfully ported to other domains: on the BioDRB dataset, the model outperforms the state-of-the-art system by around 15 percentage points. D19-1586 10.18653/v1/D19-1586 @@ -6834,7 +6833,7 @@ HidetakaKamigaito ManabuOkumura MasaakiNagata - 5796–5801 + 5797–5802 Rhetorical Structure Theory (RST) parsing is crucial for many downstream NLP tasks that require a discourse structure for a text. Most of the previous RST parsers have been based on supervised learning approaches.
That is, they require an annotated corpus of sufficient size and quality, and heavily rely on language- and domain-dependent corpora. In this paper, we present two language-independent unsupervised RST parsing methods based on dynamic programming. The first one builds the optimal tree in terms of a dissimilarity score function that is defined for splitting a text span into smaller ones. The second builds the optimal tree in terms of a similarity score function that is defined for merging two adjacent spans into a large one. Experimental results on English and German RST treebanks showed that our parser based on span merging achieved the best score, around 0.8 F1 score, which is close to the scores of the previous supervised parsers. D19-1587 10.18653/v1/D19-1587 @@ -6845,7 +6844,7 @@ OmerLevy LukeZettlemoyer DanielWeld - 5802–5807 + 5803–5808 We apply BERT to coreference resolution, achieving a new state of the art on the GAP (+11.5 F1) and OntoNotes (+3.9 F1) benchmarks. A qualitative analysis of model predictions indicates that, compared to ELMo and BERT-base, BERT-large is particularly better at distinguishing between related but distinct entities (e.g., President and CEO), but that there is still room for improvement in modeling document-level context, conversations, and mention paraphrasing. We will release all code and trained models upon publication. D19-1588 10.18653/v1/D19-1588 @@ -6854,7 +6853,7 @@ Linguistic Versus Latent Relations for Modeling Coherent Flow in Paragraphs DongyeopKang EduardHovy - 5808–5814 + 5809–5815 Generating a long, coherent text such as a paragraph requires a high-level control of different levels of relations between sentences (e.g., tense, coreference). We call such a logical connection between sentences a (paragraph) flow. In order to produce a coherent flow of text, we explore two forms of intersentential relations in a paragraph: one is a human-created linguistic relation that forms a structure (e.g., discourse tree) and the other is a relation from latent representations learned from the sentences themselves. Our two proposed models incorporate each form of relations into document-level language models: the former is a supervised model that jointly learns a language model as well as discourse relation prediction, and the latter is an unsupervised model that is hierarchically conditioned by a recurrent neural network (RNN) over the latent information. Our proposed models with both forms of relations outperform the baselines in the partially conditioned paragraph generation task. Our code and data are publicly available. D19-1589 D19-1589.Attachment.pdf @@ -6867,7 +6866,7 @@ KentaroTorisawa Jong-HoonOh JulienKloetzer - 5815–5821 + 5816–5822 We propose new BERT-based methods for recognizing event causality such as “smoke cigarettes” → “die of lung cancer” written in web texts. In our methods, we grasp each annotator’s policy by training multiple classifiers, each of which predicts the labels given by a single annotator, and combine the resulting classifiers’ outputs to predict the final labels determined by majority vote. Furthermore, we investigate the effect of supplying background knowledge to our classifiers. Since BERT models are pre-trained with a large corpus, some sort of background knowledge for event causality may be learned during pre-training.
Our experiments with a Japanese dataset suggest that this is actually the case: Performance improved when we pre-trained the BERT models with web texts containing a large number of event causalities instead of Wikipedia articles or randomly sampled web texts. However, this effect was limited. Therefore, we further improved performance by simply adding texts related to an input causality candidate as background knowledge to the input of the BERT models. We believe these findings indicate a promising future research direction. D19-1590 10.18653/v1/D19-1590 @@ -6877,7 +6876,7 @@ JiXin JimmyLin YaoliangYu - 5822–5829 + 5823–5830 Memory neurons of long short-term memory (LSTM) networks encode and process information in powerful yet mysterious ways. While there has been work to analyze their behavior in carrying low-level information such as linguistic properties, how they directly contribute to label prediction remains unclear. We find inspiration from biologists and study the affinity between individual neurons and labels, propose a novel metric to quantify the sensitivity of neurons to each label, and conduct experiments to show the validity of our proposed metric. We discover that some neurons are trained to specialize on a subset of labels, and while dropping an arbitrary neuron has little effect on the overall accuracy of the model, dropping label-specialized neurons predictably and significantly degrades prediction accuracy on the associated label. We further examine the consistency of neuron-label affinity across different models. These observations provide insight into the inner mechanisms of LSTMs. D19-1591 10.18653/v1/D19-1591 @@ -6887,7 +6886,7 @@ Martenvan Schijndel AaronMueller TalLinzen - 5830–5836 + 5831–5837 Recurrent neural networks can learn to predict upcoming words remarkably well on average; in syntactically complex contexts, however, they often assign unexpectedly high probabilities to ungrammatical words. We investigate to what extent these shortcomings can be mitigated by increasing the size of the network and the corpus on which it is trained. We find that gains from increasing network size are minimal beyond a certain point. Likewise, expanding the training corpus yields diminishing returns; we estimate that the training corpus would need to be unrealistically large for the models to match human performance. A comparison to GPT and BERT, Transformer-based models trained on billions of words, reveals that these models perform even more poorly than our LSTMs in some constructions. Our results make the case for more data efficient architectures. D19-1592 D19-1592.Attachment.zip @@ -6900,7 +6899,7 @@ FelixHill Daniel M.Low AndersSøgaard - 5837–5844 + 5838–5845 Representational Similarity Analysis (RSA) is a technique developed by neuroscientists for comparing activity patterns of different measurement modalities (e.g., fMRI, electrophysiology, behavior). As a framework, RSA has several advantages over existing approaches to interpretation of language encoders based on probing or diagnostic classification: namely, it does not require large training samples, is not prone to overfitting, and it enables a more transparent comparison between the representational geometries of different models and modalities. We demonstrate the utility of RSA by establishing a previously unknown correspondence between widely-employed pretrained language encoders and human processing difficulty via eye-tracking data, showcasing its potential in the interpretability toolbox for neural models. 
D19-1593 D19-1593.Attachment.pdf @@ -6913,7 +6912,7 @@ KeithHall ChrisDyer JonathanBrennan - 5845–5851 + 5846–5852 Domain-specific training typically makes NLP systems work better. We show that this extends to cognitive modeling as well by relating the states of a neural phrase-structure parser to electrophysiological measures from human participants. These measures were recorded as participants listened to a spoken recitation of the same literary text that was supplied as input to the neural parser. Given more training data, the system derives a better cognitive model — but only when the training examples come from the same textual genre. This finding is consistent with the idea that humans adapt syntactic expectations to particular genres during language comprehension (Kaan and Chun, 2018; Branigan and Pickering, 2017). D19-1594 10.18653/v1/D19-1594 @@ -6923,7 +6922,7 @@ StevenDerby PaulMiller BarryDevereux - 5852–5858 + 5853–5859 Feature norm datasets of human conceptual knowledge, collected in surveys of human volunteers, yield highly interpretable models of word meaning and play an important role in neurolinguistic research on semantic cognition. However, these datasets are limited in size due to practical obstacles associated with exhaustively listing properties for a large number of words. In contrast, the development of distributional modelling techniques and the availability of vast text corpora have allowed researchers to construct effective vector space models of word meaning over large lexicons. However, this comes at the cost of interpretable, human-like information about word meaning. We propose a method for mapping human property knowledge onto a distributional semantic space, which adapts the word2vec architecture to the task of modelling concept features. Our approach gives a measure of concept and feature affinity in a single semantic space, which makes for easy and efficient ranking of candidate human-derived semantic properties for arbitrary words. We compare our model with a previous approach, and show that it performs better on several evaluation tasks. Finally, we discuss how our method could be used to develop efficient sampling techniques to extend existing feature norm datasets in a reliable way. D19-1595 10.18653/v1/D19-1595 @@ -6935,7 +6934,7 @@ AjayDivakaran StefanLee GiedriusBurachas - 5859–5864 + 5860–5865 While models for Visual Question Answering (VQA) have steadily improved over the years, interacting with one quickly reveals that these models lack consistency. For instance, if a model answers “red” to “What color is the balloon?”, it might answer “no” if asked, “Is the balloon red?”. These responses violate simple notions of entailment and raise questions about how effectively VQA models ground language. In this work, we introduce a dataset, ConVQA, and metrics that enable quantitative evaluation of consistency in VQA. For a given observable fact in an image (e.g. the balloon’s color), we generate a set of logically consistent question-answer (QA) pairs (e.g. Is the balloon red?) and also collect a human-annotated set of common-sense based consistent QA pairs (e.g. Is the balloon the same color as tomato sauce?). Further, we propose a consistency-improving data augmentation module, a Consistency Teacher Module (CTM). CTM automatically generates entailed (or similar-intent) questions for a source QA pair and fine-tunes the VQA model if the VQA’s answer to the entailed question is consistent with the source QA pair. 
We demonstrate that our CTM-based training improves the consistency of VQA models on the ConVQA datasets and is a strong baseline for further research. D19-1596 D19-1596.Attachment.pdf @@ -6951,7 +6950,7 @@ LinZhou XinyuDai YuzhongQu - 5865–5870 + 5866–5871 Scenario-based question answering (SQA) has attracted increasing research attention. It typically requires retrieving and integrating knowledge from multiple sources, and applying general knowledge to a specific case described by a scenario. SQA widely exists in the medical, geography, and legal domains—both in practice and in exams. In this paper, we introduce the GeoSQA dataset. It consists of 1,981 scenarios and 4,110 multiple-choice questions in the geography domain at high school level, where diagrams (e.g., maps, charts) have been manually annotated with natural language descriptions to benefit NLP research. Benchmark results on a variety of state-of-the-art methods for question answering, textual entailment, and reading comprehension demonstrate the unique challenges presented by SQA for future research. D19-1597 10.18653/v1/D19-1597 @@ -6961,7 +6960,7 @@ MatthewLe Y-LanBoureau MaximilianNickel - 5871–5876 + 5872–5877 Theory of mind, i.e., the ability to reason about intents and beliefs of agents, is an important task in artificial intelligence and central to resolving ambiguous references in natural language dialogue. In this work, we revisit the evaluation of theory of mind through question answering. We show that current evaluation methods are flawed and that existing benchmark tasks can be solved without theory of mind due to dataset biases. Based on prior work, we propose an improved evaluation protocol and dataset in which we explicitly control for data regularities via a careful examination of the answer space. We show that state-of-the-art methods which are successful on existing benchmarks fail to solve theory-of-mind tasks in our proposed approach. D19-1598 D19-1598.Attachment.pdf @@ -6974,7 +6973,7 @@ XiaofeiMa RameshNallapati BingXiang - 5877–5881 + 5878–5882 The BERT model has been successfully applied to open-domain QA tasks. However, previous work trains BERT by viewing passages corresponding to the same question as independent training instances, which may cause incomparable scores for answers from different passages. To tackle this issue, we propose a multi-passage BERT model to globally normalize answer scores across all passages of the same question, and this change enables our QA model to find better answers by utilizing more passages. In addition, we find that splitting articles into passages of 100 words with a sliding window improves performance by 4%. By leveraging a passage ranker to select high-quality passages, multi-passage BERT gains an additional 2%. Experiments on four standard benchmarks showed that our multi-passage BERT outperforms all state-of-the-art models on all benchmarks. In particular, on the OpenSQuAD dataset, our model gains 21.4% EM and 21.5% F1 over all non-BERT models, and 5.8% EM and 6.5% F1 over BERT-based models. 
The dataset is composed of nearly 20,000 real questions annotated on Wikipedia paragraphs by human experts. We also annotated a challenge set, which contains questions that require comprehensive understanding and multi-sentence inference throughout the context. We present several baseline systems as well as anonymous submissions to demonstrate the difficulty of this dataset. With the release of the dataset, we hosted the Second Evaluation Workshop on Chinese Machine Reading Comprehension (CMRC 2018). We hope the release of the dataset will further accelerate research on Chinese machine reading comprehension. Resources are available: https://github.com/ymcui/cmrc2018 D19-1600 10.18653/v1/D19-1600 @@ -7000,7 +6999,7 @@ SeungtaekChoi HaejuPark Seung-wonHwang - 5889–5894 + 5890–5895 This paper studies the problem of non-factoid question answering, where the answer may span over multiple sentences. Existing solutions can be categorized into representation- and interaction-focused approaches. We combine their complementary strengths by a hybrid approach allowing multi-granular interactions, but represented at word level, enabling an easy integration with strong word-level signals. Specifically, we propose MICRON: Multigranular Interaction for Contextualizing RepresentatiON, a novel approach which derives contextualized uni-gram representation from n-grams. Our contributions are as follows: First, we enable multi-granular matches between question and answer n-grams. Second, by contextualizing word representation with surrounding n-grams, MICRON can naturally utilize word-based signals for query term weighting, known to be effective in information retrieval. We validate MICRON on two public non-factoid question answering datasets: WikiPassageQA and InsuranceQA, showing our model achieves the state of the art among baselines with reported performances on both datasets. D19-1601 10.18653/v1/D19-1601 @@ -7015,7 +7014,7 @@ YajuanLyu KangLiu JunZhao - 5895–5900 + 5896–5901 Leveraging external knowledge is an emerging trend in the machine comprehension task. Previous work usually utilizes knowledge graphs such as ConceptNet as external knowledge, and extracts triples from them to enhance the initial representation of the machine comprehension context. However, such methods cannot capture the structural information in the knowledge graph. To this end, we propose a Structural Knowledge Graph-aware Network (SKG) model, constructing sub-graphs for entities in the machine comprehension context. Our method dynamically updates the representation of the knowledge according to the structural information of the constructed sub-graph. Experiments show that SKG achieves state-of-the-art performance on the ReCoRD dataset. D19-1602 10.18653/v1/D19-1602 @@ -7027,7 +7026,7 @@ PeterShaw MassimoNicosia YaseminAltun - 5901–5909 + 5902–5910 We present a novel approach to answering sequential questions based on structured objects such as knowledge bases or tables without using a logical form as an intermediate representation. We encode tables as graphs using a graph neural network model based on the Transformer architecture. The answers are then selected from the encoded graph using a pointer network. This model is appropriate for processing conversations around structured data, where the attention mechanism that selects the answers to a question can also be used to resolve conversational references. We demonstrate the validity of this approach with competitive results on the Sequential Question Answering (SQA) task. 
D19-1603 10.18653/v1/D19-1603 @@ -7038,7 +7037,7 @@ ShwetaGarg KartikMehta NikhilRasiwasia - 5910–5916 + 5911–5917 In this paper, we establish the effectiveness of using hard negatives, coupled with a siamese network and a suitable loss function, for the tasks of answer selection and answer triggering. We show that the choice of sampling strategy is key for achieving improved performance on these tasks. Evaluating on recent answer selection datasets - InsuranceQA, SelQA, and an internal QA dataset, we show that using hard negatives with relatively simple model architectures (bag of words and LSTM-CNN) drives significant performance gains. On InsuranceQA, this strategy alone improves over previously reported results by a minimum of 1.6 points in P@1. Using hard negatives with a Transformer encoder provides a further improvement of 2.3 points. Further, we propose to use quadruplet loss for answer triggering, with the aim of producing globally meaningful similarity scores. We show that the quadruplet loss function coupled with the selection of hard negatives enables bag-of-words models to improve F1 score by 2.3 points over previous baselines on the SelQA answer triggering dataset. Our results provide key insights into the answer selection and answer triggering tasks. D19-1604 D19-1604.Attachment.pdf @@ -7049,7 +7048,7 @@ AhmedElgohary DenisPeskov JordanBoyd-Graber - 5917–5923 + 5918–5924 Question answering is an AI-complete problem, but existing datasets lack key elements of language understanding such as coreference and ellipsis resolution. We consider sequential question answering: multiple questions are asked one-by-one in a conversation between a questioner and an answerer. Answering these questions is only possible through understanding the conversation history. We introduce the task of question-in-context rewriting: given the context of a conversation’s history, rewrite a context-dependent question into a self-contained question with the same answer. We construct CANARD, a dataset of 40,527 questions based on QuAC (Choi et al., 2018) and train Seq2Seq models for incorporating context into standalone questions. D19-1605 D19-1605.Attachment.zip @@ -7062,7 +7061,7 @@ AnaMarasović Noah A.Smith MattGardner - 5924–5931 + 5925–5932 Machine comprehension of texts longer than a single sentence often requires coreference resolution. However, most current reading comprehension benchmarks do not contain complex coreferential phenomena and hence fail to evaluate the ability of models to resolve coreference. We present a new crowdsourced dataset containing more than 24K span-selection questions that require resolving coreference among entities in over 4.7K English paragraphs from Wikipedia. Obtaining questions focused on such phenomena is challenging, because it is hard to avoid lexical cues that shortcut complex reasoning. We deal with this issue by using a strong baseline model as an adversary in the crowdsourcing loop, which helps crowdworkers avoid writing questions with exploitable surface cues. We show that state-of-the-art reading comprehension models perform significantly worse than humans on this benchmark—the best model performance is 70.5 F1, while the estimated human performance is 93.4 F1. D19-1606 10.18653/v1/D19-1606 @@ -7072,7 +7071,7 @@ Tsung-YuanHsu Chi-LiangLiu Hung-yiLee - 5932–5939 + 5933–5940 Because it is not feasible to collect training data for every language, there is a growing interest in cross-lingual transfer learning. 
In this paper, we systematically explore zero-shot cross-lingual transfer learning on reading comprehension tasks with a language representation model pre-trained on a multilingual corpus. The experimental results show that with pre-trained language representations, zero-shot learning is feasible, and translating the source data into the target language is not necessary and even degrades the performance. We further explore what the model learns in the zero-shot setting. D19-1607 D19-1607.Attachment.pdf @@ -7084,7 +7083,7 @@ MattGardner KevinLin PeterClark - 5940–5945 + 5941–5946 We introduce the first open-domain dataset, called QuaRTz, for reasoning about textual qualitative relationships. QuaRTz contains general qualitative statements, e.g., “A sunscreen with a higher SPF protects the skin longer.”, twinned with 3864 crowdsourced situated questions, e.g., “Billy is wearing sunscreen with a lower SPF than Lucy. Who will be best protected from the sun?”, plus annotations of the properties being compared. Unlike previous datasets, the general knowledge is textual and not tied to a fixed set of relationships, and tests a system’s ability to comprehend and apply textual qualitative knowledge in a novel setting. We find state-of-the-art results are substantially (20%) below human performance, presenting an open challenge to the NLP community. D19-1608 10.18653/v1/D19-1608 @@ -7095,7 +7094,7 @@ LuhengHe KentonLee EmilyPitler - 5946–5951 + 5947–5952 Reading comprehension models have been successfully applied to extractive text answers, but it is unclear how best to generalize these models to abstractive numerical answers. We enable a BERT-based reading comprehension model to perform lightweight numerical reasoning. We augment the model with a predefined set of executable ‘programs’ which encompass simple arithmetic as well as extraction. Rather than having to learn to manipulate numbers directly, the model can pick a program and execute it. On the recent Discrete Reasoning Over Passages (DROP) dataset, designed to challenge reading comprehension models, we show a 33% absolute improvement by adding shallow programs. The model can learn to predict new operations when appropriate in a math word problem setting (Roy and Roth, 2015) with very few training examples. D19-1609 D19-1609.Attachment.zip @@ -7107,7 +7106,7 @@ Quan HungTran TrungBui DaisukeKihara - 5952–5958 + 5953–5959 Answer selection is an important research problem, with applications in many areas. Previous deep learning based approaches for the task mainly adopt the Compare-Aggregate architecture that performs word-level comparison followed by aggregation. In this work, we take a departure from the popular Compare-Aggregate architecture, and instead, propose a new gated self-attention memory network for the task. Combined with a simple transfer learning technique from a large-scale online corpus, our model outperforms previous methods by a large margin, achieving new state-of-the-art results on two standard answer selection datasets: TrecQA and WikiQA. D19-1610 10.18653/v1/D19-1610 @@ -7116,7 +7115,7 @@ Polly Want a Cracker: Analyzing Performance of Parroting on Paraphrase Generation Datasets Hong-RenMao Hung-YiLee - 5959–5967 + 5960–5968 Paraphrase generation is an interesting and challenging NLP task which has numerous practical applications. 
In this paper, we analyze datasets commonly used for paraphrase generation research, and show that simply parroting input sentences surpasses state-of-the-art models in the literature when evaluated on standard metrics. Our findings illustrate that a model could be seemingly adept at generating paraphrases, despite only making trivial changes to the input sentence or even none at all. D19-1611 D19-1611.Attachment.zip @@ -7126,7 +7125,7 @@ Query-focused Sentence Compression in Linear Time AbramHandler BrendanO’Connor - 5968–5974 + 5969–5975 Search applications often display shortened sentences which must contain certain query terms and must fit within the space constraints of a user interface. This work introduces a new transition-based sentence compression technique developed for such settings. Our query-focused method constructs length and lexically constrained compressions in linear time, by growing a subgraph in the dependency parse of a sentence. This theoretically efficient approach achieves an 11x empirical speedup over baseline ILP methods, while better reconstructing gold constrained shortenings. Such speedups help query-focused applications, because users are measurably hindered by interface lags. Additionally, our technique does not require an ILP solver or a GPU. D19-1612 D19-1612.Attachment.zip @@ -7138,7 +7137,7 @@ ShuyangLi JianmoNi JulianMcAuley - 5975–5981 + 5976–5982 Existing approaches to recipe generation are unable to create recipes for users with culinary preferences but incomplete knowledge of ingredients in specific dishes. We propose a new task of personalized recipe generation to help these users: expanding a name and incomplete ingredient details into complete natural-text instructions aligned with the user’s historical preferences. We attend on technique- and recipe-level representations of a user’s previously consumed recipes, fusing these ‘user-aware’ representations in an attention fusion layer to control recipe text generation. Experiments on a new dataset of 180K recipes and 700K interactions show our model’s ability to generate plausible and personalized recipes compared to non-personalized baselines. D19-1613 D19-1613.Attachment.pdf @@ -7148,7 +7147,7 @@ Generating Highly Relevant Questions JiazuoQiu DeyiXiong - 5982–5986 + 5983–5987 Neural seq2seq-based question generation (QG) is prone to generating generic and undiversified questions that are only weakly relevant to the given passage and target answer. In this paper, we propose two methods to address the issue. (1) By a partial copy mechanism, we prioritize words that are morphologically close to words in the input passage when generating questions; (2) By a QA-based reranker, from the n-best list of question candidates, we select questions that are preferred by both the QA and QG model. Experiments and analyses demonstrate that the proposed two methods substantially improve the relevance of generated questions to passages and answers. D19-1614 10.18653/v1/D19-1614 @@ -7159,7 +7158,7 @@ Bodhisattwa PrasadMajumder JulianMcAuley GarrisonCottrell - 5987–5992 + 5988–5993 Stories generated with neural language models have shown promise in grammatical and stylistic consistency. However, the generated stories are still lacking in common sense reasoning, e.g., they often contain sentences deprived of world knowledge. 
We propose a simple multi-task learning scheme to achieve quantitatively better common sense reasoning in language models by leveraging auxiliary training signals from datasets designed to provide common sense grounding. When combined with our two-stage fine-tuning pipeline, our method achieves improved common sense reasoning and state-of-the-art perplexity on the WritingPrompts (Fan et al., 2018) story generation dataset. D19-1615 D19-1615.Attachment.pdf @@ -7169,7 +7168,7 @@ Abstract Text Summarization: A Low Resource Challenge ShantipriyaParida PetrMotlicek - 5993–5997 + 5994–5998 Text summarization is considered a challenging task in the NLP community. Datasets for the task of multilingual text summarization are rare and difficult to construct. In this work, we build an abstract text summarizer for German-language text using the state-of-the-art “Transformer” model. We propose an iterative data augmentation approach which uses synthetic data along with the real summarization data for the German language. To generate synthetic data, the Common Crawl (German) dataset is exploited, which covers different domains. The synthetic data is effective for the low resource condition and is particularly helpful for our multilingual scenario where the availability of summarization data is still a challenging issue. The data are also useful in deep learning scenarios where the neural models require a large amount of training data to utilize their capacity. The obtained summarization performance is measured in terms of ROUGE and BLEU scores. We achieve an absolute improvement of +1.5 and +16.0 in ROUGE1 F1 (R1_F1) on the development and test sets, respectively, compared to the system which does not rely on data augmentation. D19-1616 10.18653/v1/D19-1616 @@ -7178,7 +7177,7 @@ Generating Modern Poetry Automatically in <fixed-case>F</fixed-case>innish MikaHämäläinen KhalidAlnajjar - 5998–6003 + 5999–6004 We present a novel approach for generating poetry automatically for the morphologically rich Finnish language by using a genetic algorithm. The approach improves the state of the art of the previous Finnish poem generators by introducing a higher degree of freedom in terms of structural creativity. Our approach is evaluated and described within the paradigm of computational creativity, where the fitness functions of the genetic algorithm are assimilated with the notion of aesthetics. The output is considered to be a poem 81.5% of the time by human evaluators. D19-1617 10.18653/v1/D19-1617 @@ -7189,7 +7188,7 @@ ProdromosMalakasiotis MariannaApidianaki IonAndroutsopoulos - 6004–6010 + 6005–6011 We propose SUM-QE, a novel Quality Estimation model for summarization based on BERT. The model addresses linguistic quality aspects that are only indirectly captured by content-based approaches to summary evaluation, without involving comparison with human references. SUM-QE achieves very high correlations with human ratings, outperforming simpler models addressing these linguistic aspects. Predictions of the SUM-QE model can be used for system development, and to inform users of the quality of automatically produced summaries and other types of generated text. D19-1618 D19-1618.Attachment.zip @@ -7199,7 +7198,7 @@ An Empirical Comparison on Imitation Learning and Reinforcement Learning for Paraphrase Generation WanyuDu YangfengJi - 6011–6017 + 6012–6018 Generating paraphrases from given sentences involves decoding words step by step from a large vocabulary. 
To learn a decoder, supervised learning which maximizes the likelihood of tokens always suffers from exposure bias. Although both reinforcement learning (RL) and imitation learning (IL) have been widely used to alleviate the bias, the lack of direct comparison leads to only a partial picture of their benefits. In this work, we present an empirical study on how RL and IL can help boost the performance of generating paraphrases, with the pointer-generator as a base model. Experiments on the benchmark datasets show that (1) imitation learning is consistently better than reinforcement learning; and (2) the pointer-generator models with imitation learning outperform the state-of-the-art methods by a large margin. D19-1619 D19-1619.Attachment.zip @@ -7211,7 +7210,7 @@ YueDong Jackie Chi KitCheung AnnieLouis - 6018–6023 + 6019–6024 Sentence position is a strong feature for news summarization, since the lead often (but not always) summarizes the key points of the article. In this paper, we show that recent neural systems excessively exploit this trend, which although powerful for many inputs, is also detrimental when summarizing documents where important content should be extracted from later parts of the article. We propose two techniques to make systems sensitive to the importance of content in different parts of the article. The first technique employs ‘unbiased’ data; i.e., randomly shuffled sentences of the source document, to pretrain the model. The second technique uses an auxiliary ROUGE-based loss that encourages the model to distribute importance scores throughout a document by mimicking sentence-level ROUGE scores on the training data. We show that these techniques significantly improve the performance of a competitive reinforcement learning based extractive system, with the auxiliary loss being more powerful than pretraining. D19-1620 D19-1620.Attachment.zip @@ -7223,7 +7222,7 @@ Sanket VaibhavMehta JaimeCarbonell TaylorBerg-Kirkpatrick - 6024–6030 + 6025–6031 Existing recurrent neural language models often fail to capture higher-level structure present in text: for example, rhyming patterns present in poetry. Much prior work on poetry generation uses manually defined constraints which are satisfied during decoding using either specialized decoding procedures or rejection sampling. The rhyming constraints themselves are typically not learned by the generator. We propose an alternate approach that uses a structured discriminator to learn a poetry generator that directly captures rhyming constraints in a generative adversarial setup. By causing the discriminator to compare poems based only on a learned similarity matrix of pairs of line ending words, the proposed approach is able to successfully learn rhyming patterns in two different English poetry datasets (Sonnet and Limerick) without explicitly being provided with any phonetic information. D19-1621 10.18653/v1/D19-1621 @@ -7233,7 +7232,7 @@ WenjieZhou MinghuaZhang YunfangWu - 6031–6036 + 6032–6037 Question generation is a challenging task which aims to ask a question based on an answer and relevant context. Existing works suffer from a mismatch between question type and answer, e.g., generating a question of type how while the answer is a personal name. We propose to automatically predict the question type based on the input answer and context. Then, the question type is fused into a seq2seq model to guide the question generation, so as to deal with the mismatching problem. 
We achieve significant improvement in the accuracy of question type prediction and finally obtain state-of-the-art results for question generation on both the SQuAD and MARCO datasets. D19-1622 10.18653/v1/D19-1622 @@ -7244,7 +7243,7 @@ DerenLei PengdaQin William YangWang - 6037–6043 + 6038–6044 Deep reinforcement learning (RL) has been a commonly-used strategy for the abstractive summarization task to address both the exposure bias and non-differentiable task issues. However, the conventional reward Rouge-L simply looks for exact n-gram matches between candidates and annotated references, which inevitably makes the generated sentences repetitive and incoherent. In this paper, instead of Rouge-L, we explore the practicability of utilizing the distributional semantics to measure the matching degrees. With distributional semantics, sentence-level evaluation can be obtained, and semantically-correct phrases can also be generated without being limited to the surface form of the reference sentences. Human judgments on Gigaword and CNN/Daily Mail datasets show that our proposed distributional semantics reward (DSR) has distinct superiority in capturing the lexical and compositional diversity of natural language. D19-1623 10.18653/v1/D19-1623 @@ -7252,7 +7251,7 @@ Clause-Wise and Recursive Decoding for Complex and Cross-Domain Text-to-<fixed-case>SQL</fixed-case> Generation DongjunLee - 6044–6050 + 6045–6051 Most deep learning approaches for text-to-SQL generation are limited to the WikiSQL dataset, which only supports very simple queries over a single table. We focus on the Spider dataset, a complex and cross-domain text-to-SQL task, which includes complex queries over multiple tables. In this paper, we propose a SQL clause-wise decoding neural architecture with a self-attention based database schema encoder to address the Spider task. Each of the clause-specific decoders consists of a set of sub-modules, which is defined by the syntax of each clause. Additionally, our model works recursively to support nested queries. When evaluated on the Spider dataset, our approach achieves 4.6% and 9.8% accuracy gains on the test and dev sets, respectively. In addition, we show that our model is significantly more effective at predicting complex and nested queries than previous work. D19-1624 10.18653/v1/D19-1624 @@ -7263,7 +7262,7 @@ JonathanGordon NanyunPeng JonathanMay - 6051–6057 + 6052–6058 How do adjectives project from a noun to its parts? If a motorcycle is red, are its wheels red? Is a nuclear submarine’s captain nuclear? These questions are easy for humans to judge using our commonsense understanding of the world, but are difficult for computers. To attack this challenge, we crowdsource a set of human judgments that answer the English-language question “Given a whole described by an adjective, does the adjective also describe a given part?” We build strong baselines for this task with a classification approach. Our findings indicate that, despite the recent successes of large language models on tasks aimed at assessing commonsense knowledge, these models do not greatly outperform simple word-level models based on pre-trained word embeddings. This provides evidence that the amount of commonsense knowledge encoded in these language models does not extend far beyond that already baked into the word embeddings. 
Our dataset will serve as a useful testbed for future research in commonsense reasoning, especially as it relates to adjectives and objects. D19-1625 D19-1625.Attachment.pdf @@ -7276,7 +7275,7 @@ WeizhenQi NanDuan XiaolaLin - 6058–6062 + 6059–6063 In this work, we propose an aggregation method to combine the Bidirectional Encoder Representations from Transformer (BERT) with a MatchLSTM layer for Sequence Matching. Given a sentence pair, we extract its output representations from BERT. Then we extend BERT with a MatchLSTM layer to get further interaction of the sentence pair for sequence matching tasks. Taking natural language inference as an example, we split the BERT output into two parts, one from the premise sentence and one from the hypothesis sentence. At each position of the hypothesis sentence, both the weighted representation of the premise sentence and the representation of the current token are fed into the LSTM. We jointly train the aggregation layer and pre-trained layer for sequence matching. We conduct an experiment on two publicly available datasets, WikiQA and SNLI. Experiments show that our model achieves significant improvement compared with state-of-the-art methods on both datasets. D19-1626 10.18653/v1/D19-1626 @@ -7285,7 +7284,7 @@ What Does This Word Mean? Explaining Contextualized Embeddings with Natural Language Definition Ting-YunChang Yun-NungChen - 6063–6069 + 6064–6070 Contextualized word embeddings have boosted many NLP tasks compared with traditional static word embeddings. However, the word with a specific sense may have different contextualized embeddings due to its various contexts. To further investigate what contextualized word embeddings capture, this paper analyzes whether they can indicate the corresponding sense definitions and proposes a general framework that is capable of explaining word meanings given contextualized word embeddings for better interpretation. The experiments show that both ELMo and BERT embeddings can be well interpreted via a readable textual form, and the findings may benefit the research community for a better understanding of what the embeddings capture. D19-1627 D19-1627.Attachment.zip @@ -7299,7 +7298,7 @@ TengfeiMa VinayReddy RishiArora - 6070–6074 + 6071–6075 Pre-trained BERT contextualized representations have achieved state-of-the-art results on multiple downstream NLP tasks by fine-tuning with task-specific data. While there has been a lot of focus on task-specific fine-tuning, there has been limited work on improving the pre-trained representations. In this paper, we explore ways of improving the pre-trained contextual representations for the task of automatic short answer grading, a critical component of intelligent tutoring systems. We show that the pre-trained BERT model can be improved by augmenting data from domain-specific resources like textbooks. We also present a new approach to use labeled short answer grading data for further enhancement of the language model. Empirical evaluation on multi-domain datasets shows that task-specific fine-tuning on the enhanced pre-trained language model achieves superior performance for short answer grading. D19-1628 10.18653/v1/D19-1628 @@ -7311,7 +7310,7 @@ KeisukeSakaguchi PeterClark AntoineBosselut - 6075–6084 + 6076–6085 We introduce WIQA, the first large-scale dataset of “What if...” questions over procedural text. 
WIQA contains a collection of paragraphs, each annotated with multiple influence graphs describing how one change affects another, and a large (40k) collection of “What if...?” multiple-choice questions derived from these. For example, given a paragraph about beach erosion, would stormy weather hasten or decelerate erosion? WIQA contains three kinds of questions: perturbations to steps mentioned in the paragraph; external (out-of-paragraph) perturbations requiring commonsense knowledge; and irrelevant (no effect) perturbations. We find that state-of-the-art models achieve 73.8% accuracy, well below the human performance of 96.3%. We analyze the challenges, in particular tracking chains of influences, and present the dataset as an open challenge to the community. D19-1629 D19-1629.Attachment.zip @@ -7321,7 +7320,7 @@ Evaluating <fixed-case>BERT</fixed-case> for natural language inference: A case study on the <fixed-case>C</fixed-case>ommitment<fixed-case>B</fixed-case>ank NanjiangJiang Marie-Catherinede Marneffe - 6085–6090 + 6086–6091 Natural language inference (NLI) datasets (e.g., MultiNLI) were collected by soliciting hypotheses for a given premise from annotators. Such data collection led to annotation artifacts: systems can identify the premise-hypothesis relationship without observing the premise (e.g., negation in hypothesis being indicative of contradiction). We address this problem by recasting the CommitmentBank for NLI, which contains items involving reasoning over the extent to which a speaker is committed to complements of clause-embedding verbs under entailment-canceling environments (conditional, negation, modal and question). Instead of being constructed to stand in certain relationships with the premise, hypotheses in the recast CommitmentBank are the complements of the clause-embedding verb in each premise, leading to no annotation artifacts in the hypothesis. A state-of-the-art BERT-based model performs well on the CommitmentBank with 85% F1. However, analysis of model behavior shows that the BERT models still do not capture the full complexity of pragmatic reasoning, nor encode some of the linguistic generalizations, highlighting room for improvement. D19-1630 10.18653/v1/D19-1630 @@ -7334,7 +7333,7 @@ SantoshTokala NiloyGanguly PawanGoyal - 6091–6096 + 6092–6097 Recently, biomedical versions of embeddings obtained from language models such as BioELMo have shown state-of-the-art results for the textual inference task in the medical domain. In this paper, we explore how to incorporate structured domain knowledge, available in the form of a knowledge graph (UMLS), for the Medical NLI task. Specifically, we experiment with fusing embeddings obtained from the knowledge graph with state-of-the-art approaches for the NLI task (ESIM model). We also experiment with fusing domain-specific sentiment information for the task. Experiments conducted on the MedNLI dataset clearly show that this strategy improves the baseline BioELMo architecture for the Medical NLI task. D19-1631 D19-1631.Attachment.pdf @@ -7350,7 +7349,7 @@ PhilippKoehn VishravChaudhary Marc’AurelioRanzato - 6097–6110 + 6098–6111 For machine translation, a vast majority of language pairs in the world are considered low-resource because they have little parallel data available. Besides the technical challenges of learning with limited supervision, it is difficult to evaluate methods trained on low-resource language pairs because of the lack of freely and publicly available benchmarks. 
In this work, we introduce the FLORES evaluation datasets for Nepali–English and Sinhala–English, based on sentences translated from Wikipedia. Compared to English, these are languages with very different morphology and syntax, for which little out-of-domain parallel data is available and for which relatively large amounts of monolingual data are freely available. We describe our process to collect and cross-check the quality of translations, and we report baseline performance using several learning settings: fully supervised, weakly supervised, semi-supervised, and fully unsupervised. Our experiments demonstrate that current state-of-the-art methods perform rather poorly on this benchmark, posing a challenge to the research community working on low-resource MT. Data and code to reproduce our experiments are available at https://github.com/facebookresearch/flores. D19-1632 D19-1632.Attachment.zip @@ -7362,7 +7361,7 @@ OmerLevy YinhanLiu LukeZettlemoyer - 6111–6120 + 6112–6121 Most machine translation systems generate text autoregressively from left to right. We, instead, use a masked language modeling objective to train a model to predict any subset of the target words, conditioned on both the input text and a partially masked target translation. This approach allows for efficient iterative decoding, where we first predict all of the target words non-autoregressively, and then repeatedly mask out and regenerate the subset of words that the model is least confident about. By applying this strategy for a constant number of iterations, our model improves state-of-the-art performance levels for non-autoregressive and parallel decoding translation models by over 4 BLEU on average. It is also able to reach within about 1 BLEU point of a typical left-to-right transformer model, while decoding significantly faster. D19-1633 10.18653/v1/D19-1633 @@ -7374,7 +7373,7 @@ HuanboLuan JingfangXu MaosongSun - 6121–6131 + 6122–6132 Automatic post-editing (APE), which aims to correct errors in the output of machine translation systems in a post-processing step, is an important task in natural language processing. While recent work has achieved considerable performance gains by using neural networks, how to model the copying mechanism for APE remains a challenge. In this work, we propose a new method for modeling copying for APE. To better identify translation errors, our method learns the representations of source sentences and system outputs in an interactive way. These representations are used to explicitly indicate which words in the system outputs should be copied. Finally, CopyNet (Gu et al., 2016) can be combined with our method to place the copied words in correct positions in post-edited translations. Experiments on the datasets of the WMT 2016-2017 APE shared tasks show that our approach outperforms the best published results. D19-1634 10.18653/v1/D19-1634 @@ -7384,7 +7383,7 @@ YupeiDu YuanbinWu ManLan - 6132–6142 + 6133–6143 Word embeddings have been widely used to study gender stereotypes in texts. One key problem regarding existing bias scores is to evaluate their validity: do they really reflect true bias levels? For a small set of words (e.g. occupations), we can rely on human annotations or external data. However, for most words, evaluating their correctness is still an open problem. In this work, we utilize a word association test, which contains rich types of word connections annotated by human participants, to explore how gender stereotypes spread within our minds. 
Specifically, we use random walks on the word association graph to derive bias scores for a large number of words. Experiments show that these bias scores correlate well with bias in the real world. More importantly, compared with word-embedding-based bias scores, they provide a different perspective on gender stereotypes in words. D19-1635 10.18653/v1/D19-1635 @@ -7394,7 +7393,7 @@ AbhijitMishra TarunTater KarthikSankaranarayanan - 6143–6153 + 6144–6154 In this paper, we propose a novel framework for sarcasm generation; the system takes a literal negative opinion as input and translates it into a sarcastic version. Our framework does not require any paired data for training. Sarcasm emanates from context-incongruity which becomes apparent as the sentence unfolds. Our framework introduces incongruity into the literal input version through modules that: (a) filter factual content from the input opinion, (b) retrieve incongruous phrases related to the filtered facts, and (c) synthesize sarcastic text from the filtered facts and incongruous phrases. The framework employs reinforced neural sequence-to-sequence learning and information retrieval and is trained only using unlabeled non-sarcastic and sarcastic opinions. Since no labeled dataset exists for such a task, for evaluation, we manually prepare a benchmark dataset containing literal opinions and their sarcastic paraphrases. Qualitative and quantitative performance analyses on the data reveal our system’s superiority over baselines built using known unsupervised statistical and neural machine translation and style transfer techniques. D19-1636 10.18653/v1/D19-1636 @@ -7408,7 +7407,7 @@ WeijiangFeng Elena Suet-YingChiu HongYu - 6154–6163 + 6155–6164 Classical Chinese poetry is a jewel in the treasure house of Chinese culture. Previous poem generation models only allow users to employ keywords to influence the meaning of generated poems, leaving control of generation to the model. In this paper, we propose a novel task of generating classical Chinese poems from vernacular, which allows users to have more control over the semantics of generated poems. We adapt the approach of unsupervised machine translation (UMT) to our task. We use segmentation-based padding and reinforcement learning to address under-translation and over-translation, respectively. According to experiments, our approach significantly improves the perplexity and BLEU compared with typical UMT models. Furthermore, we explored guidelines on how to write the input vernacular to generate better poems. Human evaluation showed our approach can generate high-quality poems which are comparable to amateur poems. D19-1637 D19-1637.Attachment.pdf @@ -7418,7 +7417,7 @@ Set to Ordered Text: Generating Discharge Instructions from Medical Billing Codes LittonJ Kurisinkel NancyChen - 6164–6174 + 6165–6175 We present set to ordered text, a natural language generation task applied to automatically generating discharge instructions from admission ICD (International Classification of Diseases) codes. This task differs from other natural language generation tasks in the following ways: (1) The input is a set of identifiable entities (ICD codes) where the relations between individual entities are not explicitly specified. (2) The output text is not a narrative description (e.g. news articles) composed from the input. Rather, inferences are made from the input (symptoms specified in ICD codes) to generate the output (instructions). 
(3) There is an optimal order in which each sentence (instruction) should appear in the output. Unlike most other tasks, neither the input (ICD codes) nor their corresponding symptoms appear in the output, so the ordering of the output instructions needs to be learned in an unsupervised fashion. Based on clinical intuition, we hypothesize that each instruction in the output is mapped to a subset of ICD codes specified in the input. We propose a neural architecture that jointly models (a) subset selection: choosing relevant subsets from a set of input entities; (b) content ordering: learning the order of instructions; and (c) text generation: representing the instructions corresponding to the selected subsets in natural language. In addition, we penalize redundancy during beam search to improve tractability for long text generation. Our model outperforms baseline models in BLEU scores and human evaluation. We plan to extend this work to other tasks such as recipe generation from ingredients. D19-1638 10.18653/v1/D19-1638 @@ -7431,7 +7430,7 @@ EricMeinhardt EricBakovic LeonBergen - 6175–6185 + 6176–6186 Phonological processes are context-dependent sound changes in natural languages. We present an unsupervised approach to learning human-readable descriptions of phonological processes from collections of related utterances. Our approach builds upon a technique from the programming languages community called *constraint-based program synthesis*. We contribute a novel encoding of the learning problem into Boolean Satisfiability constraints, which enables both data efficiency and fast inference. We evaluate our system on textbook phonology problems and datasets from the literature, and show that it achieves high accuracy at interactive speeds. D19-1639 10.18653/v1/D19-1639 @@ -7446,7 +7445,7 @@ QiongZhang LuoSi XiaozhongLiu - 6186–6195 + 6187–6196 The task of Chinese text spam detection is very challenging due to both glyph and phonetic variations of Chinese characters. This paper proposes a novel framework to jointly model Chinese variational, semantic, and contextualized representations for the Chinese text spam detection task. In particular, a Variation Family-enhanced Graph Embedding (VFGE) algorithm is designed based on a Chinese character variation graph. The VFGE can learn both the graph embeddings of the Chinese characters (local) and the latent variation families (global). Furthermore, an enhanced bidirectional language model, with a combination gate function and an aggregation learning function, is proposed to integrate the graph and text information while capturing the sequential information. Extensive experiments have been conducted on both SMS and review datasets to show that the proposed method outperforms a series of state-of-the-art models for Chinese spam detection. D19-1640 10.18653/v1/D19-1640 @@ -7455,7 +7454,7 @@ An Attentive Fine-Grained Entity Typing Model with Latent Type Representation YingLin HengJi - 6196–6201 + 6197–6202 We propose a fine-grained entity typing model with a novel attention mechanism and a hybrid type classifier. We advance existing methods in two aspects: feature extraction and type prediction. To capture richer contextual information, we adopt contextualized word representations instead of fixed word embeddings used in previous work. In addition, we propose a two-step mention-aware attention mechanism to enable the model to focus on important words in mentions and contexts. 
We also present a hybrid classification method beyond binary relevance to exploit type inter-dependency with latent type representation. Instead of independently predicting each type, we predict a low-dimensional vector that encodes latent type features and reconstruct the type vector from this latent representation. Experimental results on multiple data sets show that our model significantly advances the state-of-the-art on fine-grained entity typing, obtaining up to 6.1% and 5.5% absolute gains in macro-averaged F-score and micro-averaged F-score, respectively. D19-1641 10.18653/v1/D19-1641 @@ -7465,7 +7464,7 @@ QiangNing SanjaySubramanian DanRoth - 6202–6208 + 6203–6209 Determining temporal relations (e.g., before or after) between events has been a challenging natural language understanding task, partly due to the difficulty of generating large amounts of high-quality training data. Consequently, neural approaches have not been widely used on it, or showed only moderate improvements. This paper proposes a new neural system that achieves about 10% absolute improvement in accuracy over the previous best system (25% error reduction) on two benchmark datasets. The proposed system is trained on the state-of-the-art MATRES dataset and applies contextualized word embeddings, a Siamese encoder of a temporal common sense knowledge base, and global inference via integer linear programming (ILP). We suggest that the new approach could serve as a strong baseline for future research in this area. D19-1642 D19-1642.Attachment.zip @@ -7477,7 +7476,7 @@ DonghongDu XinLi YangqiuSong - 6209–6214 + 6210–6215 Fine-grained entity typing is a challenging problem since it usually involves a relatively large tag set and may require understanding the context of the entity mention. In this paper, we use entity linking to help with the fine-grained entity type classification process. We propose a deep neural model that makes predictions based on both the context and the information obtained from entity linking results. Experimental results on two commonly used datasets demonstrate the effectiveness of our approach. On both datasets, it achieves more than 5% absolute strict accuracy improvement over the state of the art. D19-1643 10.18653/v1/D19-1643 @@ -7486,7 +7485,7 @@ Combining Spans into Entities: A Neural Two-Stage Approach for Recognizing Discontiguous Entities BailinWang WeiLu - 6215–6223 + 6216–6224 In medical documents, it is possible that an entity of interest not only contains a discontiguous sequence of words but also overlaps with another entity. Entities of such structures are intrinsically hard to recognize due to the large space of possible entity combinations. In this work, we propose a neural two-stage approach to recognizing discontiguous and overlapping entities by decomposing this problem into two subtasks: 1) it first detects all the overlapping spans that either form entities on their own or present as segments of discontiguous entities, based on the representation of segmental hypergraph, 2) next it learns to combine these segments into discontiguous entities with a classifier, which filters out other incorrect combinations of segments. Two neural components are designed for these subtasks respectively and they are learned jointly using a shared encoder for text. Our model achieves the state-of-the-art performance on a standard dataset, even in the absence of external features that previous methods used. 
D19-1644 10.18653/v1/D19-1644 @@ -7497,7 +7496,7 @@ TakuyaHiraoka KunihikoSadamasa MathiasNiepert - 6224–6230 + 6225–6231 Most existing relation extraction approaches exclusively target binary relations, and n-ary relation extraction is relatively unexplored. The current state-of-the-art n-ary relation extraction method is based on a supervised learning approach and, therefore, may suffer from the lack of sufficient relation labels. In this paper, we propose a novel approach to cross-sentence n-ary relation extraction based on universal schemas. To alleviate the sparsity problem and to leverage inherent decomposability of n-ary relations, we propose to learn relation representations of lower-arity facts that result from decomposing higher-arity facts. The proposed method computes a score of a new n-ary fact by aggregating scores of its decomposed lower-arity facts. We conduct experiments with datasets for ternary relation extraction and empirically show that our method improves the n-ary relation extraction performance compared to previous methods. D19-1645 D19-1645.Attachment.zip @@ -7511,7 +7510,7 @@ LeSun BinDong ShanshanJiang - 6231–6236 + 6232–6237 Current region-based NER models only rely on fully-annotated training data to learn an effective region encoder, which often face the training data bottleneck. To alleviate this problem, this paper proposes Gazetteer-Enhanced Attentive Neural Networks, which can enhance region-based NER by learning name knowledge of entity mentions from easily-obtainable gazetteers, rather than only from fully-annotated data. Specifically, we first propose an attentive neural network (ANN), which explicitly models the mention-context association and therefore is convenient for integrating externally-learned knowledge. Then we design an auxiliary gazetteer network, which can effectively encode name regularity of mentions only using gazetteers. Finally, the learned gazetteer network is incorporated into ANN for better NER. Experiments show that our ANN can achieve the state-of-the-art performance on the ACE2005 named entity recognition benchmark. Besides, incorporating the gazetteer network can further improve the performance and significantly reduce the requirement of training data. D19-1646 10.18653/v1/D19-1646 @@ -7522,7 +7521,7 @@ RobertJäschke FrankFischer JannikStrötgen - 6237–6242 + 6238–6243 Attributing a particular property to a person by naming another person, who is typically well-known for the respective property, is called a Vossian Antonomasia (VA). This subtype of metonymy, which overlaps with metaphor, has a specific syntax and is especially frequent in journalistic texts. While identifying Vossian Antonomasia is of particular interest in the study of stylistics, it is also a source of errors in relation and fact extraction as an explicitly mentioned entity occurs only metaphorically and should not be associated with respective contexts. Despite rather simple syntactic variations, the automatic extraction of VA has not been addressed yet, since it requires a deeper semantic understanding of mentioned entities and underlying relations. In this paper, we propose a first method for the extraction of VAs that works completely automatically. Our approaches use named entity recognition, distant supervision based on Wikidata, and a bi-directional LSTM for postprocessing. 
The evaluation on 1.8 million articles of the New York Times corpus shows that our approach significantly outperforms the only existing semi-automatic approach for VA identification by more than 30 percentage points in precision. D19-1647 10.18653/v1/D19-1647 @@ -7534,7 +7533,7 @@ TakashiNinomiya TakuyaMakino TomoyaIwakura - 6243–6248 + 6244–6249 We propose a method to improve named entity recognition (NER) for chemical compounds using multi-task learning by jointly training a chemical NER model and a chemical compound paraphrase model. Our method enables the long short-term memory (LSTM) of the NER model to capture chemical compound paraphrases by sharing the parameters of the LSTM and character embeddings between the two models. The experimental results on the BioCreative IV’s CHEMDNER task show that our method improves chemical NER and achieves state-of-the-art performance. D19-1648 10.18653/v1/D19-1648 @@ -7548,7 +7547,7 @@ PengLi MaosongSun JieZhou - 6249–6254 + 6250–6255 We present FewRel 2.0, a more challenging task to investigate two aspects of few-shot relation classification models: (1) Can they adapt to a new domain with only a handful of instances? (2) Can they detect none-of-the-above (NOTA) relations? To construct FewRel 2.0, we build upon the FewRel dataset by adding a new test set in a quite different domain, and a NOTA relation choice. With the new dataset and extensive experimental analysis, we found (1) that the state-of-the-art few-shot relation classification models struggle on these two aspects, and (2) that the commonly-used techniques for domain adaptation and NOTA detection still cannot handle the two challenges well. Our research calls for more attention and further efforts to these two real-world issues. All details and resources about the dataset and baselines are released at https://github.com/thunlp/fewrel. D19-1649 10.18653/v1/D19-1649 @@ -7558,7 +7557,7 @@ StephenMayhew TatianaTsygankova DanRoth - 6255–6260 + 6256–6261 For those languages which use it, capitalization is an important signal for the fundamental NLP tasks of Named Entity Recognition (NER) and Part of Speech (POS) tagging. In fact, it is such a strong signal that model performance on these tasks drops sharply in common lowercased scenarios, such as noisy web text or machine translation outputs. In this work, we perform a systematic analysis of solutions to this problem, modifying only the casing of the train or test data using lowercasing and truecasing methods. While prior work and first impressions might suggest training a caseless model, or using a truecaser at test time, we show that the most effective strategy is a concatenation of cased and lowercased training data, producing a single model with high performance on both cased and uncased text. As shown in our experiments, this result holds across tasks and input representations. Finally, we show that our proposed solution gives an 8% F1 improvement in mention detection on noisy out-of-domain Twitter data. D19-1650 10.18653/v1/D19-1650 @@ -7568,7 +7567,7 @@ SangnieBhardwaj SamarthAggarwal MausamMausam - 6261–6266 + 6262–6267 Open Information Extraction (Open IE) systems have been traditionally evaluated via manual annotation. Recently, an automated evaluator with a benchmark dataset (OIE2016) was released – it scores Open IE systems automatically by matching system predictions with predictions in the benchmark dataset. 
Unfortunately, our analysis reveals that its data is rather noisy, and the tuple matching in the evaluator has issues, making the results of automated comparisons less trustworthy. We contribute CaRB, an improved dataset and framework for testing Open IE systems. To the best of our knowledge, CaRB is the first crowdsourced Open IE dataset, and it also makes substantive changes in the matching code and metrics. NLP experts annotate CaRB’s dataset to be more accurate than OIE2016. Moreover, we find that on one pair of Open IE systems, the CaRB framework provides contradictory results to OIE2016. Human assessment verifies that CaRB’s ranking of the two systems is the accurate one. We release the CaRB framework along with its crowdsourced dataset. D19-1651 D19-1651.Attachment.zip @@ -7578,7 -7577,7 @@ Weakly Supervised Attention Networks for Entity Recognition BarunPatra Joel Ruben AntonyMoniz - 6267–6272 + 6268–6273 The task of entity recognition has traditionally been modelled as a sequence labelling task. However, this usually requires a large amount of fine-grained data annotated at the token level, which in turn can be expensive and cumbersome to obtain. In this work, we aim to circumvent this requirement of word-level annotated data. To achieve this, we propose a novel architecture for entity recognition from a corpus containing weak binary presence/absence labels, which are relatively easier to obtain. We show that our proposed weakly supervised model, trained solely on a multi-label classification task, performs reasonably well on the task of entity recognition, despite not having access to any token-level ground truth data. D19-1652 D19-1652.Attachment.pdf @@ -7589,7 +7588,7 @@ GakuMorio RyoEgawa KatsuhideFujita - 6273–6278 + 6274–6279 In online arguments, identifying how users construct their arguments to persuade others is important in order to understand a persuasive strategy directly. However, existing research lacks empirical investigations on highly semantic aspects of elementary units (EUs), such as propositions for a persuasive online argument. Therefore, this paper focuses on a pilot study, revealing a persuasion strategy using EUs. Our contributions are as follows: (1) annotating five types of EUs in a persuasive forum, the so-called ChangeMyView, (2) revealing both intuitive and non-intuitive strategic insights for the persuasion by analyzing 4612 annotated EUs, and (3) proposing baseline neural models that identify the EU boundary and type. Our observations imply that EUs definitively characterize online persuasion strategies. D19-1653 10.18653/v1/D19-1653 @@ -7601,7 +7600,7 @@ RuifengXu XiangAo MinYang - 6279–6284 + 6280–6285 Aspect-based sentiment analysis (ABSA) has attracted increasing attention recently due to its broad applications. In existing ABSA datasets, most sentences contain only one aspect or multiple aspects with the same sentiment polarity, which makes the ABSA task degenerate to sentence-level sentiment analysis. In this paper, we present a new large-scale Multi-Aspect Multi-Sentiment (MAMS) dataset, in which each sentence contains at least two different aspects with different sentiment polarities. The release of this dataset would push forward the research in this field. In addition, we propose simple yet effective CapsNet and CapsNet-BERT models which combine the strengths of recent NLP advances.
Experiments on our new dataset show that the proposed model significantly outperforms the state-of-the-art baseline methods. D19-1654 10.18653/v1/D19-1654 @@ -7613,7 +7612,7 @@ ChaozhuoLi YanYang TianruiLi - 6285–6291 + 6286–6292 Deep neural networks (DNNs) can fit (or even over-fit) the training data very well. If a DNN model is trained using data with noisy labels and tested on data with clean labels, the model may perform poorly. This paper studies the problem of learning with noisy labels for sentence-level sentiment classification. We propose a novel DNN model called NetAb (as shorthand for convolutional neural Networks with Ab-networks) to handle noisy labels during training. NetAb consists of two convolutional neural networks, one with a noise transition layer for dealing with the input noisy labels and the other for predicting ‘clean’ labels. We train the two networks using their respective loss functions in a mutual reinforcement manner. Experimental results demonstrate the effectiveness of the proposed model. D19-1655 10.18653/v1/D19-1655 @@ -7623,7 +7622,7 @@ ChenLiu MuhammadOsama AndersonDe Andrade - 6292–6297 + 6293–6298 We introduce a new dataset for multi-class emotion analysis from long-form narratives in English. The Dataset for Emotions of Narrative Sequences (DENS) was collected from both classic literature available on Project Gutenberg and modern online narratives available on Wattpad, annotated using Amazon Mechanical Turk. A number of statistics and baseline benchmarks are provided for the dataset. Of the tested techniques, we find that the fine-tuning of a pre-trained BERT model achieves the best results, with an average micro-F1 score of 60.4%. Our results show that the dataset provides a novel opportunity in emotion analysis that requires moving beyond existing sentence-level techniques. D19-1656 D19-1656.Attachment.zip @@ -7633,7 +7632,7 @@ Multi-Task Stance Detection with Sentiment and Stance Lexicons YingjieLi CorneliaCaragea - 6298–6304 + 6299–6305 Stance detection aims to detect whether the opinion holder is in support of or against a given target. Recent works show improvements in stance detection by using either the attention mechanism or sentiment information. In this paper, we propose a multi-task framework that incorporates a target-specific attention mechanism and at the same time takes sentiment classification as an auxiliary task. Moreover, we used a sentiment lexicon and constructed a stance lexicon to provide guidance for the attention layer. Experimental results show that the proposed model significantly outperforms state-of-the-art deep learning methods on the SemEval-2016 dataset. D19-1657 10.18653/v1/D19-1657 @@ -7642,7 +7641,7 @@ A Robust Self-Learning Framework for Cross-Lingual Text Classification XinDong Gerardde Melo - 6305–6309 + 6306–6310 Based on massive amounts of data, recent pretrained contextual representation models have made significant strides in advancing a number of different English NLP tasks. However, for other languages, relevant training data may be lacking, while state-of-the-art deep learning methods are known to be data-hungry. In this paper, we present an elegantly simple robust self-learning framework to include unlabeled non-English samples in the fine-tuning process of pretrained multilingual representation models. We leverage a multilingual model’s own predictions on unlabeled non-English data in order to obtain additional information that can be used during further fine-tuning.
Compared with original multilingual models and other cross-lingual classification models, we observe significant gains in effectiveness on document and sentiment classification for a range of diverse languages. D19-1658 10.18653/v1/D19-1658 @@ -7650,7 +7649,7 @@ Learning to Flip the Sentiment of Reviews from Non-Parallel Corpora CanasaiKruengkrai - 6310–6315 + 6311–6316 Flipping sentiment while preserving sentence meaning is challenging because parallel sentences with the same content but different sentiment polarities are not always available for model learning. We introduce a method for acquiring imperfectly aligned sentences from non-parallel corpora and propose a model that learns to minimize the sentiment and content losses in a fully end-to-end manner. Our model is simple and offers well-balanced results across two domains: Yelp restaurant and Amazon product reviews. D19-1659 10.18653/v1/D19-1659 @@ -7662,7 +7661,7 @@ YukaTakei HirokiOkamoto JunGoto - 6316–6321 + 6317–6322 Twitter is used for various applications such as disaster monitoring and news material gathering. In these applications, each Tweet is classified into pre-defined classes. These classes have a semantic relationship with each other and can be classified into a hierarchical structure, which is regarded as important information. Label texts of pre-defined classes themselves also include important clues for classification. Therefore, we propose a method that can consider the hierarchical structure of labels and label texts themselves. We conducted an evaluation over the Text REtrieval Conference (TREC) 2018 Incident Streams (IS) track dataset, and we found that our method outperformed the methods of the conference participants. D19-1660 10.18653/v1/D19-1660 @@ -7672,7 +7671,7 @@ MiriamHurtado Bodell MartinArvidsson MånsMagnusson - 6322–6328 + 6323–6329 Word embeddings have demonstrated strong performance on NLP tasks. However, lack of interpretability and the unsupervised nature of word embeddings have limited their use within computational social science and digital humanities. We propose the use of informative priors to create interpretable and domain-informed dimensions for probabilistic word embeddings. Experimental results show that sensible priors can capture latent semantic concepts better than or on-par with the current state of the art, while retaining the simplicity and generalizability of using priors. D19-1661 D19-1661.Attachment.zip @@ -7685,7 +7684,7 @@ YanaiElazar DesmondElliott AndersSøgaard - 6329–6334 + 6330–6335 Elazar and Goldberg (2018) showed that protected attributes can be extracted from the representations of a debiased neural network for mention detection at above-chance levels, by evaluating a diagnostic classifier on a held-out subsample of the data it was trained on. We revisit their experiments and conduct a series of follow-up experiments showing that, in fact, the diagnostic classifier generalizes poorly to both new in-domain samples and new domains, indicating that it relies on correlations specific to their particular data sample. We further show that a diagnostic classifier trained on the biased baseline neural network also does not generalize to new samples. In other words, the biases detected in Elazar and Goldberg (2018) seem restricted to their particular data sample, and would therefore not bias the decisions of the model on new samples, whether in-domain or out-of-domain. In light of this, we discuss better methodologies for detecting bias in our models.
D19-1662 D19-1662.Attachment.pdf @@ -7696,7 +7695,7 @@ JasabantaPatro SrijanBansal AnimeshMukherjee - 6335–6341 + 6336–6342 In this paper we propose a deep learning framework for sarcasm target detection in predefined sarcastic texts. Identification of sarcasm targets can help in many core natural language processing tasks such as aspect based sentiment analysis, opinion mining, etc. To begin with, we perform an empirical study of the socio-linguistic features and identify those that are statistically significant in indicating sarcasm targets (p-values in the range (0.05, 0.001)). Finally, we present a deep-learning framework augmented with socio-linguistic features to detect sarcasm targets in sarcastic book-snippets and tweets. We achieve a huge improvement in performance in terms of exact match and dice scores compared to the current state-of-the-art baseline. D19-1663 10.18653/v1/D19-1663 @@ -7710,7 +7709,7 @@ Prafulla KumarChoubey RuihongHuang LuWang - 6342–6348 + 6343–6349 The increasing prevalence of political bias in news media calls for greater public awareness of it, as well as robust methods for its detection. While prior work in NLP has primarily focused on the lexical bias captured by linguistic attributes such as word choice and syntax, other types of bias stem from the actual content selected for inclusion in the text. In this work, we investigate the effects of informational bias: factual content that can nevertheless be deployed to sway reader opinion. We first produce a new dataset, BASIL, of 300 news articles annotated with 1,727 bias spans and find evidence that informational bias appears in news articles more frequently than lexical bias. We further study our annotations to observe how informational bias surfaces in news articles by different media outlets. Lastly, a baseline model for informational bias prediction is presented by fine-tuning BERT on our labeled data, indicating the challenges of the task and future directions. D19-1664 D19-1664.Attachment.zip @@ -7720,7 +7719,7 @@ Incorporating Label Dependencies in Multilabel Stance Detection WilliamFerreira AndreasVlachos - 6349–6353 + 6350–6354 Stance detection in social media is a well-studied task in a variety of domains. Nevertheless, previous work has mostly focused on multiclass versions of the problem, where the labels are mutually exclusive, and typically positive, negative or neutral. In this paper, we address versions of the task in which an utterance can have multiple labels, thus corresponding to multilabel classification. We propose a method that explicitly incorporates label dependencies in the training objective and compare it against a variety of baselines, as well as a reduction of multilabel to multiclass learning. In experiments with three datasets, we find that our proposed method improves upon all baselines on two out of three datasets. We also show that the reduction of multilabel to multiclass classification can be very competitive, especially in cases where the output consists of a small number of labels and one can enumerate over all label combinations. D19-1665 D19-1665.Attachment.zip @@ -7734,7 +7733,7 @@ AlvinGrissom II BrendanO’Connor MohitIyyer - 6354–6360 + 6355–6361 Sports broadcasters inject drama into play-by-play commentary by building team and player narratives through subjective analyses and anecdotes. Prior studies based on small datasets and manual coding show that such theatrics evince commentator bias in sports broadcasts.
To examine this phenomenon, we assemble FOOTBALL, which contains 1,455 broadcast transcripts from American football games across six decades that are automatically annotated with 250K player mentions and linked with racial metadata. We identify major confounding factors for researchers examining racial bias in FOOTBALL, and perform a computational analysis that supports conclusions from prior social science studies. D19-1666 D19-1666.Attachment.pdf @@ -7747,7 +7746,7 @@ WeiDai ZehuiDai YadongDing - 6361–6366 + 6362–6367 Judgment prediction for legal cases has attracted many research efforts for its practical use, of which the ultimate goal is prison term prediction. While existing work merely predicts the total prison term, in reality a defendant is often charged with multiple crimes. In this paper, we argue that charge-based prison term prediction (CPTP) not only better fits realistic needs, but also makes the total prison term prediction more accurate and interpretable. We collect the first large-scale structured data for CPTP and evaluate several competitive baselines. Based on the observation that fine-grained feature selection is the key to achieving good performance, we propose the Deep Gating Network (DGN) for charge-specific feature selection and aggregation. Experiments show that DGN achieves the state-of-the-art performance. D19-1667 10.18653/v1/D19-1667 @@ -7757,7 +7756,7 @@ YannisAssael TheaSommerschield JonathanPrag - 6367–6374 + 6368–6375 Ancient History relies on disciplines such as Epigraphy, the study of ancient inscribed texts, for evidence of the recorded past. However, these texts, “inscriptions”, are often damaged over the centuries, and illegible parts of the text must be restored by specialists, known as epigraphists. This work presents Pythia, the first ancient text restoration model that recovers missing characters from a damaged text input using deep neural networks. Its architecture is carefully designed to handle long-term context information, and deal efficiently with missing or corrupted character and word representations. To train it, we wrote a non-trivial pipeline to convert PHI, the largest digital corpus of ancient Greek inscriptions, to machine actionable text, which we call PHI-ML. On PHI-ML, Pythia’s predictions achieve a 30.1% character error rate, compared to the 57.3% of human epigraphists. Moreover, in 73.5% of cases the ground-truth sequence was among the Top-20 hypotheses of Pythia, which effectively demonstrates the impact of this assistive method on the field of digital epigraphy, and sets the state-of-the-art in ancient text restoration. D19-1668 D19-1668.Attachment.zip @@ -7769,7 +7768,7 @@ AndrewCattle EvangelosPapalexakis XiaojuanMa - 6375–6380 + 6376–6381 We propose a novel tensor embedding method that can effectively extract lexical features for humor recognition. Specifically, we use word-word co-occurrence to encode the contextual content of documents, and then decompose the tensor to get corresponding vector representations. We show that this simple method can capture features of lexical humor effectively for continuous humor recognition. In particular, we achieve a distance of 0.887 on a global humor ranking task, comparable to the top performing systems from SemEval 2017 Task 6B (Potash et al., 2017) but without the need for any external training corpus.
In addition, we further show that this approach is also beneficial for small sample humor recognition tasks through a semi-supervised label propagation procedure, which achieves about 0.7 accuracy on the 16000 One-Liners (Mihalcea and Strapparava, 2005) and Pun of the Day (Yang et al., 2015) humour classification datasets using only 10% of known labels. D19-1669 10.18653/v1/D19-1669 @@ -7778,7 +7777,7 @@ <fixed-case>EDA</fixed-case>: Easy Data Augmentation Techniques for Boosting Performance on Text Classification Tasks JasonWei KaiZou - 6381–6387 + 6382–6388 We present EDA: easy data augmentation techniques for boosting performance on text classification tasks. EDA consists of four simple but powerful operations: synonym replacement, random insertion, random swap, and random deletion. On five text classification tasks, we show that EDA improves performance for both convolutional and recurrent neural networks. EDA demonstrates particularly strong results for smaller datasets; on average, across five datasets, training with EDA while using only 50% of the available training set achieved the same accuracy as normal training with all available data. We also performed extensive ablation studies and suggest parameters for practical use. D19-1670 D19-1670.Attachment.zip @@ -7792,7 +7791,7 @@ TaoQi YongfengHuang XingXie - 6388–6393 + 6389–6394 News recommendation can help users find news of interest and alleviate information overload. Precisely modeling news and users is critical for news recommendation, and capturing the contexts of words and news is important to learn news and user representations. In this paper, we propose a neural news recommendation approach with multi-head self-attention (NRMS). The core of our approach is a news encoder and a user encoder. In the news encoder, we use multi-head self-attentions to learn news representations from news titles by modeling the interactions between words. In the user encoder, we learn representations of users from their browsed news and use multi-head self-attention to capture the relatedness between the news. Besides, we apply additive attention to learn more informative news and user representations by selecting important words and news. Experiments on a real-world dataset validate the effectiveness and efficiency of our approach. D19-1671 10.18653/v1/D19-1671 @@ -7802,7 +7801,7 @@ XiaoleiHuang JonathanMay NanyunPeng - 6394–6400 + 6395–6401 Building named entity recognition (NER) models for languages that do not have much training data is a challenging task. While recent work has shown promising results on cross-lingual transfer from high-resource languages, it is unclear what knowledge is transferred. In this paper, we first propose a simple and efficient neural architecture for cross-lingual NER. Experiments show that our model achieves competitive performance with the state-of-the-art. We further explore how transfer learning works for cross-lingual NER on two transferable factors: sequential order and multilingual embedding. Our results shed light on future research for improving cross-lingual NER. D19-1672 10.18653/v1/D19-1672 @@ -7814,7 +7813,7 @@ XikaiLiu HongfeiLin FengXia - 6401–6406 + 6402–6407 Humor plays an important role in human communication, which makes it an important problem for natural language processing. Prior work on the analysis of humor focuses on whether text is humorous or not, or the degree of funniness, but this is insufficient to explain why it is funny.
We therefore create a dataset on humor with 9,123 manually annotated jokes in Chinese. We propose a novel annotation scheme to give scenarios of how humor arises in text. Specifically, our annotations of linguistic humor not only contain the degree of funniness, like previous work, but they also contain key words that trigger humor as well as character relationship, scene, and humor categories. We report reasonable agreement between annotators. We also conduct an analysis and exploration of the dataset. To the best of our knowledge, we are the first to approach humor annotation for exploring the underlying mechanism of the use of humor, which may contribute to a significantly deeper analysis of humor. We also contribute a scarce and valuable dataset, which we will release publicly. D19-1673 10.18653/v1/D19-1673 @@ -7825,7 +7824,7 @@ ShoTakase TsutomuHirao MasaakiNagata - 6407–6411 + 6408–6412 An anagram is a sentence or a phrase that is made by permuting the characters of an input sentence or a phrase. For example, “Trims cash” is an anagram of “Christmas”. Existing automatic anagram generation methods can find possible combinations of words that form an anagram. However, they do not pay much attention to the naturalness of the generated anagrams. In this paper, we show that simple depth-first search can yield natural anagrams when it is combined with modern neural language models. Human evaluation results show that the proposed method can generate significantly more natural anagrams than baseline methods. D19-1674 10.18653/v1/D19-1674 @@ -7836,7 +7835,7 @@ SubhabrataMukherjee AndrewYates GerhardWeikum - 6412–6417 + 6413–6418 Controversial claims are abundant in online media and discussion forums. A better understanding of such claims requires analyzing them from different perspectives. Stance classification is a necessary step for inferring these perspectives in terms of supporting or opposing the claim. In this work, we present a neural network model for stance classification leveraging BERT representations and augmenting them with a novel consistency constraint. Experiments on the Perspectrum dataset, consisting of claims and users’ perspectives from various debate websites, demonstrate the effectiveness of our approach over state-of-the-art baselines. D19-1675 10.18653/v1/D19-1675 @@ -7848,7 +7847,7 @@ Georgios ChristosChouliaras AminaKeldibek MaartenVersteegh - 6418–6423 + 6419–6424 This paper explores different approaches to multilingual intent classification in a low resource setting. Recent advances in multilingual text representations promise cross-lingual transfer for classifiers. We investigate the potential for this transfer in an applied industrial setting and compare to multilingual classification using machine translated text. Our results show that while the recently developed methods show promise, practical application calls for a combination of techniques for useful results. D19-1676 10.18653/v1/D19-1676 @@ -7859,7 +7858,7 @@ Sang-KiKo MarcoCognetta Yo-SubHan - 6424–6430 + 6425–6431 We continue the study of generating semantically correct regular expressions from natural language descriptions (NL). The current state-of-the-art model SemRegex produces regular expressions from NLs by rewarding the reinforced learning based on the semantic (rather than syntactic) equivalence between two regular expressions.
Since the regular expression equivalence problem is PSPACE-complete, we introduce the EQ_Reg model for computing the similarity of two regular expressions using deep neural networks. Our EQ_Reg model essentially softens the equivalence of two regular expressions when used as a reward function. We then propose a new regex generation model, SoftRegex, using the EQ_Reg model, and empirically demonstrate that SoftRegex substantially reduces the training time (by a factor of at least 3.6) and produces state-of-the-art results on three benchmark datasets. D19-1677 10.18653/v1/D19-1677 @@ -7870,7 +7869,7 @@ KaranAggarwal ShafiqJoty JaideepSrivastava - 6431–6436 + 6432–6437 Monitoring patients in ICU is a challenging and high-cost task. Hence, predicting the condition of patients during their ICU stay can help provide better acute care and plan the hospital’s resources. There has been continuous progress in machine learning research for ICU management, and most of this work has focused on using time series signals recorded by ICU instruments. In our work, we show that adding clinical notes as another modality improves the performance of the model for three benchmark tasks: in-hospital mortality prediction, modeling decompensation, and length of stay forecasting that play an important role in ICU management. While the time-series data is measured at regular intervals, doctor notes are charted at irregular times, making it challenging to model them together. We propose a method to model them jointly, achieving considerable improvement across benchmark tasks over the baseline time-series model. D19-1678 10.18653/v1/D19-1678 @@ -7880,7 +7879,7 @@ AdithyaRenduchintala PhilippKoehn JasonEisner - 6437–6442 + 6438–6443 We present a machine foreign-language teacher that modifies text in a student’s native language (L1) by replacing some word tokens with glosses in a foreign language (L2), in such a way that the student can acquire L2 vocabulary simply by reading the resulting macaronic text. The machine teacher uses no supervised data from human students. Instead, to guide the machine teacher’s choice of which words to replace, we equip a cloze language model with a training procedure that can incrementally learn representations for novel words, and use this model as a proxy for the word guessing and learning ability of real human students. We use Mechanical Turk to evaluate two variants of the student model: (i) one that generates a representation for a novel word using only surrounding context and (ii) an extension that also uses the spelling of the novel word. D19-1679 D19-1679.Attachment.pdf @@ -7892,7 +7891,7 @@ Yee SengChan HaolingQiu JoshuaFasching - 6443–6447 + 6444–6448 Solving long-lasting problems such as food insecurity requires a comprehensive understanding of interventions applied by governments and international humanitarian assistance organizations, and their results and consequences. Towards achieving this grand goal, a crucial first step is to extract past interventions and when and where they have been applied, from hundreds of thousands of reports automatically. In this paper, we developed a corpus annotated with interventions to foster research, and developed an information extraction system for extracting interventions and their location and time from text. We demonstrate early, very encouraging results on extracting interventions.
D19-1680 10.18653/v1/D19-1680 @@ -7901,7 +7900,7 @@ <fixed-case>RUN</fixed-case> through the Streets: A New Dataset and Baseline Models for Realistic Urban Navigation TzufPaz-Argaman ReutTsarfaty - 6448–6454 + 6449–6455 Following navigation instructions in natural language (NL) requires a composition of language, action, and knowledge of the environment. Knowledge of the environment may be provided via visual sensors or as a symbolic world representation referred to as a map. Previous work on map-based NL navigation relied on small artificial worlds with a fixed set of entities known in advance. Here we introduce the Realistic Urban Navigation (RUN) task, aimed at interpreting NL navigation instructions based on a real, dense, urban map. Using Amazon Mechanical Turk, we collected a dataset of 2515 instructions aligned with actual routes over three regions of Manhattan. We then empirically study which aspects of a neural architecture are important for the RUN success, and empirically show that entity abstraction, attention over words and worlds, and a constantly updating world-state, significantly contribute to task accuracy. D19-1681 D19-1681.Attachment.zip @@ -7917,7 +7916,7 @@ XiaoxiaoGuo ShiyuChang MoYu - 6455–6460 + 6456–6461 In multi-party chat, it is common for multiple conversations to occur concurrently, leading to intermingled conversation threads in chat logs. In this work, we propose a novel Context-Aware Thread Detection (CATD) model that automatically disentangles these conversation threads. We evaluate our model on four real-world datasets and demonstrate an overall improvement in thread detection accuracy over state-of-the-art benchmarks. D19-1682 D19-1682.Attachment.zip diff --git a/data/xml/R19.xml b/data/xml/R19.xml new file mode 100644 index 0000000000..327aac62c8 --- /dev/null +++ b/data/xml/R19.xml @@ -0,0 +1,1751 @@ + + + + + Natural Language Processing in a Deep Learning World + R19-1 + RuslanMitkov + GaliaAngelova + INCOMA Ltd. +
Varna, Bulgaria
+ September + 2019 + + + R19-1000 + + + Table Structure Recognition Based on Cell Relationship, a Bottom-Up Approach + DarshanAdiga + Shabir AhmadBhat + Muzaffar BashirShah + VivekaVyeth + 1–8 + In this paper, we present a relationship extraction based methodology for table structure recognition in PDF documents. The proposed deep learning-based method takes a bottom-up approach to table recognition in PDF documents. We outline the shortcomings of conventional approaches based on heuristics and machine learning-based top-down approaches. In this work, we explain how the task of table structure recognition can be modeled as a cell relationship extraction task and the importance of the bottom-up approach in recognizing the table cells. We use a Multilayer Feedforward Neural Network for table structure recognition and compare the results of three feature sets. To gauge the performance of the proposed method, we prepared a training dataset using 250 tables in PDF documents, carefully selecting the table structures that are most commonly found in the documents. Our model achieves an overall accuracy of 97.95% and an F1-Score of 92.62% on the test dataset. + R19-1001 + 10.26615/978-954-452-056-4_001 + + + Identification of Good and Bad News on <fixed-case>T</fixed-case>witter + PiushAggarwal + AhmetAker + 9–17 + Social media plays a great role in news dissemination, which includes both good and bad news. However, studies show that news, in general, has a significant impact on our mental stature and that this influence is greater for bad news. An ideal situation would be that we have a tool that can help to filter out the type of news we do not want to consume. In this paper, we provide the basis for such a tool. In our work, we focus on Twitter. We release a manually annotated dataset containing 6,853 tweets from 5 different topical categories. Each tweet is annotated with good and bad labels. We also investigate various machine learning systems and features and evaluate their performance on the newly generated dataset. We also perform a comparative analysis with sentiments showing that sentiment alone is not enough to distinguish between good and bad news. + R19-1002 + 10.26615/978-954-452-056-4_002 + + + Bilingual Low-Resource Neural Machine Translation with Round-Tripping: The Case of <fixed-case>P</fixed-case>ersian-<fixed-case>S</fixed-case>panish + BenyaminAhmadnia + BonnieDorr + 18–24 + The quality of Neural Machine Translation (NMT), as a data-driven approach, massively depends on quantity, quality, and relevance of the training dataset. Such approaches have achieved promising results for bilingually high-resource scenarios but are inadequate for low-resource conditions. This paper describes a round-trip training approach to bilingual low-resource NMT that takes advantage of monolingual datasets to address training data scarcity, thus augmenting translation quality. We conduct detailed experiments on Persian-Spanish as a bilingually low-resource scenario. Experimental results demonstrate that this competitive approach outperforms the baselines. + R19-1003 + 10.26615/978-954-452-056-4_003 + + + Enhancing Phrase-Based Statistical Machine Translation by Learning Phrase Representations Using Long Short-Term Memory Network + BenyaminAhmadnia + BonnieDorr + 25–32 + Phrases play a key role in Machine Translation (MT). In this paper, we apply a Long Short-Term Memory (LSTM) model over conventional Phrase-Based Statistical MT (PBSMT).
The core idea is to use an LSTM encoder-decoder to score the phrase table generated by the PBSMT decoder. Given a source sequence, the encoder and decoder are jointly trained in order to maximize the conditional probability of a target sequence. Analytically, the performance of a PBSMT system is enhanced by using the conditional probabilities of phrase pairs computed by an LSTM encoder-decoder as an additional feature in the existing log-linear model. We compare the performance of the phrase tables in the PBSMT to the performance of the proposed LSTM and observe its positive impact on translation quality. We construct a PBSMT model using the Moses decoder and enrich the Language Model (LM) utilizing an external dataset. We then rank the phrase tables using an LSTM-based encoder-decoder. This method produces a gain of up to 3.14 BLEU score on the test set. + R19-1004 + 10.26615/978-954-452-056-4_004 + + + Automatic <fixed-case>P</fixed-case>ropbank Generation for <fixed-case>T</fixed-case>urkish + KorayAK + Olcay TanerYıldız + 33–41 + Semantic role labeling (SRL) is an important task for understanding natural languages, where the objective is to analyse propositions expressed by the verb and to identify each word that bears a semantic role. It provides an extensive dataset to enhance NLP applications such as information retrieval, machine translation, information extraction, and question answering. However, creating SRL models is difficult. In some languages, it is even infeasible to create SRL models that have predicate-argument structure due to a lack of linguistic resources. In this paper, we present our method to create an automatic Turkish PropBank by exploiting parallel data from the translated sentences of English PropBank. Experiments show that our method gives promising results. + R19-1005 + 10.26615/978-954-452-056-4_005 + + + Multilingual sentence-level bias detection in <fixed-case>W</fixed-case>ikipedia + DesislavaAleksandrova + FrançoisLareau + Pierre AndréMénard + 42–51 + We propose a multilingual method for the extraction of biased sentences from Wikipedia, and use it to create corpora in Bulgarian, French and English. Sifting through the revision history of the articles that at some point had been considered biased and later corrected, we retrieve the last tagged and the first untagged revisions as the before/after snapshots of what was deemed a violation of Wikipedia’s neutral point of view policy. We extract the sentences that were removed or rewritten in that edit. The approach yields sufficient data even in the case of relatively small Wikipedias, such as the Bulgarian one, where 62k articles produced 5k biased sentences. We evaluate our method by manually annotating 520 sentences for Bulgarian and French, and 744 for English. We assess the level of noise and analyze its sources. Finally, we exploit the data with well-known classification methods to detect biased sentences. Code and datasets are hosted at https://github.com/crim-ca/wiki-bias. + R19-1006 + 10.26615/978-954-452-056-4_006 + + + Supervised Morphological Segmentation Using Rich Annotated Lexicon + EbrahimAnsari + ZdeněkŽabokrtský + MohammadMahmoudi + HamidHaghdoost + JonášVidra + 52–61 + Morphological segmentation of words is the process of dividing a word into smaller units called morphemes; it is tricky especially when a morphologically rich or polysynthetic language is under question.
In this work, we designed and evaluated several Recurrent Neural Network (RNN) based models as well as various other machine learning based approaches for the morphological segmentation task. We trained our models using annotated segmentation lexicons. To evaluate the effect of the training data size on our models, we decided to create a large hand-annotated morphologically segmented corpus of Persian words, which is, to the best of our knowledge, the first and the only segmentation lexicon for the Persian language. In the experimental phase, using the hand-annotated Persian lexicon and two smaller similar lexicons for Czech and Finnish languages, we evaluated the effect of the training data size, different hyper-parameter settings as well as different RNN-based models. + R19-1007 + 10.26615/978-954-452-056-4_007 + + + Combining Lexical Substitutes in Neural Word Sense Induction + NikolayArefyev + BorisSheludko + AlexanderPanchenko + 62–70 + Word Sense Induction (WSI) is the task of grouping occurrences of an ambiguous word according to their meaning. In this work, we improve the approach to WSI proposed by Amrami and Goldberg (2018) based on clustering of lexical substitutes for an ambiguous word in a particular context obtained from neural language models. Namely, we propose methods for combining information from the left and right context and similarity to the ambiguous word, which result in generating more accurate substitutes than the original approach. Our simple yet efficient improvement establishes a new state-of-the-art on WSI datasets for two languages. Besides, we show improvements to the original approach on a lexical substitution dataset. + R19-1008 + 10.26615/978-954-452-056-4_008 + + + Detecting Clitics Related Orthographic Errors in <fixed-case>T</fixed-case>urkish + UgurcanArikan + OnurGungor + SuzanUskudarli + 71–76 + For the spell correction task, vocabulary based methods have been replaced with methods that take morphological and grammar rules into account. However, such tools are fairly immature, and, worse, non-existent for many low resource languages. Checking only if a word is well-formed with respect to the morphological rules of a language may produce false negatives due to the ambiguity resulting from the presence of numerous homophonic words. In this work, we propose an approach to detect and correct the “de/da” clitic errors in Turkish text. Our model is a neural sequence tagger trained with a synthetically constructed dataset consisting of positive and negative samples. The model’s performance with this dataset is presented according to different word embedding configurations. The model achieved an F1 score of 86.67% on a synthetically constructed dataset. On a manually curated dataset of challenging samples, the model also proved superior to other spelling correctors, achieving 71% accuracy compared to an accuracy of 34% for the second-best (Google Docs). + R19-1009 + 10.26615/978-954-452-056-4_009 + + + Benchmark Dataset for Propaganda Detection in <fixed-case>C</fixed-case>zech Newspaper Texts + VítBaisa + OndřejHerman + AlesHorak + 77–83 + Propaganda of various pressure groups ranging from big economies to ideological blocks is often presented in a form of objective newspaper texts. However, the real objectivity is here shaded with the support of imbalanced views and distorted attitudes by means of various manipulative stylistic techniques.
In the project of Manipulative Propaganda Techniques in the Age of the Internet, a new resource for the automatic analysis of stylistic mechanisms for influencing readers’ opinion is being developed. In its current version, the resource consists of 7,494 newspaper articles from four selected Czech digital news servers annotated for the presence of specific manipulative techniques. In this paper, we present the current state of the annotations and describe the structure of the dataset in detail. We also offer an evaluation of bag-of-words classification algorithms for the annotated manipulative techniques. + R19-1010 + 10.26615/978-954-452-056-4_010 + + + Diachronic Analysis of Entities by Exploiting <fixed-case>W</fixed-case>ikipedia Page revisions + PierpaoloBasile + AnnalinaCaputo + SeamusLawless + GiovanniSemeraro + 84–91 + In the last few years, the increasing availability of large corpora spanning several time periods has opened new opportunities for the diachronic analysis of language. This type of analysis can bring to light not only linguistic phenomena related to the shift of word meanings over time, but it can also be used to study the impact that societal and cultural trends have on this language change. This paper introduces a new resource for performing the diachronic analysis of named entities built upon Wikipedia page revisions. This resource enables the analysis over time of changes in the relations between entities (concepts), surface forms (words), and the contexts surrounding entities and surface forms, by analysing the whole history of Wikipedia internal links. We provide some useful use cases that prove the impact of this resource on diachronic studies and delineate some possible future usage. + R19-1011 + 10.26615/978-954-452-056-4_011 + + + Using a Lexical Semantic Network for the Ontology Building + NadiaBebeshina-Clairet + SylvieDespres + MathieuLafourcade + 92–101 + Building multilingual ontologies is a hard task as ontologies are often data-rich resources. We introduce an approach which allows exploiting structured lexical semantic knowledge for the ontology building. Given a multilingual lexical semantic (non ontological) resource and an ontology model, it allows mining relevant semantic knowledge and makes the ontology building and enhancement process faster. + R19-1012 + 10.26615/978-954-452-056-4_012 + + + Naive Regularizers for Low-Resource Neural Machine Translation + MeriemBeloucif + Ana ValeriaGonzalez + MarcelBollmann + AndersSøgaard + 102–111 + Neural machine translation models have little inductive bias, which can be a disadvantage in low-resource scenarios. Neural models have to be trained on large amounts of data and have been shown to perform poorly when only limited data is available. We show that using naive regularization methods, based on sentence length, punctuation and word frequencies, to penalize translations that are very different from the input sentences, consistently improves the translation quality across multiple low-resource languages. We experiment with 12 language pairs, varying the training data size between 17k and 230k sentence pairs. Our best regularizer achieves an average increase of 1.5 BLEU score and 1.0 TER score across all the language pairs. For example, we achieve a BLEU score of 26.70 on the IWSLT15 English–Vietnamese translation task simply by using relative differences in punctuation as a regularizer.
+ R19-1013 + 10.26615/978-954-452-056-4_013 + + + Exploring Graph-Algebraic <fixed-case>CCG</fixed-case> Combinators for Syntactic-Semantic <fixed-case>AMR</fixed-case> Parsing + SebastianBeschke + 112–121 + We describe a new approach to semantic parsing based on Combinatory Categorial Grammar (CCG). The grammar’s semantic construction operators are defined in terms of a graph algebra, which allows our system to induce a compact CCG lexicon. We introduce an expectation maximisation algorithm which we use to filter our lexicon down to 2500 lexical templates. Our system achieves a semantic triple (Smatch) precision that is competitive with other CCG-based AMR parsing approaches. + R19-1014 + 10.26615/978-954-452-056-4_014 + + + Quasi Bidirectional Encoder Representations from Transformers for Word Sense Disambiguation + MicheleBevilacqua + RobertoNavigli + 122–131 + While contextualized embeddings have produced performance breakthroughs in many Natural Language Processing (NLP) tasks, Word Sense Disambiguation (WSD) has not benefited from them yet. In this paper, we introduce QBERT, a Transformer-based architecture for contextualized embeddings which makes use of a co-attentive layer to produce more deeply bidirectional representations, better-fitting for the WSD task. As a result, we are able to train a WSD system that beats the state of the art on the concatenation of all evaluation datasets by over 3 points, also outperforming a comparable model using ELMo. + R19-1015 + 10.26615/978-954-452-056-4_015 + + + Evaluating the Consistency of Word Embeddings from Small Data + JelkeBloem + AntskeFokkens + AurélieHerbelot + 132–141 + In this work, we address the evaluation of distributional semantic models trained on smaller, domain-specific texts, specifically, philosophical text. In particular, we inspect the behaviour of models using a pre-trained background space in learning. We propose a measure of consistency which can be used as an evaluation metric when no in-domain gold-standard data is available. This measure simply computes the ability of a model to learn similar embeddings from different parts of some homogeneous data. We show that in spite of being a simple evaluation, consistency actually depends on various combinations of factors, including the nature of the data itself, the model used to train the semantic space, and the frequency of the learnt terms, both in the background space and in the in-domain data of interest. + R19-1016 + 10.26615/978-954-452-056-4_016 + + + Cross-Domain Training for Goal-Oriented Conversational Agents + Alexandra MariaBodîrlău + StefaniaBudulan + TraianRebedea + 142–150 + Goal-Oriented Chatbots in fields such as customer support, providing certain information or general help with bookings or reservations, suffer from low performance partly due to the difficulty of obtaining large domain-specific annotated datasets. Given that the problem is closely related to the domain of the conversational agent and the data belonging to a specific domain is difficult to annotate, there have been some attempts at surpassing these challenges such as unsupervised pre-training or transfer learning between different domains. A more thorough analysis of the transfer learning mechanism is justified by the significant improvement of the results demonstrated in our experiments.
We describe extensive experiments using transfer learning and warm-starting techniques with improvements of more than 5% in relative percentage of success rate in the majority of cases, and up to 10x faster convergence as opposed to training the system without them. + R19-1017 + 10.26615/978-954-452-056-4_017 + + + Learning Sentence Embeddings for Coherence Modelling and Beyond + TannerBohn + YiningHu + JinhangZhang + CharlesLing + 151–160 + We present a novel and effective technique for performing text coherence tasks while facilitating deeper insights into the data. Despite obtaining ever-increasing task performance, modern deep-learning approaches to NLP tasks often only provide users with the final network decision and no additional understanding of the data. In this work, we show that a new type of sentence embedding learned through self-supervision can be applied effectively to text coherence tasks while serving as a window through which deeper understanding of the data can be obtained. To produce these sentence embeddings, we train a recurrent neural network to take individual sentences and predict their location in a document in the form of a distribution over locations. We demonstrate that these embeddings, combined with simple visual heuristics, can be used to achieve performance competitive with state-of-the-art on multiple text coherence tasks, outperforming more complex and specialized approaches. Additionally, we demonstrate that these embeddings can provide insights useful to writers for improving writing quality and informing document structuring, and assisting readers in summarizing and locating information. + R19-1018 + 10.26615/978-954-452-056-4_018 + + + Risk Factors Extraction from Clinical Texts based on Linked Open Data + SvetlaBoytcheva + GaliaAngelova + ZhivkoAngelov + 161–167 + This paper presents experiments in risk factors analysis based on clinical texts enhanced with Linked Open Data (LOD). The idea is to determine whether a patient has risk factors for a specific disease analyzing only his/her outpatient records. A semantic graph of “meta-knowledge” about a disease of interest is constructed, with integrated multilingual terms (labels) of symptoms, risk factors etc. coming from Wikidata, PubMed, Wikipedia and MESH, and linked to clinical records of individual patients via ICD–10 codes. Then a predictive model is trained to foretell whether patients are at risk to develop the disease of interest. The testing was done using outpatient records from a nation-wide repository available for the period 2011-2016. The results show improvement of the overall performance of all tested algorithms (kNN, Naive Bayes, Tree, Logistic regression, ANN), when the clinical texts are enriched with LOD resources. + R19-1019 + 10.26615/978-954-452-056-4_019 + + + Parallel Sentence Retrieval From Comparable Corpora for Biomedical Text Simplification + RémiCardon + NataliaGrabar + 168–177 + Parallel sentences provide semantically similar information which can vary on a given dimension, such as language or register. Parallel sentences with register variation (like expert and non-expert documents) can be exploited for automatic text simplification. The aim of automatic text simplification is to better access and understand a given piece of information. In the biomedical field, simplification may permit patients to understand medical and health texts. Yet, there are currently no such resources available.
We propose to exploit comparable corpora which are distinguished by their registers (specialized and simplified versions) to detect and align parallel sentences. These corpora are in French and are related to the biomedical area. Manually created reference data show 0.76 inter-annotator agreement. Our purpose is to state whether a given pair of specialized and simplified sentences is parallel and can be aligned or not. We treat this task as binary classification (alignment/non-alignment). We perform experiments with a controlled ratio of imbalance and on the highly unbalanced real data. Our results show that the method we present here can be used to automatically generate a corpus of parallel sentences from our comparable corpus. + R19-1020 + 10.26615/978-954-452-056-4_020 + + + Classifying Author Intention for Writer Feedback in Related Work + ArleneCasey + BonnieWebber + DorotaGlowacka + 178–187 + The ability to produce high-quality publishable material is critical to academic success, but many Post-Graduate students struggle to learn to do so. While recent years have seen an increase in tools designed to provide feedback on aspects of writing, one aspect that has so far been neglected is the Related Work section of academic research papers. To address this, we have trained a supervised classifier on a corpus of 94 Related Work sections and evaluated it against a manually annotated gold standard. The classifier uses novel features pertaining to citation types and co-reference, along with patterns found from studying Related Works. We show that these novel features contribute to classifier performance with performance being favourable compared to other similar works that classify author intentions and consider feedback for academic writing. + R19-1021 + 10.26615/978-954-452-056-4_021 + + + Sparse Victory – A Large Scale Systematic Comparison of count-based and prediction-based vectorizers for text classification + RupakChakraborty + AshimaElhence + KapilArora + 188–197 + In this paper we study the performance of several text vectorization algorithms on a diverse collection of 73 publicly available datasets. Traditional sparse vectorizers like Tf-Idf and Feature Hashing have been systematically compared with the latest state-of-the-art neural word embeddings like Word2Vec, GloVe, FastText and character embeddings like ELMo, Flair. We have carried out an extensive analysis of the performance of these vectorizers across different dimensions like classification metrics (i.e., precision, recall, accuracy), dataset size, and imbalanced data (in terms of the distribution of the number of class labels). Our experiments reveal that the sparse vectorizers beat the neural word and character embedding models on 61 of the 73 datasets by an average margin of 3-5% (in terms of macro f1 score) and this performance is consistent across the different dimensions of comparison. + R19-1022 + 10.26615/978-954-452-056-4_022 + + + A Fine-Grained Annotated Multi-Dialectal <fixed-case>A</fixed-case>rabic Corpus + AnisCharfi + WajdiZaghouani + Syed HassanMehdi + EsraaMohamed + 198–204 + We present ARAP-Tweet 2.0, a corpus of 5 million dialectal Arabic tweets and 50 million words of about 3000 Twitter users from 17 Arab countries. Compared to the first version, the new corpus has significant improvements in terms of the data volume and the annotation quality. It is fully balanced with respect to dialect, gender, and three age groups: under 25 years, between 25 and 34, and 35 years and above.
This paper describes the process of creating the corpus, starting from gathering the dialectal phrases to find the users, to annotating their accounts and retrieving their tweets. We also report on the evaluation of the annotation quality using the inter-annotator agreement measures which were applied to the whole corpus and not just a subset. The obtained results were substantial, with average Cohen’s Kappa values of 0.99, 0.92, and 0.88 for the annotation of gender, dialect, and age respectively. We also discuss some challenges encountered when developing this corpus. + R19-1023 + 10.26615/978-954-452-056-4_023 + + + Personality-dependent Neural Text Summarization + PabloCosta + IvandréParaboni + 205–212 + In Natural Language Generation systems, personalization strategies - i.e., the use of information about a target author to generate text that (more) closely resembles human-produced language - have long been applied to improve results. The present work addresses one such strategy - namely, the use of Big Five personality information about the target author - applied to the case of abstractive text summarization using neural sequence-to-sequence models. Initial results suggest that having access to personality information does lead to more accurate (or human-like) text summaries, and paves the way for more robust systems of this kind. + R19-1024 + 10.26615/978-954-452-056-4_024 + + + Self-Adaptation for Unsupervised Domain Adaptation + XiaCui + DanushkaBollegala + 213–222 + Lack of labelled data in the target domain for training is a common problem in domain adaptation. To overcome this problem, we propose a novel unsupervised domain adaptation method that combines projection and self-training based approaches. Using the labelled data from the source domain, we first learn a projection that maximises the distance among the nearest neighbours with opposite labels in the source domain. Next, we project the source domain labelled data using the learnt projection and train a classifier for the target class prediction. We then use the trained classifier to predict pseudo labels for the target domain unlabelled data. Finally, we learn a projection for the target domain as we did for the source domain using the pseudo-labelled target domain data, where we maximise the distance between nearest neighbours having opposite pseudo labels. Experiments on a standard benchmark dataset for domain adaptation show that the proposed method consistently outperforms numerous baselines and returns competitive results comparable to that of SOTA including self-training, tri-training, and neural adaptations. + R19-1025 + 10.26615/978-954-452-056-4_025 + + + Speculation and Negation detection in <fixed-case>F</fixed-case>rench biomedical corpora + ClémentDalloux + VincentClaveau + NataliaGrabar + 223–232 + In this work, we propose to address the detection of negation and speculation, and of their scope, in French biomedical documents. It has indeed been observed that they play an important role and provide crucial clues for other NLP applications. Our methods are based on CRFs and BiLSTM. We reach up to 97.21% and 91.30% F-measure for the detection of negation and speculation cues, respectively, using CRFs. For the computing of scope, we reach up to 90.81% and 86.73% F-measure on negation and speculation, respectively, using BiLSTM-CRF fed with word embeddings.
+ R19-1026 + 10.26615/978-954-452-056-4_026 + + + Porting Multilingual Morphological Resources to <fixed-case>O</fixed-case>nto<fixed-case>L</fixed-case>ex-Lemon + ThierryDeclerck + StefaniaRacioppa + 233–238 + We describe work on porting various morphological resources to the OntoLex-Lemon model. A main objective of this work is to offer a uniform representation of different morphological data sets in order to be able to compare and interlink multilingual resources and to cross-check and interlink or merge the content of morphological resources of one and the same language. The results of our work will be published on the Linguistic Linked Open Data cloud. + R19-1027 + 10.26615/978-954-452-056-4_027 + + + Dependency-Based Self-Attention for Transformer <fixed-case>NMT</fixed-case> + HiroyukiDeguchi + AkihiroTamura + TakashiNinomiya + 239–246 + In this paper, we propose a new Transformer neural machine translation (NMT) model that incorporates dependency relations into self-attention on both source and target sides, dependency-based self-attention. The dependency-based self-attention is trained to attend to the modifiee for each token under constraints based on the dependency relations, inspired by Linguistically-Informed Self-Attention (LISA). While LISA was originally proposed for the Transformer encoder for semantic role labeling, this paper extends LISA to Transformer NMT by masking future information on words in the decoder-side dependency-based self-attention. Additionally, our dependency-based self-attention operates at sub-word units created by byte pair encoding. The experiments show that our model improves by 1.0 BLEU point over the baseline model on the WAT’18 Asian Scientific Paper Excerpt Corpus Japanese-to-English translation task. + R19-1028 + 10.26615/978-954-452-056-4_028 + + + Detecting Toxicity in News Articles: Application to <fixed-case>B</fixed-case>ulgarian + YoanDinkov + IvanKoychev + PreslavNakov + 247–258 + Online media aim to reach an ever bigger audience and to attract an ever longer attention span. This competition creates an environment that rewards sensational, fake, and toxic news. To help limit their spread and impact, we propose and develop a news toxicity detector that can recognize various types of toxic content. While previous research primarily focused on English, here we target Bulgarian. We created a new dataset by crawling a website that for five years has been collecting Bulgarian news articles that were manually categorized into eight toxicity groups. Then we trained a multi-class classifier with nine categories: eight toxic and one non-toxic. We experimented with different representations based on ELMo, BERT, and XLM, as well as with a variety of domain-specific features. Due to the small size of our dataset, we created a separate model for each feature type, and we ultimately combined these models into a meta-classifier. The evaluation results show an accuracy of 59.0% and a macro-F1 score of 39.7%, which represent sizable improvements over the majority-class baseline (Acc=30.3%, macro-F1=5.2%). + R19-1029 + 10.26615/978-954-452-056-4_029 + + + De-Identification of Emails: Pseudonymizing Privacy-Sensitive Data in a <fixed-case>G</fixed-case>erman Email Corpus + ElisabethEder + UlrikeKrieg-Holz + UdoHahn + 259–269 + We deal with the pseudonymization of those stretches of text in emails that might allow real individual persons to be identified. This task is decomposed into two steps. 
First, named entities carrying privacy-sensitive information (e.g., names of persons, locations, phone numbers or dates) are identified, and, second, these privacy-bearing entities are replaced by synthetically generated surrogates (e.g., a person originally named ‘John Doe’ is renamed as ‘Bill Powers’). We describe a system architecture for surrogate generation and evaluate our approach on CodeAlltag, a German email corpus. + R19-1030 + 10.26615/978-954-452-056-4_030 + + + Lexical Quantile-Based Text Complexity Measure + MaksimEremeev + KonstantinVorontsov + 270–275 + This paper introduces a new approach to estimating text document complexity. Common readability indices are based on the average length of sentences and words. In contrast to these methods, we propose to count the number of rare words occurring abnormally often in the document. We use a reference corpus of texts and the quantile approach in order to determine which words are rare, and which frequencies are abnormal. We construct a general text complexity model, which can be adjusted for the specific task, and introduce two special models. The experimental design is based on a set of thematically similar pairs of Wikipedia articles, labeled using crowdsourcing. The experiments demonstrate the competitiveness of the proposed approach. + R19-1031 + 10.26615/978-954-452-056-4_031 + + + Demo Application for <fixed-case>LETO</fixed-case>: Learning Engine Through Ontologies + SuilanEstevez-Velarde + AndrésMontoyo + YudivianAlmeida-Cruz + YoanGutiérrez + AlejandroPiad-Morffis + RafaelMuñoz + 276–284 + The massive amount of multi-formatted information available on the Web necessitates the design of software systems that leverage this information to obtain knowledge that is valid and useful. The main challenge is to discover relevant information and continuously update, enrich and integrate knowledge from various sources of structured and unstructured data. This paper presents the Learning Engine Through Ontologies (LETO) framework, an architecture for the continuous and incremental discovery of knowledge from multiple sources of unstructured and structured data. We justify the main design decision behind LETO’s architecture and evaluate the framework’s feasibility using the Internet Movie Data Base (IMDB) and Twitter as a practical application. + R19-1032 + 10.26615/978-954-452-056-4_032 + + + Sentence Simplification for Semantic Role Labelling and Information Extraction + RichardEvans + ConstantinOrasan + 285–294 + In this paper, we report on the extrinsic evaluation of an automatic sentence simplification method with respect to two NLP tasks: semantic role labelling (SRL) and information extraction (IE). The paper begins with our observation of challenges in the intrinsic evaluation of sentence simplification systems, which motivates the use of extrinsic evaluation of these systems with respect to other NLP tasks. We describe the two NLP systems and the test data used in the extrinsic evaluation, and present arguments and evidence motivating the integration of a sentence simplification step as a means of improving the accuracy of these systems. Our evaluation reveals that their performance is improved by the simplification step: the SRL system is better able to assign semantic roles to the majority of the arguments of verbs and the IE system is better able to identify fillers for all IE template slots. 
+ R19-1033 + 10.26615/978-954-452-056-4_033 + + + <fixed-case>O</fixed-case>llo<fixed-case>B</fixed-case>ot - Towards A Text-Based <fixed-case>A</fixed-case>rabic Health Conversational Agent: Evaluation and Results + AhmedFadhil + AhmedAbuRa’ed + 295–303 + We introduce OlloBot, an Arabic conversational agent that assists physicians and supports patients with the care process. It doesn’t replace the physicians; instead, it provides health tracking and support and assists physicians with care delivery through a conversation medium. The current model comprises healthy diet, physical activity and mental health, in addition to food logging. Not only does OlloBot track users’ daily food, it also offers useful tips for healthier living. We will discuss the design, development and testing of OlloBot, and highlight the findings and limitations that arose from the testing. + R19-1034 + 10.26615/978-954-452-056-4_034 + + + Developing the Old <fixed-case>T</fixed-case>ibetan Treebank + ChristianFaggionato + MariekeMeelen + 304–312 + This paper presents a full procedure for the development of a segmented, POS-tagged and chunk-parsed corpus of Old Tibetan. As an extremely low-resource language, Old Tibetan poses non-trivial problems in every step towards the development of a searchable treebank. We demonstrate, however, that a carefully developed, semi-supervised method of optimising and extending existing tools for Classical Tibetan, as well as creating specific ones for Old Tibetan, can address these issues. We thus also present the very first Tibetan Treebank in a variety of formats to facilitate research in the fields of NLP, historical linguistics and Tibetan Studies. + R19-1035 + 10.26615/978-954-452-056-4_035 + + + Summarizing Legal Rulings: Comparative Experiments + DiegoFeijo + VivianeMoreira + 313–322 + In the context of text summarization, texts in the legal domain have peculiarities related to their length and to their specialized vocabulary. Recent neural network-based approaches can achieve high-quality scores for text summarization. However, these approaches have been used mostly for generating very short abstracts for news articles. Thus, their applicability to the legal domain remains an open issue. In this work, we experimented with ten extractive and four abstractive models on a real dataset of legal rulings. These models were compared with an extractive baseline based on heuristics to select the most relevant parts of the text. Our results show that abstractive approaches significantly outperform extractive methods in terms of ROUGE scores. + R19-1036 + 10.26615/978-954-452-056-4_036 + + + Entropy as a Proxy for Gap Complexity in Open Cloze Tests + MarianoFelice + PaulaButtery + 323–327 + This paper presents a pilot study of entropy as a measure of gap complexity in open cloze tests aimed at learners of English. Entropy is used to quantify the information content in each gap, which can be used to estimate complexity. Our study shows that average gap entropy correlates positively with proficiency levels, while individual gap entropy can capture contextual complexity. To the best of our knowledge, this is the first unsupervised information-theoretical approach to evaluating the quality of cloze tests. 
+ R19-1037 + 10.26615/978-954-452-056-4_037 + + + Song Lyrics Summarization Inspired by Audio Thumbnailing + MichaelFell + ElenaCabrio + FabienGandon + AlainGiboin + 328–337 + Given the peculiar structure of songs, applying generic text summarization methods to lyrics can lead to the generation of highly redundant and incoherent text. In this paper, we propose to enhance state-of-the-art text summarization approaches with a method inspired by audio thumbnailing. Instead of searching for the thumbnail clues in the audio of the song, we identify equivalent clues in the lyrics. We then show how these summaries, which take into account the audio nature of the lyrics, outperform the generic methods according to both an automatic evaluation and human judgments. + R19-1038 + 10.26615/978-954-452-056-4_038 + + + Comparing Automated Methods to Detect Explicit Content in Song Lyrics + MichaelFell + ElenaCabrio + MicheleCorazza + FabienGandon + 338–344 + The Parental Advisory Label (PAL) is a warning label that is placed on audio recordings in recognition of profanity or inappropriate references, with the intention of alerting parents of material potentially unsuitable for children. Since 2015, digital providers – such as iTunes, Spotify, Amazon Music and Deezer – also follow PAL guidelines and tag such tracks as “explicit”. Nowadays, such labelling is carried out mainly manually on a voluntary basis, with the drawbacks of being time-consuming and therefore costly, error-prone and partly subjective. In this paper, we compare automated methods ranging from dictionary-based lookup to state-of-the-art deep neural networks to automatically detect explicit content in English lyrics. We show that more complex models perform only slightly better on this task, and relying on a qualitative analysis of the data, we discuss the inherent hardness and subjectivity of the task. + R19-1039 + 10.26615/978-954-452-056-4_039 + + + Linguistic classification: dealing jointly with irrelevance and inconsistency + LauraFranzoi + AndreaSgarro + AncaDinu + Liviu P.Dinu + 345–352 + In this paper, we present new methods for language classification which put to good use both syntax and fuzzy tools, and are capable of dealing with irrelevant linguistic features (i.e. features which should not contribute to the classification) and even inconsistent features (which do not make sense for specific languages). We introduce a metric distance, based on the generalized Steinhaus transform, which allows one to deal jointly with irrelevance and inconsistency. To evaluate our methods, we test them on a syntactic data set, due to the linguist G. Longobardi and his school. We obtain phylogenetic trees which sometimes outperform the ones obtained by Atkinson and Gray. + R19-1040 + 10.26615/978-954-452-056-4_040 + + + Corpus Lexicography in a Wider Context + ChenGafni + 353–359 + This paper describes a set of tools that offers comprehensive solutions for corpus lexicography. The tools perform a range of tasks, including construction of a corpus lexicon, integration of information from external dictionaries, internal analysis of the lexicon, and lexical analysis of the corpus. The set of tools is particularly useful for creating dictionaries for under-resourced languages. The tools are integrated into a general-purpose software package that includes additional tools for various research tasks, such as linguistic development analysis. Equipped with a user-friendly interface, the described system can be easily incorporated into research in a variety of fields. 
+ R19-1041 + 10.26615/978-954-452-056-4_041 + + + A Universal System for Automatic Text-to-Phonetics Conversion + ChenGafni + 360–366 + This paper describes an automatic text-to-phonetics conversion system. The system was constructed to primarily serve as a research tool. It is implemented in general-purpose linguistic software, which allows it to be incorporated into multifaceted linguistic research in essentially any language. The system currently relies on two mechanisms to generate phonetic transcriptions from texts: (i) importing ready-made phonetic word forms from external dictionaries, and (ii) automatic generation of phonetic word forms based on a set of deterministic linguistic rules. The current paper describes the proposed system and its potential application to linguistic research. + R19-1042 + 10.26615/978-954-452-056-4_042 + + + Two Discourse Tree-Based Approaches to Indexing Answers + BorisGalitsky + DmitryIlvovsky + 367–372 + We explore the anatomy of answers with respect to which text fragments from an answer are worth matching with a question and which should not be matched. We apply Rhetorical Structure Theory to build a discourse tree of an answer and select elementary discourse units that are suitable for indexing. Manual rules for the selection of these discourse units, as well as automated classification based on web search engine mining, are evaluated with respect to improving search accuracy. We form two sets of question-answer pairs for FAQ and community QA search domains and use them for evaluation of the proposed indexing methodology, which delivers up to 16 percent improvement in search recall. + R19-1043 + 10.26615/978-954-452-056-4_043 + + + Discourse-Based Approach to Involvement of Background Knowledge for Question Answering + BorisGalitsky + DmitryIlvovsky + 373–381 + We introduce a concept of a virtual discourse tree to improve question answering (Q/A) recall for complex, multi-sentence questions. Augmenting the discourse tree of an answer with tree fragments obtained from text corpora playing the role of ontology, we obtain on the fly a canonical discourse representation of this answer that is independent of the thought structure of a given author. This mechanism is critical for finding an answer that is relevant not only in terms of question entities but also in terms of inter-relations between these entities in an answer and its style. We evaluate the Q/A system enabled with virtual discourse trees and observe a substantial increase in performance when answering complex questions from sources such as Yahoo! Answers and www.2carpros.com. + R19-1044 + 10.26615/978-954-452-056-4_044 + + + On a Chatbot Providing Virtual Dialogues + BorisGalitsky + DmitryIlvovsky + ElizavetaGoncharova + 382–387 + We present a chatbot that delivers content in the form of virtual dialogues automatically produced from the plain texts that are extracted and selected from the documents. This virtual dialogue content is provided in the form of answers derived from the found and selected documents split into fragments, and questions that are automatically generated for these answers based on the initial text. + R19-1045 + 10.26615/978-954-452-056-4_045 + + + Assessing socioeconomic status of <fixed-case>T</fixed-case>witter users: A survey + DhouhaGHAZOUANI + LuigiLANCIERI + HabibOUNELLI + ChakerJEBARI + 388–398 + Every day, the emotions and opinions of different people across the world are reflected in the form of short messages using microblogging platforms. 
Despite the enormous potential of this data source, the Twitter community is still ambiguous and not fully explored. While there are a huge number of studies examining the possibilities of inferring gender and age, there is hardly any research on socioeconomic status (SES) inference for Twitter users. As socioeconomic status is essential to treating diverse questions linked to human behavior in several fields (sociology, demography, public health, etc.), we conducted a comprehensive literature review of SES studies, inference methods, and metrics. Based on the results reported in the literature, we outline the most critical challenges for researchers. To the best of our knowledge, this paper is the first review that introduces the different aspects of SES inference. Indeed, this article provides benefits for practitioners who aim to process and explore Twitter data for SES inference. + R19-1046 + 10.26615/978-954-452-056-4_046 + + + Divide and Extract – Disentangling Clause Splitting and Proposition Extraction + DarinaGold + TorstenZesch + 399–408 + Proposition extraction from sentences is an important task for information extraction systems. Evaluation of such systems usually conflates two aspects: splitting complex sentences into clauses and the extraction of propositions. It is thus difficult to independently determine the quality of the proposition extraction step. We create a manually annotated proposition dataset from sentences taken from restaurant reviews that distinguishes between clauses that need to be split and those that do not. The resulting proposition evaluation dataset allows us to independently compare the performance of proposition extraction systems on simple and complex clauses. Although performance drastically drops on more complex sentences, we show that the same systems perform best on both simple and complex clauses. Furthermore, we show that specific kinds of subordinate clauses pose difficulties to most systems. + R19-1047 + 10.26615/978-954-452-056-4_047 + + + Sparse Coding in Authorship Attribution for Polish Tweets + PiotrGrzybowski + EwaJuralewicz + MaciejPiasecki + 409–417 + The study explores the application of a simple Convolutional Neural Network to the problem of authorship attribution of tweets written in Polish. In our solution we use two-step compression of tweets using the Byte Pair Encoding algorithm and vectorisation as an input to the distributional model generated for a large corpus of Polish tweets by the word2vec algorithm. Our method achieves results comparable to the state-of-the-art approaches for a similar task on English tweets and shows very good performance in the classification of Polish tweets. We tested the proposed method in relation to the number of authors and tweets per author. We also juxtaposed results for authors with different topic backgrounds against each other. + R19-1048 + 10.26615/978-954-452-056-4_048 + + + Automatic Question Answering for Medical <fixed-case>MCQ</fixed-case>s: Can It go Further than Information Retrieval? + Le AnHa + VictoriaYaneva + 418–422 + We present a novel approach to automatic question answering that does not depend on the performance of an information retrieval (IR) system and does not require that the training data come from the same source as the questions. We evaluate the system performance on a challenging set of university-level medical science multiple-choice questions. 
Best performance is achieved when combining a neural approach with an IR approach, both of which work independently. Unlike previous approaches, the system achieves a statistically significant improvement over the random guess baseline even for questions that are labeled as challenging based on the performance of baseline solvers. + R19-1049 + 10.26615/978-954-452-056-4_049 + + + Self-Knowledge Distillation in Natural Language Processing + SangchulHahn + HeeyoulChoi + 423–430 + Since deep learning became a key player in natural language processing (NLP), many deep learning models have shown remarkable performance in a variety of NLP tasks. Such high performance can be explained by the efficient knowledge representation of deep learning models. Knowledge distillation from pretrained deep networks suggests that we can use more information from the soft target probability to train other neural networks. In this paper, we propose a self-knowledge distillation method, based on the soft target probabilities of the training model itself, where multimode information is distilled from the word embedding space right below the softmax layer. Due to the time complexity, our method approximates the soft target probabilities. In experiments, we applied the proposed method to two different and fundamental NLP tasks: language modeling and neural machine translation. The experimental results show that our proposed method improves performance on the tasks. + R19-1050 + 10.26615/978-954-452-056-4_050 + + + From the Paft to the Fiiture: a Fully Automatic <fixed-case>NMT</fixed-case> and Word Embeddings Method for <fixed-case>OCR</fixed-case> Post-Correction + MikaHämäläinen + SimonHengchen + 431–436 + Many historical corpora suffer from errors introduced by the OCR (optical character recognition) methods used in the digitization process. Correcting these errors manually is a time-consuming process and a great part of the automatic approaches have relied on rules or supervised machine learning. We present a fully automatic unsupervised way of extracting parallel data for training a character-based sequence-to-sequence NMT (neural machine translation) model to conduct OCR error correction. + R19-1051 + 10.26615/978-954-452-056-4_051 + + + Investigating Terminology Translation in Statistical and Neural Machine Translation: A Case Study on <fixed-case>E</fixed-case>nglish-to-<fixed-case>H</fixed-case>indi and <fixed-case>H</fixed-case>indi-to-<fixed-case>E</fixed-case>nglish + RejwanulHaque + MdHasanuzzaman + AndyWay + 437–446 + Terminology translation plays a critical role in domain-specific machine translation (MT). In this paper, we conduct a comparative qualitative evaluation on terminology translation in phrase-based statistical MT (PB-SMT) and neural MT (NMT) in two translation directions: English-to-Hindi and Hindi-to-English. For this, we select a test set from a legal domain corpus and create a gold standard for evaluating terminology translation in MT. We also propose an error typology taking the terminology translation errors into consideration. We evaluate the MT systems’ performance on terminology translation, and demonstrate our findings, unraveling the strengths, weaknesses, and similarities of PB-SMT and NMT in the area of term translation. 
+ R19-1052 + 10.26615/978-954-452-056-4_052 + + + Beyond <fixed-case>E</fixed-case>nglish-Only Reading Comprehension: Experiments in Zero-shot Multilingual Transfer for <fixed-case>B</fixed-case>ulgarian + MomchilHardalov + IvanKoychev + PreslavNakov + 447–459 + Recently, reading comprehension models achieved near-human performance on large-scale datasets such as SQuAD, CoQA, MS MARCO, RACE, etc. This is largely due to the release of pre-trained contextualized representations such as BERT and ELMo, which can be fine-tuned for the target task. Despite those advances and the creation of more challenging datasets, most of the work is still done for English. Here, we study the effectiveness of multilingual BERT fine-tuned on large-scale English datasets for reading comprehension (e.g., for RACE), and we apply it to Bulgarian multiple-choice reading comprehension. We propose a new dataset containing 2,221 questions from matriculation exams for twelfth grade in various subjects (history, biology, geography and philosophy), and 412 additional questions from online quizzes in history. While the quiz authors gave no relevant context, we incorporate knowledge from Wikipedia, retrieving documents matching the combination of the question and each answer option. Moreover, we experiment with different indexing and pre-training strategies. The evaluation results show an accuracy of 42.23%, which is well above the baseline of 24.89%. + R19-1053 + 10.26615/978-954-452-056-4_053 + + + Tweaks and Tricks for Word Embedding Disruptions + AmirHazem + NicolasHernandez + 460–464 + Word embeddings are established as very effective models used in several NLP applications. While they differ in their architecture and training process, they often exhibit similar properties and remain vector space models with continuously-valued dimensions describing the observed data. The complexity resides in the developed strategies for learning the values within each dimensional space. In this paper, we introduce the concept of disruption, which we define as a side effect of the training process of embedding models. Disruptions are viewed as a set of embedding values that are more likely to be noise than effective descriptive features. We show that dealing with the disruption phenomenon is of great benefit to bottom-up sentence embedding representation. By contrasting several in-domain and pre-trained embedding models, we propose two simple but very effective tweaking techniques that yield strong empirical improvements on the textual similarity task. + R19-1054 + 10.26615/978-954-452-056-4_054 + + + Meta-Embedding Sentence Representation for Textual Similarity + AmirHazem + NicolasHernandez + 465–473 + Word embedding models are now widely used in most NLP applications. Despite their effectiveness, there is no clear evidence about the choice of the most appropriate model. It often depends on the nature of the task and on the quality and size of the used data sets. This remains true for bottom-up sentence embedding models. However, no straightforward investigation has been conducted so far. In this paper, we propose a systematic study of the impact of the main word embedding models on sentence representation. By contrasting in-domain and pre-trained embedding models, we show under which conditions they can be jointly used for bottom-up sentence embeddings. Finally, we propose the first bottom-up meta-embedding representation at the sentence level for textual similarity. 
Significant improvements are observed in several tasks including question-to-question similarity, paraphrasing and next utterance ranking. + R19-1055 + 10.26615/978-954-452-056-4_055 + + + Emoji Powered Capsule Network to Detect Type and Target of Offensive Posts in Social Media + HansiHettiarachchi + TharinduRanasinghe + 474–480 + This paper describes a novel research approach to detect the type and target of offensive posts in social media using a capsule network. The input to the network was character embeddings combined with emoji embeddings. The approach was evaluated on all three subtasks in Task 6 - SemEval 2019: OffensEval: Identifying and Categorizing Offensive Language in Social Media. The evaluation also showed that even though capsule networks have not been commonly used in natural language processing tasks, they can outperform existing state-of-the-art solutions for offensive language detection in social media. + R19-1056 + 10.26615/978-954-452-056-4_056 + + + <fixed-case>E</fixed-case>o<fixed-case>ANN</fixed-case>: Lexical Semantic Relation Classification Using an Ensemble of Artificial Neural Networks + RayeheHosseini Pour + MehrnoushShamsfard + 481–486 + Researchers use wordnets as a knowledge base in many natural language processing tasks and applications, such as question answering, textual entailment, discourse classification, and so forth. Lexico-semantic relations among words or concepts are important parts of knowledge encoded in wordnets. As the use of wordnets becomes extensively widespread, extending the existing ones gets more attention. Manual construction and extension of lexico-semantic relations for WordNets or knowledge graphs are very time-consuming. Using automatic relation extraction methods can speed up this process. In this study, we exploit an ensemble of LSTM and convolutional neural networks in a supervised manner to capture lexico-semantic relations which can either be used directly in NLP applications or compose the edges of wordnets. The whole procedure of learning vector space representations of relations is language independent. We used Princeton WordNet 3.1, FarsNet 3.0 (the Persian wordnet), Root09 and EVALution as gold standards to evaluate the predictive performance of our model and the results are comparable across the two languages. Empirical results demonstrate that our model outperforms the state-of-the-art models. + R19-1057 + 10.26615/978-954-452-056-4_057 + + + Opinions Summarization: Aspect Similarity Recognition Relaxes The Constraint of Predefined Aspects + NguyenHuy Tien + LeTung Thanh + NguyenMinh Le + 487–496 + Recently, research in opinions summarization has focused on rating expressions by the aspects and/or sentiments they carry. To extract aspects of an expression, most studies require a predefined list of aspects or at least the number of aspects. Instead of extracting aspects, we rate expressions by aspect similarity recognition (ASR), which evaluates whether two expressions share at least one aspect. This subtask relaxes the limitation of predefining aspects and makes our opinions summarization applicable in domain adaptation. For the ASR subtask, we propose an attention-cell LSTM model, which integrates attention signals into the LSTM gates. According to the experimental results, the attention-cell LSTM works efficiently for learning latent aspects between two sentences in both in-domain and cross-domain settings. 
In addition, the proposed extractive summarization method using ASR shows significant improvements over baselines on the Opinosis corpus. + R19-1058 + 10.26615/978-954-452-056-4_058 + + + Discourse-Aware Hierarchical Attention Network for Extractive Single-Document Summarization + TatsuyaIshigaki + HidetakaKamigaito + HiroyaTakamura + ManabuOkumura + 497–506 + Discourse relations between sentences are often represented as a tree, and the tree structure provides important information for summarizers to create a short and coherent summary. However, current neural network-based summarizers treat the source document as just a sequence of sentences and ignore the tree-like discourse structure inherent in the document. To incorporate the information of a discourse tree structure into the neural network-based summarizers, we propose a discourse-aware neural extractive summarizer which can explicitly take into account the discourse dependency tree structure of the source document. Our discourse-aware summarizer can jointly learn the discourse structure and the salience score of a sentence by using novel hierarchical attention modules, which can be trained on automatically parsed discourse dependency trees. Experimental results showed that our model achieved competitive or better performance against state-of-the-art models in terms of ROUGE scores on the DailyMail dataset. We further conducted manual evaluations. The results showed that our approach also improved the coherence of the output summaries. + R19-1059 + 10.26615/978-954-452-056-4_059 + + + Semi-Supervised Induction of <fixed-case>POS</fixed-case>-Tag Lexicons with Tree Models + MaciejJanicki + 507–515 + We approach the problem of POS tagging of morphologically rich languages in a setting where only a small amount of labeled training data is available. We show that a bigram HMM tagger benefits from re-training on a larger untagged text using Baum-Welch estimation. Most importantly, this estimation can be significantly improved by pre-guessing tags for OOV words based on morphological criteria. We consider two models for this task: a character-based recurrent neural network, which guesses the tag from the string form of the word, and a recently proposed graph-based model of morphological transformations. In the latter, the unknown POS tags can be modeled as latent variables in a way very similar to Hidden Markov Tree models and an analogue of the Forward-Backward algorithm can be formulated, which enables us to compute expected values over unknown taggings. We evaluate both the quality of the induced tag lexicon and its impact on the HMM’s tagging accuracy. In both tasks, the graph-based morphology model performs significantly better than the RNN predictor. This confirms the intuition that morphologically related words provide useful information about an unknown word’s POS tag. + R19-1060 + 10.26615/978-954-452-056-4_060 + + + Word Sense Disambiguation based on Constrained Random Walks in Linked Semantic Networks + ArkadiuszJanz + MaciejPiasecki + 516–525 + Word Sense Disambiguation remains a challenging NLP task. Due to the lack of annotated training data, especially for rare senses, the supervised approaches are usually designed for specific subdomains limited to a narrow subset of identified senses. Recent advances in this area have shown that knowledge-based approaches are more scalable and obtain more promising results in all-words WSD scenarios. 
In this work we present a faster WSD algorithm based on the Monte Carlo approximation of sense probabilities given a context using constrained random walks over linked semantic networks. We show that the local semantic relatedness is mostly sufficient to successfully identify correct senses when an extensive knowledge base and a proper weighting scheme are used. The proposed methods are evaluated on English (SenseEval, SemEval) and Polish (Składnica, KPWr) datasets. + R19-1061 + 10.26615/978-954-452-056-4_061 + + + Classification of Micro-Texts Using Sub-Word Embeddings + MihirJoshi + NurZincir-Heywood + 526–533 + Extracting features and writing styles from short text messages is always a challenge. Short messages, like tweets, do not have enough data to perform statistical authorship attribution. Besides, the vocabulary used in these texts is sometimes improvised or misspelled. Therefore, in this paper, we propose combining four feature extraction techniques, namely character n-grams, word n-grams, Flexible Patterns and a new sub-word embedding using the skip-gram model. Our system uses a Multi-Layer Perceptron to utilize these features from tweets to analyze short text messages. This proposed system achieves 85% accuracy, which is a considerable improvement over previous systems. + R19-1062 + 10.26615/978-954-452-056-4_062 + + + Using Syntax to Resolve <fixed-case>NPE</fixed-case> in <fixed-case>E</fixed-case>nglish + PayalKhullar + AllenAntony + ManishShrivastava + 534–540 + This paper describes a novel, syntax-based system for automatic detection and resolution of Noun Phrase Ellipsis (NPE) in English. The system takes in free input English text, detects the site of nominal elision, and if present, selects potential antecedent candidates. The rules are built using the syntactic information on ellipsis and its antecedent discussed in previous theoretical linguistics literature on NPE. Additionally, we prepare a curated dataset of 337 sentences from well-known, reliable sources, containing positive and negative samples of NPE. We split this dataset into two parts, and use one part to refine our rules and the other to test the performance of our final system. We get an F1-score of 76.47% for detection and 70.27% for NPE resolution on the test set. To the best of our knowledge, ours is the first system that detects and resolves NPE in English. The curated dataset used for this task, albeit small, covers a wide variety of NPE cases and will be made public for future work. + R19-1063 + 10.26615/978-954-452-056-4_063 + + + Is Similarity Visually Grounded? Computational Model of Similarity for the <fixed-case>E</fixed-case>stonian language + ClaudiaKittask + EduardBarbu + 541–549 + Researchers in Computational Linguistics build models of similarity and test them against human judgments. Although there are many empirical studies of the computational models of similarity for the English language, similarity for other languages is less explored. In this study we are chiefly interested in two aspects. In the first place we want to know how much of human similarity is grounded in visual perception. To answer this question two neural computer vision models are used and their correlation with the human-derived similarity scores is computed. In the second place we investigate whether language influences the similarity computation. 
To this purpose, diverse computational models trained on Estonian resources are evaluated against human judgments. + R19-1064 + 10.26615/978-954-452-056-4_064 + + + Language-Agnostic <fixed-case>T</fixed-case>witter-Bot Detection + JürgenKnauth + 550–558 + In this paper we address the problem of detecting Twitter bots. We analyze a dataset of 8385 Twitter accounts and their tweets, consisting of both humans and different kinds of bots. We use this data to train machine learning classifiers that distinguish between real and bot accounts. We identify features that are easy to extract while still providing good results. We analyze different feature groups based on account-specific, tweet-specific and behavior-specific features and measure their performance compared to other state-of-the-art bot detection methods. For easy future portability of our work we focus on language-agnostic features. With AdaBoost, the best performing classifier, we achieve an accuracy of 0.988 and an AUC of 0.995. As the creation of good training data in machine learning is often difficult - especially in the domain of Twitter bot detection - we additionally analyze to what extent smaller amounts of training data lead to useful results by reviewing cross-validated learning curves. Our results indicate that using few but expressive features already has a good practical benefit for bot detection, especially if only a small amount of training data is available. + R19-1065 + 10.26615/978-954-452-056-4_065 + + + Multi-level analysis and recognition of the text sentiment on the example of consumer opinions + JanKocoń + MonikaZaśko-Zielińska + PiotrMiłkowski + 559–567 + In this article, we present a novel multi-domain dataset of Polish text reviews, annotated with sentiment at different levels: sentences and whole documents. The annotation was made by linguists in a 2+1 scheme (with inter-annotator agreement analysis). We present a preliminary approach to the classification of labelled data using logistic regression, bidirectional long short-term memory recurrent neural networks (BiLSTM) and bidirectional encoder representations from transformers (BERT). + R19-1066 + 10.26615/978-954-452-056-4_066 + + + A Qualitative Evaluation Framework for Paraphrase Identification + VenelinKovatchev + M. AntoniaMarti + MariaSalamo + JavierBeltran + 568–577 + In this paper, we present a new approach for the evaluation, error analysis, and interpretation of supervised and unsupervised Paraphrase Identification (PI) systems. Our evaluation framework makes use of a PI corpus annotated with linguistic phenomena to provide a better understanding and interpretation of the performance of various PI systems. Our approach allows for a qualitative evaluation and comparison of the PI models using human-interpretable categories. It does not require modification of the training objective of the systems and does not place an additional burden on the developers. We replicate several popular supervised and unsupervised PI systems. Using our evaluation framework we show that: 1) Each system performs differently with respect to a set of linguistic phenomena and makes qualitatively different kinds of errors; 2) Some linguistic phenomena are more challenging than others across all systems. 
+ R19-1067 + 10.26615/978-954-452-056-4_067 + + + Study on Unsupervised Statistical Machine Translation for Backtranslation + AnushKumar + Nihal V.Nayak + AdityaChandra + Mydhili K.Nair + 578–582 + Machine Translation systems have drastically improved over the years for several language pairs. Monolingual data is often used to generate synthetic sentences to augment the training data, which has been shown to improve the performance of machine translation models. In our paper, we make use of an Unsupervised Statistical Machine Translation (USMT) system to generate synthetic sentences. Our study compares the performance improvements in a Neural Machine Translation model when using synthetic sentences from supervised and unsupervised Machine Translation models. Our approach of using USMT for backtranslation shows promise in low-resource conditions and achieves an improvement of 3.2 BLEU points over the Neural Machine Translation model. + R19-1068 + 10.26615/978-954-452-056-4_068 + + + Towards Functionally Similar Corpus Resources for Translation + MariaKunilovskaya + SergeSharoff + 583–592 + The paper describes a computational approach to producing functionally comparable monolingual corpus resources for translation studies and contrastive analysis. We exploit a text-external approach, based on a set of Functional Text Dimensions to model text functions, so that each text can be represented as a vector in a multidimensional space of text functions. These vectors can be used to find reasonably homogeneous subsets of functionally similar texts across different corpora. Our models for predicting text functions are based on recurrent neural networks and traditional feature-based machine learning approaches. In addition to using the categories of the British National Corpus as our test case, we investigated the functional comparability of the English parts from the two parallel corpora: CroCo (English-German) and RusLTC (English-Russian) and applied our models to define functionally similar clusters in them. Our results show that the Functional Text Dimensions provide a useful description for text categories, while allowing a more flexible representation for texts with hybrid functions. + R19-1069 + 10.26615/978-954-452-056-4_069 + + + Question Similarity in Community Question Answering: A Systematic Exploration of Preprocessing Methods and Models + FlorianKunneman + Thiago CastroFerreira + EmielKrahmer + Antalvan den Bosch + 593–601 + Community Question Answering forums are popular among Internet users, and a basic problem they encounter is trying to find out if their question has already been posed before. To address this issue, NLP researchers have developed methods to automatically detect question similarity, which was one of the shared tasks in SemEval. The best performing systems for this task made use of Syntactic Tree Kernels or the SoftCosine metric. However, it remains unclear why these methods seem to work, whether their performance can be improved by better preprocessing methods and what kinds of errors they (and other methods) make. In this paper, we therefore systematically combine and compare these two approaches with the more traditional BM25 and translation-based models. Moreover, we analyze the impact of preprocessing steps (lowercasing, suppression of punctuation and stop word removal) and word meaning similarity based on different distributions (word translation probability, Word2Vec, fastText and ELMo) on the performance of the task. 
We conduct an error analysis to gain insight into the differences in performance between the system set-ups. The implementation is made publicly available from https://github.com/fkunneman/DiscoSumo/tree/master/ranlp. + R19-1070 + 10.26615/978-954-452-056-4_070 + + + A Classification-Based Approach to Cognate Detection Combining Orthographic and Semantic Similarity Information + SofieLabat + ElsLefever + 602–610 + This paper presents proof-of-concept experiments for combining orthographic and semantic information to distinguish cognates from non-cognates. To this end, a context-independent gold standard is developed by manually labelling English-Dutch pairs of cognates and false friends in bilingual term lists. These annotated cognate pairs are then used to train and evaluate a supervised binary classification system for the automatic detection of cognates. Two types of information sources are incorporated in the classifier: fifteen string similarity metrics capture form similarity between source and target words, while word embeddings model semantic similarity between the words. The experimental results show that even though the system already achieves good results by only incorporating orthographic information, the performance further improves by including semantic information in the form of embeddings. + R19-1071 + 10.26615/978-954-452-056-4_071 + + + Resolving Pronouns for a Resource-Poor Language, <fixed-case>M</fixed-case>alayalam Using Resource-Rich Language, <fixed-case>T</fixed-case>amil. + SobhaLalitha Devi + 611–618 + In this paper we describe in detail how a resource-rich language can be used for resolving pronouns in a resource-poor language. The source language, which is the resource-rich language in this study, is Tamil, and the resource-poor language is Malayalam, both belonging to the same language family, Dravidian. The pronominal resolution developed for Tamil uses CRFs. Our approach is to leverage the Tamil language model to test Malayalam data, and the processing required for Malayalam data is detailed. The similarity at the syntactic level between the languages is exploited in identifying the features for developing the Tamil language model. The word form or the lexical item is not considered as a feature for training the CRFs. Evaluation on Malayalam Wikipedia data shows that our approach is correct and the results, though not as good as for Tamil, are comparable. + R19-1072 + 10.26615/978-954-452-056-4_072 + + + Semantic Role Labeling with Pretrained Language Models for Known and Unknown Predicates + DaniilLarionov + ArtemShelmanov + ElenaChistova + IvanSmirnov + 619–628 + We build the first full pipeline for semantic role labelling of Russian texts. The pipeline implements predicate identification, argument extraction, argument classification (labeling), and global scoring via integer linear programming. We train supervised neural network models for argument classification using a semantically annotated Russian corpus – FrameBank. However, we note that this resource provides annotations only for a very limited set of predicates. We combat the problem of annotation scarcity by introducing two models that rely on different sets of features: one for “known” predicates that are present in the training set and one for “unknown” predicates that are not. We show that the model for “unknown” predicates can alleviate the lack of annotation by using pretrained embeddings. 
We perform experiments with various types of embeddings (word2vec, FastText, ELMo, BERT), including the ones generated by deep pretrained language models, and show that embeddings generated by deep pretrained language models are superior to classical shallow embeddings for argument classification of both “known” and “unknown” predicates. + R19-1073 + 10.26615/978-954-452-056-4_073 + + + Structural Approach to Enhancing <fixed-case>W</fixed-case>ord<fixed-case>N</fixed-case>et with Conceptual Frame Semantics + SvetlozaraLeseva + IvelinaStoyanova + 629–637 + This paper outlines procedures for enhancing WordNet with conceptual information from FrameNet. The mapping of the two resources is non-trivial. We define a number of techniques for the validation of the consistency of the mapping and the extension of its coverage, which make use of the structure of both resources and the systematic relations between synsets in WordNet and between frames in FrameNet, as well as between synsets and frames. We present a case study on causativity, a relation which provides enhancement complementary to the one using hierarchical relations, by means of linking large parts of the lexicon in a systematic way. We show how consistency checks and denser relations may be implemented on the basis of this relation. We then propose new frames based on causative-inchoative correspondences and in conclusion touch on the possibilities for defining new frames based on the types of specialisation that take place from parent to child synset. + R19-1074 + 10.26615/978-954-452-056-4_074 + + + Compositional Hyponymy with Positive Operators + MarthaLewis + 638–647 + Language is used to describe concepts, and many of these concepts are hierarchical. Moreover, this hierarchy should be compatible with forming phrases and sentences. We use linear-algebraic methods that allow us to encode words as collections of vectors. The representations we use have an ordering, related to subspace inclusion, which we interpret as modelling hierarchical information. The word representations built can be understood within a compositional distributional semantic framework, providing methods for composing words to form phrase- and sentence-level representations. We show that the resulting representations give competitive results on both word-level hyponymy and sentence-level entailment datasets. + R19-1075 + 10.26615/978-954-452-056-4_075 + + + The Impact of Semantic Linguistic Features in Relation Extraction: A Logical Relational Learning Approach + RinaldoLima + BernardEspinasse + FredericoFreitas + 648–654 + Relation Extraction (RE) consists in detecting and classifying semantic relations between entities in a sentence. The vast majority of state-of-the-art RE systems rely on morphosyntactic features and supervised machine learning algorithms. This paper tries to answer important questions concerning both the impact of semantic-based features and the integration of external linguistic knowledge resources on RE performance. For that, an RE system based on a logical and relational learning algorithm was used and evaluated on three reference datasets from two distinct domains. The yielded results confirm that the classifiers induced using the proposed richer feature set outperformed the classifiers built with morphosyntactic features by 4% (F1-measure) on average. 
+ R19-1076 + 10.26615/978-954-452-056-4_076 + + + Detecting Anorexia in <fixed-case>S</fixed-case>panish Tweets + PilarLópez Úbeda + Flor MiriamPlaza del Arco + Manuel CarlosDíaz Galiano + L. AlfonsoUrena Lopez + MaiteMartin + 655–663 + Mental health is one of the main concerns of today’s society. Early detection of symptoms can greatly help people with mental disorders. People are using social networks more and more to express emotions, sentiments and mental states. Thus, the treatment of this information using NLP technologies can be applied to the automatic detection of mental problems such as eating disorders. However, the first step to solving the problem should be to provide a corpus in order to evaluate our systems. In this paper, we specifically focus on detecting anorexia messages on Twitter. Firstly, we have generated a new corpus of tweets extracted from different accounts, including anorexia and non-anorexia messages in Spanish. The corpus is called SAD: Spanish Anorexia Detection corpus. In order to validate the effectiveness of the SAD corpus, we also propose several machine learning approaches for automatically detecting anorexia symptoms in the corpus. The good results obtained show that the application of textual classification methods is a promising option for developing this kind of system, demonstrating that these tools could be used by professionals to help in the early detection of mental problems. + R19-1077 + 10.26615/978-954-452-056-4_077 + + + A type-theoretical reduction of morphological, syntactic and semantic compositionality to a single level of description + ErkkiLuuk + 664–673 + The paper presents NLC, a new formalism for modeling natural language (NL) compositionality. NLC is a functional type system (i.e. one based on mathematical functions and their types). Its main features include a close correspondence with NL and an integrated modeling of morphological, syntactic and semantic compositionality. The integration is effected with a subclass of compound types (types which are syntactic compounds of multiple types or their terms), while the correspondence is sought with function types and polymorphism. The paper also presents an implementation of NLC in Coq. The implementation formalizes a diverse fragment of NL, with NLC expressions type checking and failing to type check in exactly the same ways that NL expressions pass and fail their acceptability tests. Among other things, this demonstrates the possibility of reducing morphological, syntactic and semantic compositionality to a single level of description. The level is tentatively identified with semantic compositionality — an interpretation which, besides being supported by results from language processing, has interesting implications for NL structure and modeling. + R19-1078 + 10.26615/978-954-452-056-4_078 + + + v-trel: Vocabulary Trainer for Tracing Word Relations - An Implicit Crowdsourcing Approach + VerenaLyding + ChristosRodosthenous + FedericoSangati + Umairul Hassan + LionelNicolas + AlexanderKönig + JolitaHorbacauskiene + AnisiaKatinskaia + 674–683 + In this paper, we present our work on developing a vocabulary trainer that uses exercises generated from language resources such as ConceptNet and crowdsources the responses of the learners to enrich the language resource. We performed an empirical evaluation of our approach with 60 non-native speakers over two days, which shows that new entries to expand ConceptNet can efficiently be gathered through vocabulary exercises on word relations. 
We also report on the feedback gathered from the users and an expert in language teaching, and discuss the potential of the vocabulary trainer application from the user and language learner perspective. The feedback suggests that v-trel has educational potential, while in its current state some shortcomings could be identified. + R19-1079 + 10.26615/978-954-452-056-4_079 + + + Jointly Learning Author and Annotated Character N-gram Embeddings: A Case Study in Literary Text + SurajMaharjan + DeepthiMave + PrashaShrestha + ManuelMontes + Fabio A.González + ThamarSolorio + 684–692 + An author’s way of presenting a story through his/her writing style has a great impact on whether the story will be liked by readers or not. In this paper, we learn representations for authors of literary texts together with representations for character n-grams annotated with their functional roles. We train a neural character n-gram based language model using an external corpus of literary texts and transfer learned representations for use in downstream tasks. We show that augmenting the knowledge from external works of authors produces results competitive with other style-based methods for book likability prediction, genre classification, and authorship attribution. + R19-1080 + 10.26615/978-954-452-056-4_080 + + + Generating Challenge Datasets for Task-Oriented Conversational Agents through Self-Play + SourabhMajumdar + Serra SinemTekiroglu + MarcoGuerini + 693–702 + End-to-end neural approaches are becoming increasingly common in conversational scenarios due to their promising performances when provided with a sufficient amount of data. In this paper, we present a novel methodology to address the interpretability of neural approaches in such scenarios by creating challenge datasets using dialogue self-play over multiple tasks/intents. Dialogue self-play allows generating large amounts of synthetic data; by taking advantage of the complete control over the generation process, we show how neural approaches can be evaluated in terms of unseen dialogue patterns. We propose several out-of-pattern test cases, each of which introduces a natural and unexpected user utterance phenomenon. As a proof of concept, we built a single and a multiple memory network, and show that these two architectures have diverse performances depending on the peculiar dialogue patterns. + R19-1081 + 10.26615/978-954-452-056-4_081 + + + Sentiment Polarity Detection in <fixed-case>A</fixed-case>zerbaijani Social News Articles + SevdaMammadli + ShamsaddinHuseynov + HuseynAlkaramov + UlviyyaJafarli + UmidSuleymanov + SamirRustamov + 703–710 + The text classification field of natural language processing has been experiencing remarkable growth in recent years. In particular, sentiment analysis has received considerable attention from both industry and the research community. However, only a few research examples exist for the Azerbaijani language. The main objective of this research is to apply various machine learning algorithms for determining the sentiment of news articles in the Azerbaijani language. Approximately 30,000 social news articles have been collected from online news sites and labeled manually as negative or positive according to their sentiment categories. Initially, text preprocessing was applied to the data in order to eliminate the noise. Secondly, to convert the text to a more machine-readable form, a BOW (bag-of-words) model has been applied. 
More specifically, two methodologies of BOW model, which are tf-idf and frequency based model have been used as vectorization methods. Additionally, SVM, Random Forest, and Naive Bayes algorithms have been applied as the classification algorithms, and their combinations with two vectorization approaches have been tested and analyzed. Experimental results indicate that SVM outperforms other classification algorithms. + R19-1082 + 10.26615/978-954-452-056-4_082 + + + <fixed-case>I</fixed-case>nforex — a Collaborative Systemfor Text Corpora Annotation and Analysis Goes Open + MichałMarcińczuk + MarcinOleksy + 711–719 + In the paper we present the latest changes introduce to Inforex — a web-based system for qualitative and collaborative text corpora annotation and analysis. One of the most important news is the release of source codes. Now the system is available on the GitHub repository (https://github.com/CLARIN-PL/Inforex) as an open source project. The system can be easily setup and run in a Docker container what simplifies the installation process. The major improvements include: semi-automatic text annotation, multilingual text preprocessing using CLARIN-PL web services, morphological tagging of XML documents, improved editor for annotation attribute, batch annotation attribute editor, morphological disambiguation, extended word sense annotation. This paper contains a brief description of the mentioned improvements. We also present two use cases in which various Inforex features were used and tested in real-life projects. + R19-1083 + 10.26615/978-954-452-056-4_083 + + + Semantic Language Model for <fixed-case>T</fixed-case>unisian Dialect + AbirMASMOUDI + RimLaatar + Mariemellouze + lamiahadrich belguith + 720–729 + In this paper, we describe the process of creating a statistical Language Model (LM) for the Tunisian Dialect. Indeed, this work is part of the realization of Automatic Speech Recognition (ASR) system for the Tunisian Railway Transport Network. Since our eld of work has been limited, there are several words with similar behaviors (semantic for example) but they do not have the same appearance probability; their class groupings will therefore be possible. For these reasons, we propose to build an n-class LM that is based mainly on the integration of purely semantic data. Indeed, each class represents an abstraction of similar labels. In order to improve the sequence labeling task, we proposed to use a discriminative algorithm based on the Conditional Random Field (CRF) model. To better judge our choice of creating an n-class word model, we compared the created model with the 3-gram type model on the same test corpus of evaluation. Additionally, to assess the impact of using the CRF model to perform the semantic labelling task in order to construct semantic classes, we compared the n-class created model with using the CRF in the semantic labelling task and the n- class model without using the CRF in the semantic labelling task. The drawn comparison of the predictive power of the n-class model obtained by applying the CRF model in the semantic labelling is that it is better than the other two models presenting the highest value of its perplexity. 
+ R19-1084
+ 10.26615/978-954-452-056-4_084
+
+
+ Automatic diacritization of <fixed-case>T</fixed-case>unisian dialect text using Recurrent Neural Network
+ AbirMasmoudi
+ MariemEllouze
+ LamiaHadrich Belguith
+ 730–739
+ The absence of diacritical marks in Arabic texts generally leads to morphological, syntactic and semantic ambiguities. This can be more blatant when one deals with under-resourced languages, such as the Tunisian dialect, which suffers from the unavailability of basic tools and linguistic resources, like a sufficient amount of corpora, multilingual dictionaries, and morphological and syntactic analyzers. Thus, processing this language faces greater challenges due to the lack of these resources. The automatic diacritization of MSA text is one of the various complex problems that can be solved by deep neural networks today. Since the Tunisian dialect is an under-resourced variety of MSA and since there is a lot of resemblance between both languages, we suggest investigating a recurrent neural network (RNN) for this dialect diacritization problem. This model will be compared to our previous CRF and SMT models (CITATION) based on the same dialect corpus. We can experimentally show that our model can achieve better outcomes (DER of 10.72%), as compared to the two models CRF (DER of 20.25%) and SMT (DER of 33.15%).
+ R19-1085
+ 10.26615/978-954-452-056-4_085
+
+
+ Comparing <fixed-case>MT</fixed-case> Approaches for Text Normalization
+ ClaudiaMatos Veliz
+ OrpheeDe Clercq
+ VeroniqueHoste
+ 740–749
+ One of the main characteristics of social media data is the use of non-standard language. Since NLP tools have been trained on traditional text material, their performance drops when applied to social media data. One way to overcome this is to first perform text normalization. In this work, we apply text normalization to noisy English and Dutch text coming from different social media genres: text messages, message board posts and tweets. We consider the normalization task as a Machine Translation problem and test the two leading paradigms: statistical and neural machine translation. For SMT we explore the added value of varying background corpora for training the language model. For NMT we have a look at data augmentation since the parallel datasets we are working with are limited in size. Our results reveal that when relying on SMT to perform the normalization it is beneficial to use a background corpus that is close to the genre you are normalizing. Regarding NMT, we find that the translations - or normalizations - coming out of this model are far from perfect and that for a low-resource language like Dutch adding additional training data works better than artificially augmenting the data.
+ R19-1086
+ 10.26615/978-954-452-056-4_086
+
+
+ Sentiment and Emotion Based Representations for Fake Reviews Detection
+ AlimuddinMelleng
+ AnnaJurek-Loughrey
+ DeepakP
+ 750–757
+ Fake reviews are increasingly prevalent across the Internet. They can be unethical as well as harmful. They can affect businesses and mislead individual customers. As opinions on the Web are increasingly used, the detection of fake reviews has become more and more critical. In this study, we explore the effectiveness of sentiment- and emotion-based representations for the task of building machine learning models for fake review detection. We perform empirical studies over three real-world datasets and demonstrate that improved data representation can be achieved by combining sentiment and emotion extraction methods, as well as by performing sentiment and emotion analysis on a part-by-part basis by segmenting the reviews.
+ R19-1087
+ 10.26615/978-954-452-056-4_087
+
+
+ Turning silver into gold: error-focused corpus reannotation with active learning
+ Pierre AndréMénard
+ AntoineMougeot
+ 758–767
+ While high quality gold standard annotated corpora are crucial for most tasks in natural language processing, many annotated corpora published in recent years, created by annotators or tools, contain noisy annotations. These corpora can be viewed as more silver than gold standards, even if they are used in evaluation campaigns or to compare systems’ performances. As upgrading a silver corpus to gold level is still a challenge, we explore the application of active learning techniques to detect errors using four datasets designed for document classification and part-of-speech tagging. Our results show that the proposed method for the seeding step improves the chance of finding incorrect annotations by a factor of 2.73 when compared to random selection, a 14.71% increase over the baseline methods. Our query method provides an increase in error detection precision on average by a factor of 1.78 against random selection, an increase of 61.82% compared to other query approaches.
+ R19-1088
+ 10.26615/978-954-452-056-4_088
+
+
+ Community Perspective on Replicability in Natural Language Processing
+ MargotMieskes
+ KarënFort
+ AurélieNévéol
+ CyrilGrouin
+ KevinCohen
+ 768–775
+ With recent efforts in drawing attention to the task of replicating and/or reproducing results, for example in the context of COLING 2018 and various LREC workshops, the question arises how the NLP community views the topic of replicability in general. Using a survey, in which we involve members of the NLP community, we investigate how our community perceives this topic, its relevance and options for improvement. Based on over two hundred participants, the survey results confirm earlier observations that successful reproducibility requires more than having access to code and data. Additionally, the results show that the topic has to be tackled from the authors’, reviewers’ and community’s side.
+ R19-1089
+ 10.26615/978-954-452-056-4_089
+
+
+ Unsupervised Data Augmentation for Less-Resourced Languages with no Standardized Spelling
+ AliceMillour
+ KarënFort
+ 776–784
+ Building representative linguistic resources and NLP tools for non-standardized languages is challenging: when spelling is not determined by a norm, multiple written forms can be encountered for a given word, inducing a large proportion of out-of-vocabulary words. To embrace this diversity, we propose a methodology based on crowdsourced alternative spellings, which we use to extract rules for matching OOV words with one of their spelling variants. This virtuous process enables the unsupervised augmentation of multi-variant lexicons without expert rule definition. We apply this multilingual methodology to Alsatian, a French regional language, and provide an intrinsic evaluation of the correctness of the variant pairs, and an extrinsic evaluation on a downstream task. We show that in a low-resource scenario, 145 initial pairs can lead to the generation of 876 additional variant pairs, and a reduction in OOV words that improves part-of-speech tagging performance by 1 to 4%.
+ R19-1090
+ 10.26615/978-954-452-056-4_090
+
+
+ Neural Feature Extraction for Contextual Emotion Detection
+ ElhamMohammadi
+ HessamAmini
+ LeilaKosseim
+ 785–794
+ This paper describes a new approach for the task of contextual emotion detection. The approach is based on a neural feature extractor, composed of a recurrent neural network with an attention mechanism, followed by a classifier that can be neural or SVM-based. We evaluated the model with the dataset of task 3 of SemEval 2019 (EmoContext), which includes short 3-turn conversations, tagged with 4 emotion classes. The best performing setup was achieved using ELMo word embeddings and POS tags as input, bidirectional GRU as hidden units, and an SVM as the final classifier. This configuration reached 69.93% in terms of micro-average F1 score on the main 3 emotion classes, a score that outperformed the baseline system by 11.25%.
+ R19-1091
+ 10.26615/978-954-452-056-4_091
+
+
+ Empirical Study of Diachronic Word Embeddings for Scarce Data
+ SyrielleMontariol
+ AlexandreAllauzen
+ 795–803
+ Word meaning change can be inferred from drifts of time-varying word embeddings. However, temporal data may be too sparse to build robust word embeddings and to discriminate significant drifts from noise. In this paper, we compare three models to learn diachronic word embeddings on scarce data: incremental updating of a Skip-Gram from Kim et al. (2014), dynamic filtering from Bamler & Mandt (2017), and dynamic Bernoulli embeddings from Rudolph & Blei (2018). In particular, we study the performance of different initialisation schemes and emphasise what characteristics of each model are more suitable to data scarcity, relying on the distribution of detected drifts. Finally, we regularise the loss of these models to better adapt to scarce data.
+ R19-1092
+ 10.26615/978-954-452-056-4_092
+
+
+ A Fast and Accurate Partially Deterministic Morphological Analysis
+ HajimeMorita
+ TomoyaIwakura
+ 804–809
+ This paper proposes a partially deterministic morphological analysis method for improved processing speed. Maximum matching is a fast deterministic method for morphological analysis. However, the method tends to decrease performance due to its lack of consideration of contextual information. In order to use maximum matching safely, we propose the use of Context Independent Strings (CISs), which are strings that do not have ambiguity in terms of morphological analysis. Our method first identifies CISs in a sentence using maximum matching without contextual information, then analyzes the unprocessed part of the sentence using a bi-gram-based morphological analysis model. We evaluate the method on a Japanese morphological analysis task. The experimental results show a 30% reduction in running time while maintaining accuracy.
+ R19-1093
+ 10.26615/978-954-452-056-4_093
+
+
+ incom.py - A Toolbox for Calculating Linguistic Distances and Asymmetries between Related Languages
+ MariusMosbach
+ IrinaStenger
+ TaniaAvgustinova
+ DietrichKlakow
+ 810–818
+ Languages may be differently distant from each other and their mutual intelligibility may be asymmetric. In this paper we introduce incom.py, a toolbox for calculating linguistic distances and asymmetries between related languages. incom.py allows linguist experts to quickly and easily perform statistical analyses and compare those with experimental results. We demonstrate the efficacy of incom.py in an intercomprehension experiment on two Slavic languages: Bulgarian and Russian. Using incom.py we were able to validate three methods to measure linguistic distances and asymmetries: Levenshtein distance, word adaptation surprisal, and conditional entropy as predictors of success in a reading intercomprehension experiment.
+ R19-1094
+ 10.26615/978-954-452-056-4_094
+
+
+ A Holistic Natural Language Generation Framework for the Semantic Web
+ Axel-CyrilleNgonga Ngomo
+ DiegoMoussallem
+ LorenzBühmann
+ 819–828
+ With the ever-growing generation of data for the Semantic Web comes an increasing demand for this data to be made available to non-Semantic Web experts. One way of achieving this goal is to translate the languages of the Semantic Web into natural language. We present LD2NL, a framework that allows verbalizing the three key languages of the Semantic Web, i.e., RDF, OWL, and SPARQL. Our framework is based on a bottom-up approach to verbalization. We evaluated LD2NL in an open survey with 86 persons. Our results suggest that our framework can generate verbalizations that are close to natural languages and that can be easily understood by non-experts. Therewith, it enables non-domain experts to interpret Semantic Web data with more than 91% of the accuracy of domain experts.
+ R19-1095
+ 10.26615/978-954-452-056-4_095
+
+
+ Building a Comprehensive <fixed-case>R</fixed-case>omanian Knowledge Base for Drug Administration
+ BogdanNicula
+ MihaiDascalu
+ Maria-DorinelaSîrbu
+ ȘtefanTrăușan-Matu
+ AlexandruNuță
+ 829–836
+ Information on drug administration is traditionally obtained from doctors and pharmacists, as well as leaflets, which in most cases provide cumbersome and hard-to-follow details. Thus, the need for medical knowledge bases emerges to provide access to concrete and well-structured information which can play an important role in informing patients. This paper introduces a Romanian medical knowledge base focused on drug-drug interactions, on representing relevant drug information, and on symptom-disease relations. The knowledge base was created by extracting and transforming information using Natural Language Processing techniques from both structured and unstructured sources, together with manual annotations. The resulting Romanian ontologies are aligned with larger English medical ontologies. Our knowledge base supports queries regarding drugs (e.g., active ingredients, concentration, expiration date), drug-drug interactions, symptom-disease relations, as well as drug-symptom relations.
+ R19-1096
+ 10.26615/978-954-452-056-4_096
+
+
+ Summary Refinement through Denoising
+ NikolaNikolov
+ AlessandroCalmanovici
+ RichardHahnloser
+ 837–843
+ We propose a simple method for post-processing the outputs of a text summarization system in order to refine its overall quality. Our approach is to train text-to-text rewriting models to correct information redundancy errors that may arise during summarization. We train on synthetically generated noisy summaries, testing three different types of noise that introduce out-of-context information within each summary. When applied on top of extractive and abstractive summarization baselines, our summary denoising models yield metric improvements while reducing redundancy.
+ R19-1097
+ 10.26615/978-954-452-056-4_097
+
+
+ Large-Scale Hierarchical Alignment for Data-driven Text Rewriting
+ NikolaNikolov
+ RichardHahnloser
+ 844–853
+ We propose a simple unsupervised method for extracting pseudo-parallel monolingual sentence pairs from comparable corpora representative of two different text styles, such as news articles and scientific papers. Our approach does not require a seed parallel corpus, but instead relies solely on hierarchical search over pre-trained embeddings of documents and sentences. We demonstrate the effectiveness of our method through automatic and extrinsic evaluation on text simplification from the normal to the Simple Wikipedia. We show that pseudo-parallel sentences extracted with our method not only supplement existing parallel data, but can even lead to competitive performance on their own.
+ R19-1098
+ 10.26615/978-954-452-056-4_098
+
+
+ Dependency-Based Relative Positional Encoding for Transformer <fixed-case>NMT</fixed-case>
+ YutaroOmote
+ AkihiroTamura
+ TakashiNinomiya
+ 854–861
+ This paper proposes a new Transformer neural machine translation model that incorporates syntactic distances between two source words into the relative position representations of the self-attention mechanism. In particular, the proposed model encodes pair-wise relative depths on a source dependency tree, which are differences between the depths of the two source words, in the encoder’s self-attention. The experiments show that our proposed model achieves a 0.5 point gain in BLEU on the Asian Scientific Paper Excerpt Corpus Japanese-to-English translation task.
+ R19-1099
+ 10.26615/978-954-452-056-4_099
+
+
+ From Image to Text in Sentiment Analysis via Regression and Deep Learning
+ DanielaOnita
+ Liviu P.Dinu
+ AdrianaBirlutiu
+ 862–868
+ Images and text represent types of content which are used together for conveying user emotions in online social networks. These contents are usually associated with a sentiment category. In this paper, we investigate an approach for mapping images to text for three types of sentiment categories: positive, neutral and negative. The mapping from images to text is performed using a Kernel Ridge Regression model. We considered two types of image features: i) RGB pixel-value features, and ii) features extracted with a deep learning approach. The experimental evaluation was performed on a Twitter data set containing both text and images and the sentiment associated with these. The experimental results show a difference in performance for different sentiment categories; in particular, the mapping that we propose performs better for the positive sentiment category in comparison with the neutral and negative ones. Furthermore, the experimental results show that the more complex deep learning features perform better than the RGB pixel-value features for all sentiment categories and for larger training sets.
+ R19-1100
+ 10.26615/978-954-452-056-4_100
+
+
+ Building a Morphological Analyser for Laz
+ EsraOnal
+ FrancisTyers
+ 869–877
+ This study is an attempt to contribute to the documentation and revitalization efforts of the endangered Laz language, a member of the South Caucasian language family mainly spoken on the northeastern coastline of Turkey. It constitutes the first steps to create a general computational model for word form recognition and production for Laz by building a rule-based morphological analyser using the Helsinki Finite-State Toolkit (HFST). The evaluation results show that the analyser has a 64.9% coverage over a corpus collected for this study with 111,365 tokens. We have also performed an error analysis on 100 randomly selected tokens from the corpus which are not covered by the analyser, and these results show that the errors mostly result from Turkish words in the corpus and missing stems in our lexicon.
+ R19-1101
+ 10.26615/978-954-452-056-4_101
+
+
+ Term Based Semantic Clusters for Very Short Text Classification
+ JasperPaalman
+ ShantanuMullick
+ KalliopiZervanou
+ YingqianZhang
+ 878–887
+ Very short texts, such as tweets and invoices, present challenges in classification. Although term occurrences are strong indicators of content, in very short texts the sparsity of these texts makes it difficult to capture important semantic relationships. A solution calls for a method that not only considers term occurrence, but also handles sparseness well. In this work, we introduce such an approach, Term Based Semantic Clusters (TBSeC), which employs terms to create distinctive semantic concept clusters. These clusters are ranked using a semantic similarity function which in turn defines a semantic feature space that can be used for text classification. Our method is evaluated in an invoice classification task. Compared to well-known content representation methods, the proposed method performs competitively.
+ R19-1102
+ 10.26615/978-954-452-056-4_102
+
+
+ Quotation Detection and Classification with a Corpus-Agnostic Model
+ SeanPapay
+ SebastianPadó
+ 888–894
+ The detection of quotations (i.e., reported speech, thought, and writing) has established itself as an NLP analysis task. However, state-of-the-art models have been developed on the basis of specific corpora and incorporate a high degree of corpus-specific assumptions and knowledge, which leads to fragmentation. In the spirit of task-agnostic modeling, we present a corpus-agnostic neural model for quotation detection and evaluate it on three corpora that vary in language, text genre, and structural assumptions. The model (a) approaches the state-of-the-art on the corpora when using established feature sets and (b) shows reasonable performance even when using solely word forms, which makes it applicable for non-standard (i.e., historical) corpora.
+ R19-1103
+ 10.26615/978-954-452-056-4_103
+
+
+ Validation of Facts Against Textual Sources
+ Vamsi KrishnaPendyala
+ SimranSinha
+ SatyaPrakash
+ ShriyaReddy
+ AnupamJamatia
+ 895–903
+ In today’s digital world of information, a fact verification system to disprove assertions made in speech, print media or online content is the need of the hour. We propose a system which would verify a claim against a source and classify the claim to be true, false, out-of-context or an inappropriate claim with respect to the textual source provided to the system. A true label is used if the claim is true, and false if it is false; if the claim has no relation to the source, it is classified as out-of-context, and if the claim cannot be verified at all, it is classified as inappropriate. This would help us to verify a claim or a fact as well as learn about the source, or knowledge base, against which we are trying to verify our facts. We used a two-step approach to achieve our goal. At first, we retrieved evidence related to the claims from the textual source using Term Frequency-Inverse Document Frequency (TF-IDF) vectors. Later, we classified the claim-evidence pairs as true, false, inappropriate or out of context using a modified version of the textual entailment module. The textual entailment module calculates the probability of each sentence supporting the claim, contradicting the claim or not providing any relevant information, using a Bi-LSTM network to assess the veracity of the claim. The accuracy of the best performing system is 64.49%.
+ R19-1104
+ 10.26615/978-954-452-056-4_104
+
+
+ A Neural Network Component for Knowledge-Based Semantic Representations of Text
+ AlejandroPiad-Morffis
+ RafaelMuñoz
+ YoanGutiérrez
+ YudivianAlmeida-Cruz
+ SuilanEstevez-Velarde
+ AndrésMontoyo
+ 904–911
+ This paper presents Semantic Neural Networks (SNNs), a knowledge-aware component based on deep learning. SNNs can be trained to encode explicit semantic knowledge from an arbitrary knowledge base, and can subsequently be combined with other deep learning architectures. At prediction time, SNNs provide a semantic encoding extracted from the input data, which can be exploited by other neural network components to build extended representation models that can face alternative problems. The SNN architecture is defined in terms of the concepts and relations present in a knowledge base. Based on this architecture, a training procedure is developed. Finally, an experimental setup is presented to illustrate the behaviour and performance of an SNN for a specific NLP problem, in this case, opinion mining for the classification of movie reviews.
+ R19-1105
+ 10.26615/978-954-452-056-4_105
+
+
+ Toponym Detection in the Bio-Medical Domain: A Hybrid Approach with Deep Learning
+ AlistairPlum
+ TharinduRanasinghe
+ ConstantinOrasan
+ 912–921
+ This paper compares how different machine learning classifiers can be used together with simple string matching and named entity recognition to detect locations in texts. We compare five different state-of-the-art machine learning classifiers in order to predict whether a sentence contains a location or not. Following this classification task, we use a string matching algorithm with a gazetteer to identify the exact index of a toponym within the sentence. We evaluate different approaches in terms of machine learning classifiers, text pre-processing and location extraction on the SemEval-2019 Task 12 dataset, compiled for toponym resolution in the bio-medical domain. Finally, we compare the results with our system that was previously submitted to the SemEval-2019 task evaluation.
+ R19-1106
+ 10.26615/978-954-452-056-4_106
+
+
+ Combining <fixed-case>PBSMT</fixed-case> and <fixed-case>NMT</fixed-case> Back-translated Data for Efficient <fixed-case>NMT</fixed-case>
+ AlbertoPoncelas
+ MajaPopović
+ DimitarShterionov
+ GideonMaillette de Buy Wenniger
+ AndyWay
+ 922–931
+ Neural Machine Translation (NMT) models achieve their best performance when large sets of parallel data are used for training. Consequently, techniques for augmenting the training set have become popular recently. One of these methods is back-translation, which consists of generating synthetic sentences by translating a set of monolingual, target-language sentences using a Machine Translation (MT) model. Generally, NMT models are used for back-translation. In this work, we analyze the performance of models when the training data is extended with synthetic data using different MT approaches. In particular, we investigate back-translated data generated not only by NMT but also by Statistical Machine Translation (SMT) models and combinations of both. The results reveal that the models achieve the best performance when the training set is augmented with back-translated data created by merging different MT approaches.
+ R19-1107
+ 10.26615/978-954-452-056-4_107
+
+
+ Unsupervised dialogue intent detection via hierarchical topic model
+ ArtemPopov
+ VictorBulatov
+ DaryaPolyudova
+ EugeniaVeselova
+ 932–938
+ One of the challenges during task-oriented chatbot development is the scarce availability of labeled training data. The best way of obtaining such data is to ask assessors to tag each dialogue according to its intent. Unfortunately, performing labeling without any provisional collection structure is difficult, since the very notion of the intent is ill-defined. In this paper, we propose a hierarchical multimodal regularized topic model to obtain a first approximation of the intent set. Our rationale for using hierarchical models is their ability to take into account several degrees of dialogue relevancy. We attempt to build a model that can distinguish between subject-based (e.g. medicine and transport topics) and action-based (e.g. filing of an application and tracking application status) similarities. In order to achieve this, we divide the set of all features into several groups according to part-of-speech analysis. Various feature groups are treated differently on different hierarchy levels.
+ R19-1108
+ 10.26615/978-954-452-056-4_108
+
+
+ Graph Embeddings for Frame Identification
+ AlexanderPopov
+ JenniferSikos
+ 939–948
+ Lexical resources such as WordNet (Miller, 1995) and FrameNet (Baker et al., 1998) are organized as graphs, where relationships between words are made explicit via the structure of the resource. This work explores how structural information from these lexical resources can lead to gains in a downstream task, namely frame identification. While much of the current work in frame identification uses various neural architectures to predict frames, those neural architectures only use representations of frames based on annotated corpus data. We demonstrate how incorporating knowledge directly from the FrameNet graph structure improves the performance of a neural network-based frame identification system. Specifically, we construct a bidirectional LSTM with a loss function that incorporates various graph- and corpus-based frame embeddings for learning and ultimately achieves strong performance gains with the graph-based embeddings over corpus-based embeddings alone.
+ R19-1109
+ 10.26615/978-954-452-056-4_109
+
+
+ Know Your Graph. State-of-the-Art Knowledge-Based <fixed-case>WSD</fixed-case>
+ AlexanderPopov
+ KirilSimov
+ PetyaOsenova
+ 949–958
+ This paper introduces several improvements over the current state of the art in knowledge-based word sense disambiguation. Those innovations are the result of modifying and enriching a knowledge base created originally on the basis of WordNet. They reflect several separate but connected strategies: manipulating the shape and the content of the knowledge base, assigning weights over the relations in the knowledge base, and the addition of new relations to it. The main contribution of the paper is to demonstrate that the previously proposed knowledge bases organize linguistic and world knowledge suboptimally for the task of word sense disambiguation. In doing so, the paper also establishes a new state of the art for knowledge-based approaches. Its best models are competitive in the broader context of supervised systems as well.
+ R19-1110
+ 10.26615/978-954-452-056-4_110
+
+
+ Are ambiguous conjunctions problematic for machine translation?
+ MajaPopović
+ SheilaCastilho
+ 959–966
+ The translation of ambiguous words still poses challenges for machine translation. In this work, we carry out a systematic quantitative analysis regarding the ability of different machine translation systems to disambiguate the source language conjunctions “but” and “and”. We evaluate specialised test sets focused on the translation of these two conjunctions. The test sets contain source languages that do not distinguish different variants of the given conjunction, whereas the target languages do. In total, we evaluate the conjunction “but” on 20 translation outputs, and the conjunction “and” on 10. All machine translation systems almost perfectly recognise one variant of the target conjunction, especially for the source conjunction “but”. The other target variant, however, represents a challenge for machine translation systems, with accuracy varying from 50% to 95% for “but” and from 20% to 57% for “and”. The major error for all systems is replacing the correct target variant with the opposite one.
+ R19-1111
+ 10.26615/978-954-452-056-4_111
+
+
+ <fixed-case>ULSA</fixed-case>na: Universal Language Semantic Analyzer
+ OndřejPražák
+ MiloslavKonopik
+ 967–972
+ We present a live cross-lingual system capable of producing shallow semantic annotations of natural language sentences for 51 languages at this time. The domain of the input sentences is in principle unconstrained. The system uses a single set of training data (in English) for all the languages. The resulting semantic annotations are therefore consistent across different languages. We use CoNLL Semantic Role Labeling training data and Universal Dependencies as the basis for the system. The system is publicly available and supports processing data in batches; therefore, it can be easily used by the community for further research tasks.
+ R19-1112
+ 10.26615/978-954-452-056-4_112
+
+
+ Machine Learning Approach to Fact-Checking in West <fixed-case>S</fixed-case>lavic Languages
+ PavelPřibáň
+ TomášHercig
+ JosefSteinberger
+ 973–979
+ Fake news detection and closely-related fact-checking have recently attracted a lot of attention. Automatization of these tasks has already been studied for English. For other languages, only a few studies can be found (e.g. (Baly et al., 2018)), and to the best of our knowledge, no research has been conducted for West Slavic languages. In this paper, we present datasets for Czech, Polish, and Slovak. We also ran initial experiments which set a baseline for further research into this area.
+ R19-1113
+ 10.26615/978-954-452-056-4_113
+
+
+ <fixed-case>NE</fixed-case>-Table: A Neural key-value table for Named Entities
+ JanarthananRajendran
+ JatinGanhotra
+ XiaoxiaoGuo
+ MoYu
+ SatinderSingh
+ LazarosPolymenakos
+ 980–993
+ Many Natural Language Processing (NLP) tasks depend on using Named Entities (NEs) that are contained in texts and in external knowledge sources. While this is easy for humans, the present neural methods that rely on learned word embeddings may not perform well for these NLP tasks, especially in the presence of Out-Of-Vocabulary (OOV) or rare NEs. In this paper, we propose a solution for this problem, and present empirical evaluations on: a) a structured Question-Answering task, b) three related Goal-Oriented dialog tasks, and c) a Reading-Comprehension task, which show that the proposed method can be effective in dealing with both in-vocabulary and OOV NEs. We create extended versions of dialog bAbI tasks 1, 2 and 4 and OOV versions of the CBT test set, which are available at https://github.com/IBM/ne-table-datasets/
+ R19-1114
+ 10.26615/978-954-452-056-4_114
+
+
+ Enhancing Unsupervised Sentence Similarity Methods with Deep Contextualised Word Representations
+ TharinduRanasinghe
+ ConstantinOrasan
+ RuslanMitkov
+ 994–1003
+ Calculating Semantic Textual Similarity (STS) plays a significant role in many applications such as question answering, document summarisation, information retrieval and information extraction. All modern state-of-the-art STS methods rely on word embeddings one way or another. The recently introduced contextualised word embeddings have proved more effective than standard word embeddings in many natural language processing tasks. This paper evaluates the impact of several contextualised word embeddings on unsupervised STS methods and compares it with the existing supervised/unsupervised STS methods for different datasets in different languages and different domains.
+ R19-1115
+ 10.26615/978-954-452-056-4_115
+
+
+ Semantic Textual Similarity with <fixed-case>S</fixed-case>iamese Neural Networks
+ TharinduRanasinghe
+ ConstantinOrasan
+ RuslanMitkov
+ 1004–1011
+ Calculating the Semantic Textual Similarity (STS) is an important research area in natural language processing which plays a significant role in many applications such as question answering, document summarisation, information retrieval and information extraction. This paper evaluates Siamese recurrent architectures, a special type of neural networks, which are used here to measure STS. Several variants of the architecture are compared with existing methods.
+ R19-1116
+ 10.26615/978-954-452-056-4_116
+
+
+ Analysing the Impact of Supervised Machine Learning on Automatic Term Extraction: <fixed-case>HAMLET</fixed-case> vs <fixed-case>T</fixed-case>ermo<fixed-case>S</fixed-case>tat
+ AylaRigouts Terryn
+ PatrickDrouin
+ VeroniqueHoste
+ ElsLefever
+ 1012–1021
+ Traditional approaches to automatic term extraction do not rely on machine learning (ML) and select the top n ranked candidate terms or candidate terms above a certain predefined cut-off point, based on a limited number of linguistic and statistical clues. However, supervised ML approaches are gaining interest. Relatively little is known about the impact of these supervised methodologies; evaluations are often limited to precision, and sometimes recall and F1-scores, without information about the nature of the extracted candidate terms. Therefore, the current paper presents a detailed and elaborate analysis and comparison of a traditional, state-of-the-art system (TermoStat) and a new, supervised ML approach (HAMLET), using the results obtained for the same, manually annotated, Dutch corpus about dressage.
+ R19-1117
+ 10.26615/978-954-452-056-4_117
+
+
+ Distant Supervision for Sentiment Attitude Extraction
+ NicolayRusnachenko
+ NataliaLoukachevitch
+ ElenaTutubalina
+ 1022–1030
+ News articles often convey attitudes between the mentioned subjects, which is essential for understanding the described situation. In this paper, we describe a new approach to distant supervision for extracting sentiment attitudes between named entities mentioned in texts. Two factors (pair-based and frame-based) were used to automatically label an extensive news collection, dubbed RuAttitudes. The latter became the basis for adapting and training convolutional architectures, including piecewise max pooling and full use of information across different sentences. The results show that models trained with RuAttitudes outperform ones that were trained with only a supervised learning approach, achieving a 13.4% increase in F1-score on the RuSentRel collection.
+ R19-1118
+ 10.26615/978-954-452-056-4_118
+
+
+ Self-Attentional Models Application in Task-Oriented Dialogue Generation Systems
+ MansourSaffar Mehrjardi
+ AmineTrabelsi
+ OsmarR. Zaiane
+ 1031–1040
+ Self-attentional models are a new paradigm for sequence modelling tasks which differ from common sequence modelling methods, such as recurrence-based and convolution-based sequence learning, in the way that their architecture is only based on the attention mechanism. Self-attentional models have been used in the creation of the state-of-the-art models in many NLP tasks such as neural machine translation, but their usage has not yet been explored for the task of training end-to-end task-oriented dialogue generation systems. In this study, we apply these models on the DSTC2 dataset for training task-oriented chatbots. Our findings show that self-attentional models can be exploited to create end-to-end task-oriented chatbots which not only achieve higher evaluation scores compared to recurrence-based models, but also do so more efficiently.
+ R19-1119
+ 10.26615/978-954-452-056-4_119
+
+
+ Whom to Learn From? Graph- vs. Text-based Word Embeddings
+ MałgorzataSalawa
+ AntónioBranco
+ RubenBranco
+ JoãoAntónio Rodrigues
+ ChakavehSaedi
+ 1041–1051
+ Vectorial representations of meaning can be supported by empirical data from diverse sources and obtained with diverse embedding approaches. This paper aims at screening this experimental space and reports on an assessment of word embeddings supported (i) by data in raw texts vs. in lexical graphs, (ii) by lexical information encoded in association- vs. inference-based graphs, and obtained (iii) by edge reconstruction- vs. matrix factorisation vs. random walk-based graph embedding methods. The results observed with these experiments indicate that the best solutions with graph-based word embeddings are very competitive, consistently outperforming mainstream text-based ones.
+ R19-1120
+ 10.26615/978-954-452-056-4_120
+
+
+ Persistence pays off: Paying Attention to What the <fixed-case>LSTM</fixed-case> Gating Mechanism Persists
+ GiancarloSalton
+ JohnKelleher
+ 1052–1059
+ Recurrent Neural Network Language Models composed of LSTM units, especially those augmented with an external memory, have achieved state-of-the-art results in Language Modeling. However, these models still struggle to process long sequences, which are more likely to contain long-distance dependencies, because of information fading. In this paper we demonstrate an effective mechanism for retrieving information in a memory-augmented LSTM LM based on attending to information in memory in proportion to the number of timesteps the LSTM gating mechanism persisted the information.
+ R19-1121
+ 10.26615/978-954-452-056-4_121
+
+
+ Development and Evaluation of Three Named Entity Recognition Systems for <fixed-case>S</fixed-case>erbian - The Case of Personal Names
+ BranislavaŠandrih
+ CvetanaKrstev
+ RankaStankovic
+ 1060–1068
+ In this paper we present a rule- and lexicon-based system for the recognition of Named Entities (NE) in Serbian newspaper texts that was used to prepare a gold standard annotated with personal names. It was further used to prepare training sets for four different levels of annotation, which were used to train two Named Entity Recognition (NER) systems: Stanford and spaCy. All obtained models, together with the rule- and lexicon-based system, were evaluated on two sample texts: a part of the gold standard and an independent newspaper text of approximately the same size. The results show that the rule- and lexicon-based system outperforms the trained models in all four scenarios (measured by F1), while the Stanford models have the highest precision. All systems obtain the best results in recognizing full names, while the recognition of first names only is rather poor. The produced models are incorporated into the Web platform NER&Beyond, which provides various NE-related functions.
+ R19-1122
+ 10.26615/978-954-452-056-4_122
+
+
+ Moral Stance Recognition and Polarity Classification from <fixed-case>T</fixed-case>witter and Elicited Text
+ WesleySantos
+ IvandréParaboni
+ 1069–1075
+ We introduce a labelled corpus of stances about moral issues for the Brazilian Portuguese language, and present reference results for both the stance recognition and polarity classification tasks. The corpus is built from Twitter and further expanded with data elicited through crowdsourcing and labelled by their own authors. Put together, the corpus and reference results are expected to be taken as a baseline for further studies in the field of stance recognition and polarity classification from text.
+ R19-1123
+ 10.26615/978-954-452-056-4_123
+
+
+ The “Jump and Stay” Method to Discover Proper Verb Centered Constructions in Corpus Lattices
+ BálintSass
+ 1076–1084
+ The research presented here is based on the theoretical model of corpus lattices. We implemented this as an effective data structure, and developed an algorithm based on this structure to discover essential verbal expressions from corpus data. The idea behind the algorithm is the “jump and stay” principle, which tells us that our target expressions will be found at such places in the lattice where the value of a suitable function (defined on the vertex set of the corpus lattice) significantly increases (jumps) and then remains the same (stays). We evaluated our method on Hungarian data. Evaluation shows that about 75% of the obtained expressions are correct; actual errors are rare. Thus, this paper is 1. a proof of concept concerning the corpus lattice model, opening the way to investigate this structure further through our implementation; and 2. a proof of concept of the “jump and stay” idea and the algorithm itself, opening the way to apply it further, e.g. for other languages.
+ R19-1124
+ 10.26615/978-954-452-056-4_124
+
+
+ Offence in Dialogues: A Corpus-Based Study
+ JohannesSchäfer
+ BenBurtenshaw
+ 1085–1093
+ In recent years an increasing number of analyses of offensive language have been published, dealing mainly, however, with the automatic detection and classification of isolated instances. In this paper we aim to understand the impact of offensive messages in online conversations diachronically, and in particular the change in offensiveness of dialogue turns. In turn, we aim to measure the progression of offence level as well as its direction – for example, whether a conversation is escalating or declining in offence. We present our method of extracting linear dialogues from tree-structured conversations in social media data and make our code publicly available. Furthermore, we discuss methods to analyse this dataset through changes in discourse offensiveness. Our paper includes two main contributions: first, using a neural network to measure the level of offensiveness in conversations; and second, the analysis of conversations around offensive comments using decoupling functions.
+ R19-1125
+ 10.26615/978-954-452-056-4_125
+
+
+ <fixed-case>E</fixed-case>mo<fixed-case>T</fixed-case>ag – Towards an Emotion-Based Analysis of Emojis
+ Abu Awal MdShoeb
+ ShahabRaji
+ Gerardde Melo
+ 1094–1103
+ Despite being a fairly recent phenomenon, emojis have quickly become ubiquitous. Besides their extensive use in social media, they are now also invoked in customer surveys and feedback forms. Hence, there is a need for techniques to understand their sentiment and emotion. In this work, we provide a method to quantify the emotional association of basic emotions such as anger, fear, joy, and sadness for a set of emojis. We collect and process a unique corpus of 20 million emoji-centric tweets, such that we can capture rich emoji semantics using a comparatively small dataset. We evaluate the induced emotion profiles of emojis with regard to their ability to predict word affect intensities as well as sentiment scores.
+ R19-1126
+ 10.26615/978-954-452-056-4_126
+
+
+ A Morpho-Syntactically Informed <fixed-case>LSTM</fixed-case>-<fixed-case>CRF</fixed-case> Model for Named Entity Recognition
+ LiliaSimeonova
+ KirilSimov
+ PetyaOsenova
+ PreslavNakov
+ 1104–1113
+ We propose a morphologically informed model for named entity recognition, which is based on the LSTM-CRF architecture and combines word embeddings, Bi-LSTM character embeddings, part-of-speech (POS) tags, and morphological information. While previous work has focused on learning from raw word input, using word and character embeddings only, we show that for morphologically rich languages, such as Bulgarian, access to POS information contributes more to the performance gains than the detailed morphological information. Thus, we show that named entity recognition needs only coarse-grained POS tags, but at the same time it can benefit from simultaneously using some POS information of different granularity. Our evaluation results over a standard dataset show sizeable improvements over the state-of-the-art for Bulgarian NER.
+ R19-1127
+ 10.26615/978-954-452-056-4_127
+
+
+ Named Entity Recognition in Information Security Domain for <fixed-case>R</fixed-case>ussian
+ AnastasiiaSirotina
+ NataliaLoukachevitch
+ 1114–1120
+ In this paper we discuss the named entity recognition task for Russian texts related to cybersecurity. First of all, we describe the problems that arise in the course of labeling unstructured texts from the information security domain. We introduce guidelines for human annotators, according to which a corpus has been marked up. Then, a CRF-based system and different neural architectures have been implemented and applied to the corpus. The named entity recognition systems have been evaluated and compared to determine the most efficient one.
+ R19-1128
+ 10.26615/978-954-452-056-4_128
+
+
+ Cross-Family Similarity Learning for Cognate Identification in Low-Resource Languages
+ ElielSoisalon-Soininen
+ MarkGranroth-Wilding
+ 1121–1130
+ We address the problem of cognate identification across vocabulary pairs of any set of languages. In particular, we focus on the case where the examined pair of languages are low-resource to the extent that no training data whatsoever in these languages, or even closely related ones, are available for the task. We investigate the extent to which training data from another, unrelated language family can be used instead. Our approach consists of learning a similarity metric from example cognates in Indo-European languages and applying it to low-resource Sami languages of the Uralic family. We apply two models following previous work: a Siamese convolutional neural network (S-CNN) and a support vector machine (SVM), and compare them with a Levenshtein-distance baseline. We test performance on three Sami languages and find that the S-CNN outperforms the other approaches, suggesting that it is better able to learn general characteristics of cognateness that carry over across language families. We also experiment with fine-tuning the S-CNN model with data from within the language family, in order to quantify how well this model can make use of a small amount of target-domain data to adapt.
+ R19-1129
+ 10.26615/978-954-452-056-4_129
+
+
+ Automatic Detection of Translation Direction
+ IliaSominsky
+ ShulyWintner
+ 1131–1140
+ Parallel corpora are crucial resources for NLP applications, most notably for machine translation. The direction of the (human) translation of parallel corpora has been shown to have significant implications for the quality of statistical machine translation systems that are trained with such corpora. We describe a method for determining the direction of the (manual) translation of parallel corpora at the sentence-pair level. Using several linguistically-motivated features, coupled with a neural network model, we obtain high accuracy on several language pairs. Furthermore, we demonstrate that the accuracy is correlated with the (typological) distance between the two languages.
+ R19-1130
+ 10.26615/978-954-452-056-4_130
+
+
+ Automated Text Simplification as a Preprocessing Step for Machine Translation into an Under-resourced Language
+ SanjaŠtajner
+ MajaPopović
+ 1141–1150
+ In this work, we investigate the possibility of using a fully automatic text simplification system on the English source in machine translation (MT) for improving its translation into an under-resourced language. We use the state-of-the-art automatic text simplification (ATS) system for lexically and syntactically simplifying source sentences, which are then translated with two state-of-the-art English-to-Serbian MT systems, the phrase-based MT (PBMT) and the neural MT (NMT). We explore three different scenarios for using the ATS in MT: (1) using the raw output of the ATS; (2) automatically filtering out the sentences with low grammaticality and meaning preservation scores; and (3) performing a minimal manual correction of the ATS output. Our results show improvement in the fluency of the translation regardless of the chosen scenario, and differences in the success of the three scenarios depending on the MT approach used (PBMT or NMT) with regard to improving translation fluency and post-editing effort.
+ R19-1131 + 10.26615/978-954-452-056-4_131 + + + Investigating Multilingual Abusive Language Detection: A Cautionary Tale + KennethSteimel + DanielDakota + YueChen + SandraKübler + 1151–1160 + Abusive language detection has received much attention in the last years, and recent approaches perform the task in a number of different languages. We investigate which factors have an effect on multilingual settings, focusing on the compatibility of data and annotations. In the current paper, we focus on English and German. Our findings show large differences in performance between the two languages. We find that the best performance is achieved by different classification algorithms. Sampling to address class imbalance issues is detrimental for German and beneficial for English. The only similarity that we find is that neither data set shows clear topics when we compare the results of topic modeling to the gold standard. Based on our findings, we can conclude that a multilingual optimization of classifiers is not possible even in settings where comparable data sets are used. + R19-1132 + 10.26615/978-954-452-056-4_132 + + + Augmenting a <fixed-case>B</fixed-case>i<fixed-case>LSTM</fixed-case> Tagger with a Morphological Lexicon and a Lexical Category Identification Step + SteinþórSteingrímsson + ÖrvarKárason + HrafnLoftsson + 1161–1168 + Previous work on using BiLSTM models for PoS tagging has primarily focused on small tagsets. We evaluate BiLSTM models for tagging Icelandic, a morphologically rich language, using a relatively large tagset. Our baseline BiLSTM model achieves higher accuracy than any other previously published tagger, when not taking advantage of a morphological lexicon. When we extend the model by incorporating such data, we outperform the earlier state-of-the-art results by a significant margin. We also report on work in progress that attempts to address the problem of data sparsity inherent to morphologically detailed, fine-grained tagsets. We experiment with training a separate model on only the lexical category and using the coarse-grained output tag as an input into to the main model. This method further increases the accuracy and reduces the tagging errors by 21.3% compared to previous state-of-the-art results. Finally, we train and test our tagger on a new gold standard for Icelandic. + R19-1133 + 10.26615/978-954-452-056-4_133 + + + Comparison of Machine Learning Approaches for Industry Classification Based on Textual Descriptions of Companies + AndreyTagarev + NikolaTulechki + SvetlaBoytcheva + 1169–1175 + This paper addresses the task of categorizing companies within industry classification schemes. The datasets consists of encyclopedic articles about companies and their economic activities. The target classification schema is build by mapping linked open data in a semi-supervised manner. Target classes are build bottom-up from DBpedia. We apply several state of the art text classification techniques, based both on deep-learning and classical vector-space models. + R19-1134 + 10.26615/978-954-452-056-4_134 + + + A Quantum-Like Approach to Word Sense Disambiguation + FabioTamburini + 1176–1185 + This paper presents a novel algorithm for Word Sense Disambiguation (WSD) based on Quantum Probability Theory. The Quantum WSD algorithm requires concepts representations as vectors in the complex domain and thus we have developed a technique for computing complex word and sentence embeddings based on the Paragraph Vectors algorithm. 
Despite the proposed method is quite simple and that it does not require long training phases, when it is evaluated on a standardized benchmark for this task it exhibits state-of-the-art (SOTA) performances. + R19-1135 + 10.26615/978-954-452-056-4_135 + + + Understanding Neural Machine Translation by Simplification: The Case of Encoder-free Models + GongboTang + RicoSennrich + JoakimNivre + 1186–1193 + In this paper, we try to understand neural machine translation (NMT) via simplifying NMT architectures and training encoder-free NMT models. In an encoder-free model, the sums of word embeddings and positional embeddings represent the source. The decoder is a standard Transformer or recurrent neural network that directly attends to embeddings via attention mechanisms. Experimental results show (1) that the attention mechanism in encoder-free models acts as a strong feature extractor, (2) that the word embeddings in encoder-free models are competitive to those in conventional models, (3) that non-contextualized source representations lead to a big performance drop, and (4) that encoder-free models have different effects on alignment quality for German-English and Chinese-English. + R19-1136 + 10.26615/978-954-452-056-4_136 + + + Text-Based Joint Prediction of Numeric and Categorical Attributes of Entities in Knowledge Bases + VThejas + AbhijeetGupta + SebastianPadó + 1194–1202 + Collaboratively constructed knowledge bases play an important role in information systems, but are essentially always incomplete. Thus, a large number of models has been developed for Knowledge Base Completion, the task of predicting new attributes of entities given partial descriptions of these entities. Virtually all of these models either concentrate on numeric attributes (<Italy,GDP,2T$>) or they concentrate on categorical attributes (<Tim Cook,chairman,Apple>). In this paper, we propose a simple feed-forward neural architecture to jointly predict numeric and categorical attributes based on embeddings learned from textual occurrences of the entities in question. Following insights from multi-task learning, our hypothesis is that due to the correlations among attributes of different kinds, joint prediction improves over separate prediction. Our experiments on seven FreeBase domains show that this hypothesis is true of the two attribute types: we find substantial improvements for numeric attributes in the joint model, while performance remains largely unchanged for categorical attributes. Our analysis indicates that this is the case because categorical attributes, many of which describe membership in various classes, provide useful ‘background knowledge’ for numeric prediction, while this is true to a lesser degree in the inverse direction. + R19-1137 + 10.26615/978-954-452-056-4_137 + + + <fixed-case>S</fixed-case>en<fixed-case>Z</fixed-case>i: A Sentiment Analysis Lexicon for the Latinised <fixed-case>A</fixed-case>rabic (<fixed-case>A</fixed-case>rabizi) + TahaTobaili + MiriamFernandez + HarithAlani + SanaaSharafeddine + HazemHajj + GoranGlavaš + 1203–1211 + Arabizi is an informal written form of dialectal Arabic transcribed in Latin alphanumeric characters. It has a proven popularity on chat platforms and social media, yet it suffers from a severe lack of natural language processing (NLP) resources. As such, texts written in Arabizi are often disregarded in sentiment analysis tasks for Arabic. In this paper we describe the creation of a sentiment lexicon for Arabizi that was enriched with word embeddings. 
The result is a new Arabizi lexicon consisting of 11.3K positive and 13.3K negative words. We evaluated this lexicon by classifying the sentiment of Arabizi tweets, achieving an F1-score of 0.72. We provide a detailed error analysis to present the challenges that impact the sentiment analysis of Arabizi. + R19-1138 + 10.26615/978-954-452-056-4_138 + + + Mining the <fixed-case>UK</fixed-case> Web Archive for Semantic Change Detection + AdamTsakalidis + MaryaBazzi + MihaiCucuringu + PierpaoloBasile + BarbaraMcGillivray + 1212–1221 + Semantic change detection (i.e., identifying words whose meaning has changed over time) started emerging as a growing area of research over the past decade, with important downstream applications in natural language processing, historical linguistics and computational social science. However, several obstacles make progress in the domain slow and difficult. These pertain primarily to the lack of well-established gold standard datasets, resources to study the problem at a fine-grained temporal resolution, and quantitative evaluation approaches. In this work, we aim to mitigate these issues by (a) releasing a new labelled dataset of more than 47K word vectors trained on the UK Web Archive over a short time-frame (2000-2013); (b) proposing a variant of Procrustes alignment to detect words that have undergone semantic shift; and (c) introducing a rank-based approach for evaluation purposes. Through extensive numerical experiments and validation, we illustrate the effectiveness of our approach against competitive baselines. Finally, we also make our resources publicly available to further enable research in the domain. + R19-1139 + 10.26615/978-954-452-056-4_139 + + + Cross-Lingual Word Embeddings for Morphologically Rich Languages + AhmetÜstün + GosseBouma + Gertjanvan Noord + 1222–1228 + Cross-lingual word embedding models learn a shared vector space for two or more languages so that words with similar meaning are represented by similar vectors regardless of their language. Although the existing models achieve high performance on pairs of morphologically simple languages, they perform very poorly on morphologically rich languages such as Turkish and Finnish. In this paper, we propose a morpheme-based model in order to increase the performance of cross-lingual word embeddings on morphologically rich languages. Our model includes a simple extension which enables us to exploit morphemes for cross-lingual mapping. We applied our model to the Turkish-Finnish language pair on the bilingual word translation task. Results show that our model outperforms the baseline models by 2% in the nearest neighbour ranking. + R19-1140 + 10.26615/978-954-452-056-4_140 + + + It Takes Nine to Smell a Rat: Neural Multi-Task Learning for Check-Worthiness Prediction + SlavenaVasileva + PepaAtanasova + LluísMàrquez + AlbertoBarrón-Cedeño + PreslavNakov + 1229–1239 + We propose a multi-task deep-learning approach for estimating the check-worthiness of claims in political debates. Given a political debate, such as the 2016 US Presidential and Vice-Presidential ones, the task is to predict which statements in the debate should be prioritized for fact-checking.
While different fact-checking organizations would naturally make different choices when analyzing the same debate, we show that it pays to learn from multiple sources simultaneously (PolitiFact, FactCheck, ABC, CNN, NPR, NYT, Chicago Tribune, The Guardian, and Washington Post) in a multi-task learning setup, even when a particular source is chosen as a target to imitate. Our evaluation shows state-of-the-art results on a standard dataset for the task of check-worthiness prediction. + R19-1141 + 10.26615/978-954-452-056-4_141 + + + Deep learning contextual models for prediction of sport event outcome from sportsman’s interviews + BorisVelichkov + IvanKoychev + SvetlaBoytcheva + 1240–1246 + This paper presents an approach for predicting the results of sport events. Sport forecasting approaches are usually based on structured data. We test the hypothesis that sports results can be predicted using natural language processing and machine learning techniques applied to interviews with the players shortly before the sport events. The proposed method uses deep learning contextual models, applied over unstructured textual documents. Several experiments were performed for interviews with players in individual sports like boxing, martial arts, and tennis. The results from the conducted experiment confirmed our initial assumption that an interview from a sportsman before a match contains information that can be used for predicting its outcome. Furthermore, the results provide strong evidence in support of our research hypothesis, that is, we can predict the outcome of a sport match by analyzing an interview given before it. + R19-1142 + 10.26615/978-954-452-056-4_142 + + + Exploiting Frame-Semantics and Frame-Semantic Parsing for Automatic Extraction of Typological Information from Descriptive Grammars of Natural Languages + Shafqat MumtazVirk + AzamSheikh Muhammad + LarsBorin + Muhammad IrfanAslam + SaaniaIqbal + NaziaKhurram + 1247–1256 + We describe a novel system for automatic extraction of typological linguistic information from descriptive grammars of natural languages, applying the theory of frame semantics in the form of frame-semantic parsing. The current proof-of-concept system covers a few selected linguistic features, but the methodology is general and can be extended not only to other typological features but also to descriptive grammars written in languages other than English. Such a system is expected to be a useful aid for the automatic curation of typological databases, which are otherwise built manually, a very labor- and time-consuming as well as cognitively taxing enterprise. + R19-1143 + 10.26615/978-954-452-056-4_143 + + + Exploiting Open <fixed-case>IE</fixed-case> for Deriving Multiple Premises Entailment Corpus + MartinVíta + JakubKlímek + 1257–1264 + Natural language inference (NLI) is a key part of natural language understanding. The NLI task is defined as a decision problem of whether a given sentence – the hypothesis – can be inferred from a given text. Typically, we deal with a text consisting of just a single premise/single sentence, which is called a single premise entailment (SPE) task. Recently, a derived task of NLI from multiple premises (MPE) was introduced together with the first annotated corpus and several corresponding strong baselines. Nevertheless, further development in the MPE field requires access to huge amounts of annotated data.
In this paper we introduce a novel method for rapidly deriving MPE corpora from existing NLI (SPE) annotated data that does not require any additional annotation work. The proposed approach is based on using an open information extraction system. We demonstrate the application of the method on the well-known SNLI corpus. On the obtained corpus, we provide first evaluations and establish a strong baseline. + R19-1144 + 10.26615/978-954-452-056-4_144 + + + Towards Adaptive Text Summarization: How Does Compression Rate Affect Summary Readability of <fixed-case>L</fixed-case>2 Texts? + TatianaVodolazova + ElenaLloret + 1265–1274 + This paper addresses the problem of readability of automatically generated summaries in the context of second language learning. For this, we experimented with a new corpus of level-annotated simplified English texts. The texts were summarized using a total of 7 extractive and abstractive summarization systems with compression rates of 20%, 40%, 60% and 80%. We analyzed the generated summaries in terms of lexical, syntactic and length-based features of readability, and concluded that summary complexity depends on the compression rate, summarization technique and the nature of the summarized corpus. Our experiments demonstrate the importance of choosing appropriate summarization techniques that align with users’ needs and language proficiency. + R19-1145 + 10.26615/978-954-452-056-4_145 + + + The Impact of Rule-Based Text Generation on the Quality of Abstractive Summaries + TatianaVodolazova + ElenaLloret + 1275–1284 + In this paper we describe how an abstractive text summarization method improved the informativeness of automatic summaries by integrating syntactic text simplification, subject-verb-object concept frequency scoring and a set of rules that transform text into its semantic representation. We analyzed the impact of each component of our approach on the quality of generated summaries and tested it on the DUC 2002 dataset. Our experiments showed that our approach outperformed other state-of-the-art abstractive methods while maintaining acceptable linguistic quality and redundancy rate. + R19-1146 + 10.26615/978-954-452-056-4_146 + + + <fixed-case>ETNLP</fixed-case>: A Visual-Aided Systematic Approach to Select Pre-Trained Embeddings for a Downstream Task + SonVu Xuan + ThanhVu + SonTran + LiliJiang + 1285–1294 + Given many recent advanced embedding models, selecting the pre-trained word representation (i.e., word embedding) models that best fit a specific downstream NLP task is non-trivial. In this paper, we propose a systematic approach to extracting, evaluating, and visualizing multiple sets of pre-trained word embeddings to determine which embeddings should be used in a downstream task. First, for extraction, we provide a method to extract a subset of the embeddings to be used in the downstream NLP tasks. Second, for evaluation, we analyse the quality of pre-trained embeddings using an input word analogy list. Finally, we visualize the embedding space to explore the embedded words interactively. We demonstrate the effectiveness of the proposed approach on our pre-trained word embedding models in Vietnamese to select which models are suitable for a named entity recognition (NER) task. Specifically, we create a large Vietnamese word analogy list to evaluate and select the pre-trained embedding models for the task. We then utilize the selected embeddings for the NER task and achieve new state-of-the-art results on the task benchmark dataset.
We also apply the approach to another downstream task of privacy-guaranteed embedding selection, and show that it helps users quickly select the most suitable embeddings. In addition, we create an open-source system using the proposed systematic approach to facilitate similar studies on other NLP tasks. The source code and data are available at https://github.com/vietnlp/etnlp. + R19-1147 + 10.26615/978-954-452-056-4_147 + + + Tagger for Polish Computer Mediated Communication Texts + WiktorWalentynowicz + MaciejPiasecki + MarcinOleksy + 1295–1303 + In this paper we present a morpho-syntactic tagger dedicated to Computer-mediated Communication texts in Polish. Its construction is based on an expanded RNN-based neural network adapted to work on noisy texts. Among several techniques, the tagger utilises fastText embedding vectors, sequential character embedding vectors, and Brown clustering for the coarse-grained representation of sentence structures. In addition, a set of manually written rules was proposed for post-processing. The system was trained to disambiguate descriptions of words in relation to Parts of Speech tags together with the full morphological information in terms of values for the different grammatical categories. We also present an evaluation of several model variants on the gold-standard annotated CMC data, a comparison to state-of-the-art taggers for Polish, and an error analysis. The proposed tagger shows significantly better results in this domain and demonstrates the viability of adaptation. + R19-1148 + 10.26615/978-954-452-056-4_148 + + + Evaluation of vector embedding models in clustering of text documents + TomaszWalkowiak + MateuszGniewkowski + 1304–1311 + The paper presents an evaluation of word embedding models in clustering of texts in the Polish language. The authors verified six different embedding models, ranging from the widely used word2vec, through fastText with character n-gram embeddings, to the deep learning-based ELMo and BERT. Moreover, four standardisation methods, three distance measures and four clustering methods were evaluated. The analysis was performed on two corpora of texts in Polish classified into subjects. The Adjusted Mutual Information (AMI) metric was used to verify the quality of clustering results. The performed experiments show that Skipgram models with character n-gram embeddings, built on the KGR10 corpus and provided by Clarin-PL, outperform other publicly available models for Polish. Moreover, the presented results suggest that the Yeo–Johnson transformation for document vector standardisation and Agglomerative Clustering with a cosine distance should be used for grouping text documents. + R19-1149 + 10.26615/978-954-452-056-4_149 + + + Bigger versus Similar: Selecting a Background Corpus for First Story Detection Based on Distributional Similarity + FeiWang + Robert J.Ross + John D.Kelleher + 1312–1320 + The current state of the art for First Story Detection (FSD) is nearest neighbour-based models with traditional term vector representations; however, one challenge faced by FSD models is that the document representation is usually defined by the vocabulary and term frequency from a background corpus. Consequently, the ideal background corpus should arguably be both large-scale to ensure adequate term coverage, and similar to the target domain in terms of the language distribution.
However, given that these two factors cannot always be mutually satisfied, in this paper we examine whether the distributional similarity of common terms is more important than the scale of common terms for FSD. As a basis for our analysis, we propose a set of metrics to quantitatively measure the scale of common terms and the distributional similarity between corpora. Using these metrics, we rank different background corpora relative to a target corpus. We also apply models based on different background corpora to the FSD task. Our results show that term distributional similarity is more predictive of good FSD performance than the scale of common terms; thus, we demonstrate that a smaller recent domain-related corpus will be more suitable than a very large-scale general corpus for FSD. + R19-1150 + 10.26615/978-954-452-056-4_150 + + + Predicting Sentiment of Polish Language Short Texts + AleksanderWawer + JulitaSobiczewska + 1321–1327 + The goal of this paper is to use all available Polish language data sets to seek the best possible performance in supervised sentiment analysis of short texts. We use text collections with labelled sentiment such as tweets, movie reviews and a sentiment treebank, in three comparison modes. In the first, we examine the performance of models trained and tested on the same text collection using standard cross-validation (in-domain). In the second, we train models on all available data except the given test collection, which we use for testing (one vs rest cross-domain). In the third, we train a model on one data set and apply it to another one (one vs one cross-domain). We compare a wide range of methods, including machine learning on bag-of-words representations and bidirectional recurrent neural networks, as well as the most recent pre-trained architectures ELMo and BERT. We formulate conclusions as to the cross-domain and in-domain performance of each method. Unsurprisingly, BERT turned out to be a strong performer, especially in the cross-domain setting. What is surprising, however, is the solid performance of the relatively simple multinomial Naive Bayes classifier, which performed as well as BERT on several data sets. + R19-1151 + 10.26615/978-954-452-056-4_151 + + + Improving Named Entity Linking Corpora Quality + AlbertWeichselbraun + Adrian M.P.Brasoveanu + PhilippKuntschik + Lyndon J.B.Nixon + 1328–1337 + Gold standard corpora and competitive evaluations play a key role in benchmarking named entity linking (NEL) performance and driving the development of more sophisticated NEL systems. The quality of the corpora and evaluation metrics used is crucial in this process. We, therefore, assess the quality of three popular evaluation corpora, identifying four major issues which affect these gold standards: (i) the use of different annotation styles, (ii) incorrect and missing annotations, (iii) Knowledge Base evolution, and (iv) differences in annotating co-occurrences. This paper addresses these issues by formalizing NEL annotations and corpus versioning, which allows standardizing corpus creation, supports corpus evolution, and paves the way for the use of lenses to automatically transform between different corpus configurations. In addition, the use of clearly defined scoring rules and evaluation metrics ensures a better comparability of evaluation results.
+ R19-1152 + 10.26615/978-954-452-056-4_152 + + + Sequential Graph Dependency Parser + SeanWelleck + KyunghyunCho + 1338–1345 + We propose a method for non-projective dependency parsing by incrementally predicting a set of edges. Since the edges do not have a pre-specified order, we propose a set-based learning method. Our method blends graph, transition, and easy-first parsing, including a prior state of the parser as a special case. The proposed transition-based method successfully parses near the state of the art on both projective and non-projective languages, without assuming a certain parsing order. + R19-1153 + 10.26615/978-954-452-056-4_153 + + + Term-Based Extraction of Medical Information: Pre-Operative Patient Education Use Case + MartinWolf + VolhaPetukhova + DietrichKlakow + 1346–1355 + The processing of medical information is not a trivial task for medical non-experts. The paper presents an artificial assistant designed to facilitate reliable access to medical online content. Interactions are modelled as doctor-patient Question Answering sessions within a pre-operative patient education scenario where the system addresses the patient’s information needs by explaining medical events and procedures. This implies accurate medical information extraction from, and reasoning with, available medical knowledge and large amounts of unstructured multilingual online data. Bridging the gap between medical knowledge and data, we explore a language-agnostic approach to medical concept mining from standard terminologies, and the data-driven collection of the corresponding seed terms in a distant supervision setting for German. Experimenting with different terminologies, features and term matching strategies, we achieved a promising F-score of 0.91 on the medical term extraction task. The concepts and terms are used to search and retrieve definitions from verified free online resources. The proof-of-concept definition retrieval system is designed and evaluated, showing promising results that are acceptable to humans in 92% of cases. + R19-1154 + 10.26615/978-954-452-056-4_154 + + + A Survey of the Perceived Text Adaptation Needs of Adults with Autism + VictoriaYaneva + ConstantinOrasan + Le AnHa + NataliaPonomareva + 1356–1363 + NLP approaches to automatic text adaptation often rely on user-need guidelines which are generic and do not account for the differences between various types of target groups. One such group are adults with high-functioning autism, who are usually able to read long sentences and comprehend difficult words but whose comprehension may be impeded by other linguistic constructions. This is especially challenging for real-world user-generated texts such as product reviews, which cannot be controlled editorially and are thus a particularly good application for automatic text adaptation systems. In this paper we present a mixed-methods survey conducted with 24 adult web-users diagnosed with autism and an age-matched control group of 33 neurotypical participants. The aim of the survey was to identify whether the group with autism experienced any barriers when reading online reviews, what these potential barriers were, and what NLP methods would be best suited to improve the accessibility of online reviews for people with autism.
The group with autism consistently reported significantly greater difficulties with understanding online product reviews compared to the control group and identified issues related to text length, poor topic organisation, and the use of irony and sarcasm. + R19-1155 + 10.26615/978-954-452-056-4_155 + + + An Open, Extendible, and Fast <fixed-case>T</fixed-case>urkish Morphological Analyzer + Olcay TanerYıldız + BegümAvar + GökhanErcan + 1364–1372 + In this paper, we present a two-level morphological analyzer for Turkish. The morphological analyzer consists of five main components: finite state transducer, rule engine for suffixation, lexicon, trie data structure, and LRU cache. We use the Java language to implement the finite state machine logic and the rule engine, and the XML language to describe the finite state transducer rules of Turkish, which makes the morphological analyzer both easily extendible and easily applicable to other languages. Empowered with the comprehensiveness of a lexicon of 54,000 bare-forms including 19,000 proper nouns, our morphological analyzer is one of the most reliable analyzers produced so far. The analyzer is compared with Turkish morphological analyzers in the literature. By using an LRU cache and a trie data structure, the system can analyze 100,000 words per second, which enables users to analyze huge corpora in a few hours. + R19-1156 + 10.26615/978-954-452-056-4_156 + + + Self-Attention Networks for Intent Detection + SevinjYolchuyeva + GézaNémeth + BálintGyires-Tóth + 1373–1379 + Self-attention networks (SAN) have shown promising performance in various Natural Language Processing (NLP) scenarios, especially in machine translation. One of the main strengths of SANs is their ability to capture long-range and multi-scale dependencies in the data. In this paper, we present a novel intent detection system which is based on a self-attention network and a Bi-LSTM. Our approach shows improvement by using a transformer model and deep averaging network-based universal sentence encoder compared to previous solutions. We evaluate the system on the Snips, Smart Speaker, Smart Lights, and ATIS datasets using different evaluation metrics. The performance of the proposed model is compared with an LSTM on the same datasets. + R19-1157 + 10.26615/978-954-452-056-4_157 + + + <fixed-case>T</fixed-case>urkish Tweet Classification with Transformer Encoder + Atıf EmreYüksel + Yaşar AlimTürkmen + ArzucanÖzgür + BernaAltınel + 1380–1387 + Short-text classification is a challenging task, due to the sparsity and high dimensionality of the feature space. In this study, we aim to analyze and classify Turkish tweets based on their topics. Social media jargon and the agglutinative structure of the Turkish language make this classification task even harder. As far as we know, this is the first study that uses a Transformer Encoder for short text classification in Turkish. The model is trained in a weakly supervised manner, where the training data set has been labeled automatically. Our results on the test set, which has been manually labeled, show that performing morphological analysis improves the classification performance of the traditional machine learning algorithms Random Forest, Naive Bayes, and Support Vector Machines. Still, the proposed approach achieves an F-score of 89.3%, outperforming those algorithms by at least 5 points.
+ R19-1158 + 10.26615/978-954-452-056-4_158 + + + Multilingual Dynamic Topic Model + ElaineZosa + MarkGranroth-Wilding + 1388–1396 + Dynamic topic models (DTMs) capture the evolution of topics and trends in time series data. Current DTMs are applicable only to monolingual datasets. In this paper we present the multilingual dynamic topic model (ML-DTM), a novel topic model that combines DTM with an existing multilingual topic modeling method to capture cross-lingual topics that evolve across time. We present results of this model on a parallel German-English corpus of news articles and a comparable corpus of Finnish and Swedish news articles. We demonstrate the capability of ML-DTM to track significant events related to a topic and show that it finds distinct topics and performs as well as existing multilingual topic models in aligning cross-lingual topics. + R19-1159 + 10.26615/978-954-452-056-4_159 + + + A Wide-Coverage Context-Free Grammar for <fixed-case>I</fixed-case>celandic and an Accompanying Parsing System + VilhjálmurÞorsteinsson + HuldaÓladóttir + HrafnLoftsson + 1397–1404 + We present an open-source, wide-coverage context-free grammar (CFG) for Icelandic, and an accompanying parsing system. The grammar has over 5,600 nonterminals, 4,600 terminals and 19,000 productions in fully expanded form, with feature agreement constraints for case, gender, number and person. The parsing system consists of an enhanced Earley-based parser and a mechanism to select best-scoring parse trees from shared packed parse forests. Our parsing system is able to parse about 90% of all sentences in articles published on the main Icelandic news websites. Preliminary evaluation with evalb shows an F-measure of 70.72% on parsed sentences. Our system demonstrates that parsing a morphologically rich language using a wide-coverage CFG can be practical. + R19-1160 + 10.26615/978-954-452-056-4_160 + +
+ + + Proceedings of the Student Research Workshop Associated with RANLP 2019 + R19-2 + University of BarcelonaVenelin Kovatchev + IrinaTemnikova + Belgrade UniversityBranislava Šandrih + Bulgarian Academy of SciencesIvelina Nikolova + OntotextAD + INCOMA Ltd. +
Varna, Bulgaria
+ September + 2019 + + + R19-2000 + + + Normalization of <fixed-case>K</fixed-case>azakh Texts + AssinaAbdussaitova + AlinaAmangeldiyeva + 1–6 + The Kazakh language, like other agglutinative languages, has specific difficulties in both recognizing wrong words and generating corrections for misspelt words. The main goal of this work is to develop a better algorithm for the normalization of Kazakh texts based on traditional and Machine Learning methods, as well as a new approach, which is also considered in this paper. The selection among normalization methods was conducted by means of a comparative analysis, whose results proved successful and are shown in detail. + R19-2001 + 10.26615/issn.2603-2821.2019_001 + + + Classification Approaches to Identify Informative Tweets + PiushAggarwal + 7–15 + Social media platforms have become prime forums for reporting news, with users sharing what they saw, heard or read on social media. News from social media is potentially useful for various stakeholders including aid organizations, news agencies, and individuals. However, social media also contains a vast amount of non-news content. For users to be able to draw on benefits from news reported on social media, it is necessary to reliably identify news content and differentiate it from non-news. In this paper, we tackle the challenge of classifying a social post as news or not. To this end, we provide a new manually annotated dataset containing 2,992 tweets from 5 different topical categories. Unlike earlier datasets, it includes posts by personal users who do not promote a business or a product and are not affiliated with any organization. We also investigate various baseline systems and evaluate their performance on the newly generated dataset. Our results show that the best classifiers are the SVM and BERT models. + R19-2002 + 10.26615/issn.2603-2821.2019_002 + + + Dialect-Specific Models for Automatic Speech Recognition of <fixed-case>A</fixed-case>frican <fixed-case>A</fixed-case>merican Vernacular <fixed-case>E</fixed-case>nglish + RachelDorn + 16–20 + African American Vernacular English (AAVE) is a widely-spoken dialect of English, yet it is under-represented in major speech corpora. As a result, speakers of this dialect are often misunderstood by NLP applications. This study explores the effect on transcription accuracy of an automatic voice recognition system when AAVE data is used. Models trained on AAVE data and on Standard American English data were compared to a baseline model trained on a combination of the two dialects. The accuracy for both dialect-specific models was significantly higher than the baseline model, with the AAVE model showing over 18% improvement. By isolating the effect of having AAVE speakers in the training data, this study highlights the importance of increasing diversity in the field of natural language processing. + R19-2003 + 10.26615/issn.2603-2821.2019_003 + + + Multilingual Language Models for Named Entity Recognition in <fixed-case>G</fixed-case>erman and <fixed-case>E</fixed-case>nglish + AntoniaBaumann + 21–27 + We assess the language specificity of recent language models by exploring the potential of a multilingual language model. In particular, we evaluate Google’s multilingual BERT (mBERT) model on Named Entity Recognition (NER) in German and English. We expand the work on language model fine-tuning by Howard and Ruder (2018), applying it to the BERT architecture.
We successfully reproduce the NER results published by Devlin et al. (2019). Our results show that the multilingual language model generalises well for NER in the chosen languages, matching the native model in English and comparing well with recent approaches for German. However, it does not benefit from the added fine-tuning methods. + R19-2004 + 10.26615/issn.2603-2821.2019_004 + + + Parts of Speech Tagging for <fixed-case>K</fixed-case>annada + SwaroopL R + RakshithGowda G S + SourabhU + ShriramHegde + 28–31 + Parts of speech (POS) tagging is the process of assigning the part of speech tag to each and every word in a sentence. In this paper, we present a POS tagger for Kannada, a low-resource South Asian language, using Conditional Random Fields. The POS tagger developed in this work uses novel features native to the Kannada language. The novel features include Sandhi splitting, where a compound word is broken down into two or more meaningful constituent words. The proposed model is trained and tested on a tagged dataset which contains 21 thousand sentences and achieves a best accuracy of 94.56%. + R19-2005 + 10.26615/issn.2603-2821.2019_005 + + + Cross-Lingual Coreference: The Case of <fixed-case>B</fixed-case>ulgarian and <fixed-case>E</fixed-case>nglish + ZaraKancheva + 32–38 + The paper presents several common approaches towards cross- and multi-lingual coreference resolution in search of the most effective practices to be applied within the work on Bulgarian-English manual coreference annotation of a short story. The work aims at outlining the typology of the differences in the annotated parallel texts. The results of the research prove to be comparable with the tendencies observed in similar works on other Slavic languages and show surprising differences between the types of markables and their frequency in Bulgarian and English. + R19-2006 + 10.26615/issn.2603-2821.2019_006 + + + Towards Accurate Text Verbalization for <fixed-case>ASR</fixed-case> Based on Audio Alignment + DianaGeneva + GeorgiShopov + 39–47 + Verbalization of non-lexical linguistic units plays an important role in language modeling for automatic speech recognition systems. Most verbalization methods require valuable resources such as ground truth, a large training corpus and expert knowledge, which are often unavailable. On the other hand, a considerable amount of audio data along with its transcribed text is freely available on the Internet and could be utilized for the task of verbalization. This paper presents a methodology for accurate verbalization of audio transcriptions based on phone-level alignment between the transcriptions and their corresponding audio recordings. Comparing this approach to a more general rule-based verbalization method shows a significant improvement in ASR recognition of non-lexical units. In the process of evaluating this approach we also expose the indirect influence of verbalization accuracy on the quality of acoustic models trained on automatically derived speech corpora. + R19-2007 + 10.26615/issn.2603-2821.2019_007 + + + Evaluation of Stacked Embeddings for <fixed-case>B</fixed-case>ulgarian on the Downstream Tasks <fixed-case>POS</fixed-case> and <fixed-case>NERC</fixed-case> + IvaMarinova + 48–54 + This paper reports on experiments with different stacks of word embeddings and evaluation of their usefulness for Bulgarian downstream tasks such as Named Entity Recognition and Classification (NERC) and Part-of-speech (POS) Tagging.
Word embeddings remain at the core of NLP development, with several key language models created over the last two years, such as FastText (CITATION), ELMo (CITATION), BERT (CITATION) and Flair (CITATION). Stacking or combining different word embeddings is another technique used in this paper that has not yet been reported for Bulgarian NERC. A well-established architecture for the sequence tagging task, BiLSTM-CRF, is used, and different pre-trained language models are combined in the embedding layer to decide which combination of them scores best. + R19-2008 + 10.26615/issn.2603-2821.2019_008 + + + Overview on <fixed-case>NLP</fixed-case> Techniques for Content-based Recommender Systems for Books + MelaniaBerbatova + 55–61 + Recommender systems are an essential part of today’s largest websites. Without them, it would be hard for users to find the right products and content. One of the most popular methods for recommendations is content-based filtering. It relies on analysing product metadata, a great part of which is textual data. Despite their frequent use, there is still no standard procedure for developing and evaluating content-based recommenders. In this paper, we will first examine current approaches for designing, training and evaluating recommender systems based on textual data for book recommendations on the GoodReads website. We will critique existing methods and suggest how natural language techniques can be employed for the improvement of content-based recommenders. + R19-2009 + 10.26615/issn.2603-2821.2019_009 + + + Corpora and Processing Tools for Non-standard Contemporary and Diachronic Balkan <fixed-case>S</fixed-case>lavic + TeodoraVukovic + NoraMuheim + OlivierWinistörfer + IvanŠimko + AnastasiaMakarova + SanjaBradjan + 62–68 + The paper describes three corpora of different varieties of BS that are currently being developed with the goal of providing data for the analysis of the diatopic and diachronic variation in non-standard Balkan Slavic. The corpora include spoken materials from Torlak, Macedonian dialects, as well as the manuscripts of pre-standardized Bulgarian. Apart from the texts, tools for PoS annotation and lemmatization for all varieties are being created, as well as syntactic parsing for Torlak and Bulgarian varieties. The corpora are built using a unified methodology, relying on the best practices and state-of-the-art methods from the field. The uniform methodology allows the contrastive analysis of the data from different varieties. The corpora under construction can be considered a crucial contribution to the linguistic research on the languages in the Balkans as they provide the missing data needed for studies of linguistic variation in Balkan Slavic, and enable the comparison of the said varieties with other neighbouring languages. + R19-2010 + 10.26615/issn.2603-2821.2019_010 + + + Question Answering Systems Approaches and Challenges + ReemAlqifari + 69–75 + Question answering (QA) systems permit the user to ask a question using natural language, and the system provides a concise and correct answer. QA systems can be implemented for different types of datasets, structured or unstructured. In this paper, some of the recent studies are reviewed and their limitations discussed. The current issues are then analyzed together with the proposed solutions.
+ R19-2011 + 10.26615/issn.2603-2821.2019_011 + + + Adding Linguistic Knowledge to <fixed-case>NLP</fixed-case> Tasks for <fixed-case>B</fixed-case>ulgarian: The Verb Paradigm Patterns + IvayloRadev + 76–82 + This paper discusses some possible uses of an unexplored lexical language resource containing Bulgarian verb paradigms and their English translations. This type of data can be used for machine translation, generation of pseudo corpora/language exercises, and evaluation of parsers. Upon completion, the resource will be linked with other existing resources such as the morphological lexicon, the valency lexicon, and BTB-WordNet. + R19-2012 + 10.26615/issn.2603-2821.2019_012 + + + Multilingual Complex Word Identification: Convolutional Neural Networks with Morphological and Linguistic Features + Kim ChengSHEANG + 83–89 + The paper describes our experiments with a Complex Word Identification system using a deep learning approach with word embeddings and engineered features. + R19-2013 + 10.26615/issn.2603-2821.2019_013 + + + Neural Network-based Models with Commonsense Knowledge for Machine Reading Comprehension + DenisSmirnov + 90–94 + State-of-the-art machine reading comprehension models are capable of producing answers for factual questions about a given piece of text. However, some types of questions require commonsense knowledge which cannot be inferred from the given text passage. Thus, external semantic information could enhance the performance of these models. This PhD research proposal provides a brief overview of some existing machine reading comprehension datasets and models and outlines possible ways of their improvement. + R19-2014 + 10.26615/issn.2603-2821.2019_014 + +
+
diff --git a/data/xml/W17.xml b/data/xml/W17.xml index 63b56dd26a..c959c8dd17 100644 --- a/data/xml/W17.xml +++ b/data/xml/W17.xml @@ -21,206 +21,206 @@ A Morphological Parser for Odawa - Dustin Bowers - Antti Arppe - Jordan Lachler - Sjur Moshagen - Trond Trosterud + DustinBowers + AnttiArppe + JordanLachler + SjurMoshagen + TrondTrosterud 1–9 W17-0101 10.18653/v1/W17-0101 Creating lexical resources for polysynthetic languages—the case of <fixed-case>A</fixed-case>rapaho - Ghazaleh Kazeminejad - Andrew Cowell - Mans Hulden + GhazalehKazeminejad + AndrewCowell + MansHulden 10–18 W17-0102 10.18653/v1/W17-0102 From Small to Big Data: paper manuscripts to <fixed-case>RDF</fixed-case> triples of <fixed-case>A</fixed-case>ustralian Indigenous Vocabularies - Nick Thieberger - Conal Tuohy + NickThieberger + ConalTuohy 19–23 W17-0103 10.18653/v1/W17-0103 Issues in digital text representation, on-line dissemination, sharing and re-use for <fixed-case>A</fixed-case>frican minority languages - Emmanuel Ngué Um + Emmanuel NguéUm 24–32 W17-0104 10.18653/v1/W17-0104 Developing collection management tools to create more robust and reliable linguistic data - Gary Holton - Kavon Hooshiar - Nick Thieberger + GaryHolton + KavonHooshiar + NickThieberger 33–38 W17-0105 10.18653/v1/W17-0105 <fixed-case>STREAMLI</fixed-case>n<fixed-case>ED</fixed-case> Challenges: Aligning Research Interests with Shared Tasks - Gina-Anne Levow - Emily M. Bender - Patrick Littell - Kristen Howell - Shobhana Chelliah - Joshua Crowgey - Dan Garrette - Jeff Good - Sharon Hargus - David Inman - Michael Maxwell - Michael Tjalve - Fei Xia + Gina-AnneLevow + Emily M.Bender + PatrickLittell + KristenHowell + ShobhanaChelliah + JoshuaCrowgey + DanGarrette + JeffGood + SharonHargus + DavidInman + MichaelMaxwell + MichaelTjalve + FeiXia 39–47 W17-0106 10.18653/v1/W17-0106 Work With What You’ve Got - Lucy Bell - Lawrence Bell + LucyBell + LawrenceBell 48–51 W17-0107 10.18653/v1/W17-0107 Converting a comprehensive lexical database into a computational model: The case of East Cree verb inflection - Antti Arppe - Marie-Odile Junker - Delasie Torkornoo + AnttiArppe + Marie-OdileJunker + DelasieTorkornoo 52–56 W17-0108 10.18653/v1/W17-0108 Instant annotations in <fixed-case>ELAN</fixed-case> corpora of spoken and written <fixed-case>K</fixed-case>omi, an endangered language of the Barents Sea region - Ciprian Gerstenberger - Niko Partanen - Michael Rießler + CiprianGerstenberger + NikoPartanen + MichaelRießler 57–66 W17-0109 10.18653/v1/W17-0109 Inferring Case Systems from <fixed-case>IGT</fixed-case>: Enriching the Enrichment - Kristen Howell - Emily M. Bender - Michel Lockwood - Fei Xia - Olga Zamaraeva + KristenHowell + Emily M.Bender + MichelLockwood + FeiXia + OlgaZamaraeva 67–75 W17-0110 10.18653/v1/W17-0110 Case Studies in the Automatic Characterization of Grammars from Small Wordlists - Jordan Kodner - Spencer Kaplan - Hongzhi Xu - Mitchell P. 
Marcus - Charles Yang + JordanKodner + SpencerCaplan + HongzhiXu + Mitchell P.Marcus + CharlesYang 76–84 W17-0111 10.18653/v1/W17-0111 Endangered Data for Endangered Languages: Digitizing Print dictionaries - Michael Maxwell - Aric Bills + MichaelMaxwell + AricBills 85–91 W17-0112 10.18653/v1/W17-0112 A computationally-assisted procedure for discovering poetic organization within oral tradition - David Meyer + DavidMeyer 92–100 W17-0113 10.18653/v1/W17-0113 Improving Coverage of an <fixed-case>I</fixed-case>nuktitut Morphological Analyzer Using a Segmental Recurrent Neural Network - Jeffrey Micher + JeffreyMicher 101–106 W17-0114 10.18653/v1/W17-0114 Click reduction in fluent speech: a semi-automated analysis of <fixed-case>M</fixed-case>angetti Dune !<fixed-case>X</fixed-case>ung - Amanda Miller - Micha Elsner + AmandaMiller + MichaElsner 107–115 W17-0115 10.18653/v1/W17-0115 <fixed-case>DECCA</fixed-case> Repurposed: Detecting transcription inconsistencies without an orthographic standard - C. Anton Rytting - Julie Yelle + C. AntonRytting + JulieYelle 116–121 W17-0116 10.18653/v1/W17-0116 Jejueo talking dictionary: A collaborative online database for language revitalization - Moira Saltzman + MoiraSaltzman 122–129 W17-0117 10.18653/v1/W17-0117 Computational Support for Finding Word Classes: A Case Study of Abui - Olga Zamaraeva - František Kratochvíl - Emily M. Bender - Fei Xia - Kristen Howell + OlgaZamaraeva + FrantišekKratochvíl + Emily M.Bender + FeiXia + KristenHowell 130–140 W17-0118 10.18653/v1/W17-0118 Waldayu and Waldayu Mobile: Modern digital dictionary interfaces for endangered languages - Patrick Littell - Aidan Pine - Henry Davis + PatrickLittell + AidanPine + HenryDavis 141–150 W17-0119 10.18653/v1/W17-0119 Connecting Documentation and Revitalization: A New Approach to Language Apps - Alexa N. 
Little + Alexa N.Little 151–155 W17-0120 10.18653/v1/W17-0120 Developing a Suite of Mobile Applications for Collaborative Language Documentation - Mat Bettinson - Steven Bird + MatBettinson + StevenBird 156–164 W17-0121 10.18653/v1/W17-0121 Cross-language forced alignment to assist community-based linguistics for low resource languages - Timothy Kempton + TimothyKempton 165–169 W17-0122 10.18653/v1/W17-0122 A case study on using speech-to-translation alignments for language documentation - Antonios Anastasopoulos - David Chiang + AntoniosAnastasopoulos + DavidChiang 170–178 W17-0123 10.18653/v1/W17-0123 @@ -242,384 +242,384 @@ Joint <fixed-case>UD</fixed-case> Parsing of <fixed-case>N</fixed-case>orwegian <fixed-case>B</fixed-case>okmål and Nynorsk - Erik Velldal - Lilja Øvrelid - Petter Hohle + ErikVelldal + LiljaØvrelid + PetterHohle 1–10 W17-0201 Replacing <fixed-case>OOV</fixed-case> Words For Dependency Parsing With Distributional Semantics - Prasanth Kolachina - Martin Riedl - Chris Biemann + PrasanthKolachina + MartinRiedl + ChrisBiemann 11–19 W17-0202 Real-valued Syntactic Word Vectors (<fixed-case>RSV</fixed-case>) for Greedy Neural Dependency Parsing - Ali Basirat - Joakim Nivre + AliBasirat + JoakimNivre 20–28 W17-0203 Tagging Named Entities in 19th Century and Modern <fixed-case>F</fixed-case>innish Newspaper Material with a <fixed-case>F</fixed-case>innish Semantic Tagger - Kimmo Kettunen - Laura Löfberg + KimmoKettunen + LauraLöfberg 29–36 W17-0204 Machine Learning for Rhetorical Figure Detection: More Chiasmus with Less Annotation - Marie Dubremetz - Joakim Nivre + MarieDubremetz + JoakimNivre 37–45 W17-0205 Coreference Resolution for <fixed-case>S</fixed-case>wedish and <fixed-case>G</fixed-case>erman using Distant Supervision - Alexander Wallin - Pierre Nugues + AlexanderWallin + PierreNugues 46–55 W17-0206 Aligning phonemes using finte-state methods - Kimmo Koskenniemi + KimmoKoskenniemi 56–64 W17-0207 Acoustic Model Compression with <fixed-case>MAP</fixed-case> adaptation - Katri Leino - Mikko Kurimo + KatriLeino + MikkoKurimo 65–69 W17-0208 <fixed-case>OCR</fixed-case> and post-correction of historical <fixed-case>F</fixed-case>innish texts - Senka Drobac - Pekka Kauppinen - Krister Lindén + SenkaDrobac + PekkaKauppinen + KristerLindén 70–76 W17-0209 Twitter Topic Modeling by Tweet Aggregation - Asbjørn Steinskog - Jonas Therkelsen - Björn Gambäck + AsbjørnSteinskog + JonasTherkelsen + BjörnGambäck 77–86 W17-0210 A Multilingual Entity Linker Using <fixed-case>P</fixed-case>age<fixed-case>R</fixed-case>ank and Semantic Graphs - Anton Södergren - Pierre Nugues + AntonSödergren + PierreNugues 87–95 W17-0211 Linear Ensembles of Word Embedding Models - Avo Muromägi - Kairit Sirts - Sven Laur + AvoMuromägi + KairitSirts + SvenLaur 96–104 W17-0212 Using Pseudowords for Algorithm Comparison: An Evaluation Framework for Graph-based Word Sense Induction - Flavio Massimiliano Cecchini - Chris Biemann - Martin Riedl + FlavioMassimiliano Cecchini + ChrisBiemann + MartinRiedl 105–114 W17-0213 North-Sámi to <fixed-case>F</fixed-case>innish rule-based machine translation system - Tommi Pirinen - Francis M. Tyers - Trond Trosterud - Ryan Johnson - Kevin Unhammer - Tiina Puolakainen + TommiPirinen + Francis M.Tyers + TrondTrosterud + RyanJohnson + KevinUnhammer + TiinaPuolakainen 115–122 W17-0214 Machine translation with North Saami as a pivot language - Lene Antonsen - Ciprian Gerstenberger - Maja Kappfjell - Sandra Nystø Rahka - Marja-Liisa Olthuis - Trond Trosterud - Francis M. 
Tyers + LeneAntonsen + CiprianGerstenberger + MajaKappfjell + SandraNystø Rahka + Marja-LiisaOlthuis + TrondTrosterud + Francis M.Tyers 123–131 W17-0215 <fixed-case>SWEGRAM</fixed-case> – A Web-Based Tool for Automatic Annotation and Analysis of <fixed-case>S</fixed-case>wedish Texts - Jesper Näsman - Beáta Megyesi - Anne Palmér + JesperNäsman + BeátaMegyesi + AnnePalmér 132–141 W17-0216 Optimizing a <fixed-case>P</fixed-case>o<fixed-case>S</fixed-case> Tagset for <fixed-case>N</fixed-case>orwegian Dependency Parsing - Petter Hohle - Lilja Øvrelid - Erik Velldal + PetterHohle + LiljaØvrelid + ErikVelldal 142–151 W17-0217 Creating register sub-corpora for the <fixed-case>F</fixed-case>innish <fixed-case>I</fixed-case>nternet Parsebank - Veronika Laippala - Juhani Luotolahti - Aki-Juhani Kyröläinen - Tapio Salakoski - Filip Ginter + VeronikaLaippala + JuhaniLuotolahti + Aki-JuhaniKyröläinen + TapioSalakoski + FilipGinter 152–161 W17-0218 <fixed-case>KILLE</fixed-case>: a Framework for Situated Agents for Learning Language Through Interaction - Simon Dobnik - Erik de Graaf + SimonDobnik + Erikde Graaf 162–171 W17-0219 Data Collection from Persons with Mild Forms of Cognitive Impairment and Healthy Controls - Infrastructure for Classification and Prediction of Dementia - Dimitrios Kokkinakis - Kristina Lundholm Fors - Eva Björkner - Arto Nordlund + DimitriosKokkinakis + KristinaLundholm Fors + EvaBjörkner + ArtoNordlund 172–182 W17-0220 Evaluation of language identification methods using 285 languages - Tommi Jauhiainen - Krister Lindén - Heidi Jauhiainen + TommiJauhiainen + KristerLindén + HeidiJauhiainen 183–191 W17-0221 Can We Create a Tool for General Domain Event Analysis? - Siim Orasmaa - Heiki-Jaan Kaalep + SiimOrasmaa + Heiki-JaanKaalep 192–201 W17-0222 From Treebank to <fixed-case>P</fixed-case>ropbank: A Semantic-Role and <fixed-case>V</fixed-case>erb<fixed-case>N</fixed-case>et Corpus for <fixed-case>D</fixed-case>anish - Eckhard Bick + EckhardBick 202–210 W17-0223 Cross-lingual Learning of Semantic Textual Similarity with Multilingual Word Representations - Johannes Bjerva - Robert Östling + JohannesBjerva + RobertÖstling 211–215 W17-0224 Will my auxiliary tagging task help? 
Estimating Auxiliary Tasks Effectivity in Multi-Task Learning - Johannes Bjerva + JohannesBjerva 216–220 W17-0225 Iconic Locations in <fixed-case>S</fixed-case>wedish Sign Language: Mapping Form to Meaning with Lexical Databases - Carl Börstell - Robert Östling + CarlBörstell + RobertÖstling 221–225 W17-0226 <fixed-case>D</fixed-case>ocforia: A Multilayer Document Model - Marcus Klang - Pierre Nugues + MarcusKlang + PierreNugues 226–230 W17-0227 <fixed-case>F</fixed-case>innish resources for evaluating language model semantics - Viljami Venekoski - Jouko Vankka + ViljamiVenekoski + JoukoVankka 231–236 W17-0228 <fixed-case>M</fixed-case>álrómur: A Manually Verified Corpus of Recorded <fixed-case>I</fixed-case>celandic Speech - Steinþór Steingrímsson - Jón Guðnason - Sigrún Helgadóttir - Eiríkur Rögnvaldsson + SteinþórSteingrímsson + JónGuðnason + SigrúnHelgadóttir + EiríkurRögnvaldsson 237–240 W17-0229 The Effect of Translationese on Tuning for Statistical Machine Translation - Sara Stymne + SaraStymne 241–246 W17-0230 Multilingwis² – Explore Your Parallel Corpus - Johannes Graën - Dominique Sandoz - Martin Volk + JohannesGraën + DominiqueSandoz + MartinVolk 247–250 W17-0231 A modernised version of the Glossa corpus search system - Anders Nøklestad - Kristin Hagen - Janne Bondi Johannessen - Michał Kosek - Joel Priestley + AndersNøklestad + KristinHagen + JanneBondi Johannessen + MichałKosek + JoelPriestley 251–254 W17-0232 <fixed-case>D</fixed-case>ep_search: Efficient Search Tool for Large Dependency Parsebanks - Juhani Luotolahti - Jenna Kanerva - Filip Ginter + JuhaniLuotolahti + JennaKanerva + FilipGinter 255–258 W17-0233 Proto-<fixed-case>I</fixed-case>ndo-<fixed-case>E</fixed-case>uropean Lexicon: The Generative Etymological Dictionary of <fixed-case>I</fixed-case>ndo-<fixed-case>E</fixed-case>uropean Languages - Jouna Pyysalo + JounaPyysalo 259–262 W17-0234 Tilde <fixed-case>MODEL</fixed-case> - Multilingual Open Data for <fixed-case>EU</fixed-case> Languages - Roberts Rozis - Raivis Skadiņš + RobertsRozis + RaivisSkadiņš 263–265 W17-0235 Mainstreaming August Strindberg with Text Normalization - Adam Ek - Sofia Knuutinen + AdamEk + SofiaKnuutinen 266–270 W17-0236 Word vectors, reuse, and replicability: Towards a community repository of large-text resources - Murhaf Fares - Andrey Kutuzov - Stephan Oepen - Erik Velldal + MurhafFares + AndreyKutuzov + StephanOepen + ErikVelldal 271–276 W17-0237 Improving Optical Character Recognition of <fixed-case>F</fixed-case>innish Historical Newspapers with a Combination of Fraktur & Antiqua Models and Image Preprocessing - Mika Koistinen - Kimmo Kettunen - Tuula Pääkkönen + MikaKoistinen + KimmoKettunen + TuulaPääkkönen 277–283 W17-0238 Redefining Context Windows for Word Embedding Models: An Experimental Study - Pierre Lison - Andrey Kutuzov + PierreLison + AndreyKutuzov 284–288 W17-0239 The Effect of Excluding Out of Domain Training Data from Supervised Named-Entity Recognition - Adam Persson + AdamPersson 289–292 W17-0240 Quote Extraction and Attribution from <fixed-case>N</fixed-case>orwegian Newspapers - Andrew Salway - Paul Meurer - Knut Hofland - Øystein Reigem + AndrewSalway + PaulMeurer + KnutHofland + ØysteinReigem 293–297 W17-0241 <fixed-case>W</fixed-case>ordnet extension via word embeddings: Experiments on the <fixed-case>N</fixed-case>orwegian <fixed-case>W</fixed-case>ordnet - Heidi Sand - Erik Velldal - Lilja Øvrelid + HeidiSand + ErikVelldal + LiljaØvrelid 298–302 W17-0242 Universal Dependencies for 
<fixed-case>S</fixed-case>wedish Sign Language - Robert Östling - Carl Börstell - Moa Gärdenfors - Mats Wirén + RobertÖstling + CarlBörstell + MoaGärdenfors + MatsWirén 303–308 W17-0243 Services for text simplification and analysis - Johan Falkenjack - Evelina Rennes - Daniel Fahlborg - Vida Johansson - Arne Jönsson + JohanFalkenjack + EvelinaRennes + DanielFahlborg + VidaJohansson + ArneJönsson 309–313 W17-0244 Exploring Properties of Intralingual and Interlingual Association Measures Visually - Johannes Graën - Christof Bless + JohannesGraën + ChristofBless 314–317 W17-0245 <fixed-case>TALERUM</fixed-case> - Learning <fixed-case>D</fixed-case>anish by Doing <fixed-case>D</fixed-case>anish - Peter Juel Henrichsen + PeterJuel Henrichsen 318–321 W17-0246 Cross-Lingual Syntax: Relating Grammatical Framework with Universal Dependencies - Aarne Ranta - Prasanth Kolachina - Thomas Hallgren + AarneRanta + PrasanthKolachina + ThomasHallgren 322–325 W17-0247 Exploring Treebanks with <fixed-case>INESS</fixed-case> Search - Victoria Rosén - Helge Dyvik - Paul Meurer - Koenraad De Smedt + VictoriaRosén + HelgeDyvik + PaulMeurer + KoenraadDe Smedt 326–329 W17-0248 A System for Identifying and Exploring Text Repetition in Large Historical Document Corpora - Aleksi Vesanto - Filip Ginter - Hannu Salmi - Asko Nivala - Tapio Salakoski + AleksiVesanto + FilipGinter + HannuSalmi + AskoNivala + TapioSalakoski 330–333 W17-0249 @@ -642,56 +642,56 @@ Learning with learner corpora: Using the <fixed-case>TLE</fixed-case> for native language identification - Allison Adams - Sara Stymne + AllisonAdams + SaraStymne 1-7 W17-0301 Challenging learners in their individual zone of proximal development using pedagogic developmental benchmarks of syntactic complexity - Xiaobin Chen - Detmar Meurers + XiaobinChen + DetmarMeurers 8-17 W17-0302 Crossing the border twice: Reimporting prepositions to alleviate <fixed-case>L</fixed-case>1-specific transfer errors - Johannes Graën - Gerold Schneider + JohannesGraën + GeroldSchneider 18-26 W17-0303 <fixed-case>R</fixed-case>evita: a system for language learning and supporting endangered languages - Anisia Katinskaia - Javad Nouri - Roman Yangarber + AnisiaKatinskaia + JavadNouri + RomanYangarber 27-35 W17-0304 Developing a web-based workbook for <fixed-case>E</fixed-case>nglish supporting the interaction of students and teachers - Björn Rudzewitz - Ramon Ziai - Kordula De Kuthy - Detmar Meurers + BjörnRudzewitz + RamonZiai + KordulaDe Kuthy + DetmarMeurers 36-46 W17-0305 Annotating errors in student texts: First experiences and experiments - Sara Stymne - Eva Pettersson - Beáta Megyesi - Anne Palmér + SaraStymne + EvaPettersson + BeátaMegyesi + AnnePalmér 47-60 W17-0306 Building and using language resources and infrastructure to develop e-learning programs for a minority language - Heli Uibo - Jack Rueter - Sulev Iva + HeliUibo + JackRueter + SulevIva 61-67 W17-0307 @@ -713,138 +713,138 @@ Cross-Lingual Parser Selection for Low-Resource Languages - Željko Agić + ŽeljkoAgić 1–10 W17-0401 <fixed-case>S</fixed-case>wedish Prepositions are not Pure Function Words - Lars Ahrenberg + LarsAhrenberg 11–18 W17-0402 Increasing Return on Annotation Investment: The Automatic Construction of a Universal Dependency Treebank for <fixed-case>D</fixed-case>utch - Gosse Bouma - Gertjan van Noord + GosseBouma + Gertjanvan Noord 19–26 W17-0403 Converting the <fixed-case>T</fixed-case>ü<fixed-case>B</fixed-case>a-D/Z Treebank of <fixed-case>G</fixed-case>erman to Universal Dependencies - Çağrı 
Çöltekin - Ben Campbell - Erhard Hinrichs - Heike Telljohann + ÇağrıÇöltekin + BenCampbell + ErhardHinrichs + HeikeTelljohann 27–37 W17-0404 Universal Dependencies for <fixed-case>A</fixed-case>frikaans - Peter Dirix - Liesbeth Augustinus - Daniel van Niekerk - Frank Van Eynde + PeterDirix + LiesbethAugustinus + Danielvan Niekerk + FrankVan Eynde 38–47 W17-0405 Elliptic Constructions: Spotting Patterns in <fixed-case>UD</fixed-case> Treebanks - Kira Droganova - Daniel Zeman + KiraDroganova + DanielZeman 48–57 W17-0406 Dependency Tree Transformation with Tree Transducers - Felix Hennig - Arne Köhn + FelixHennig + ArneKöhn 58–66 W17-0407 Towards Universal Dependencies for Learner <fixed-case>C</fixed-case>hinese - John Lee - Herman Leung - Keying Li + JohnLee + HermanLeung + KeyingLi 67–71 W17-0408 Does Syntactic Informativity Predict Word Length? A Cross-Linguistic Study Based on the Universal Dependencies Corpora - Natalia Levshina + NataliaLevshina 72–78 W17-0409 <fixed-case>E</fixed-case>stonian Copular and Existential Constructions as an <fixed-case>UD</fixed-case> Annotation Problem - Kadri Muischnek - Kaili Müürisep + KadriMuischnek + KailiMüürisep 79–85 W17-0410 Universal Dependency Evaluation - Joakim Nivre - Chiao-Ting Fang + JoakimNivre + Chiao-TingFang 86–95 W17-0411 <fixed-case>U</fixed-case>dapi: Universal <fixed-case>API</fixed-case> for Universal Dependencies - Martin Popel - Zdeněk Žabokrtský - Martin Vojtek + MartinPopel + ZdeněkŽabokrtský + MartinVojtek 96–101 W17-0412 Universal Dependencies for <fixed-case>G</fixed-case>reek - Prokopis Prokopidis - Haris Papageorgiou + ProkopisProkopidis + HarisPapageorgiou 102–106 W17-0413 From Universal Dependencies to Abstract Syntax - Aarne Ranta - Prasanth Kolachina + AarneRanta + PrasanthKolachina 107–116 W17-0414 Empirically Sampling Universal Dependencies - Natalie Schluter - Željko Agić + NatalieSchluter + ŽeljkoAgić 117–122 W17-0415 Gapping Constructions in Universal Dependencies v2 - Sebastian Schuster - Matthew Lamm - Christopher D. Manning + SebastianSchuster + MatthewLamm + Christopher D.Manning 123–132 W17-0416 Toward Universal Dependencies for <fixed-case>A</fixed-case>inu - Hajime Senuma - Akiko Aizawa + HajimeSenuma + AkikoAizawa 133–139 W17-0417 Automatic Morpheme Segmentation and Labeling in Universal Dependencies Resources - Miikka Silfverberg - Mans Hulden + MiikkaSilfverberg + MansHulden 140–145 W17-0418 A Systematic Comparison of Syntactic Representations of Dependency Parsing - Guillaume Wisniewski - Ophélie Lacroix + GuillaumeWisniewski + OphélieLacroix 146–152 W17-0419 @@ -865,84 +865,84 @@ Variance in Historical Data: How bad is it and how can we profit from it for historical linguistics? 
- Stefanie Dipper + StefanieDipper 1–1 W17-0501 Improving <fixed-case>POS</fixed-case> Tagging in Old <fixed-case>S</fixed-case>panish Using <fixed-case>TEITOK</fixed-case> - Maarten Janssen - Josep Ausensi - Josep Fontana + MaartenJanssen + JosepAusensi + JosepFontana 2–6 W17-0502 The Making of the Royal Society Corpus - Jörg Knappen - Stefan Fischer - Hannah Kermes - Elke Teich - Peter Fankhauser + JörgKnappen + StefanFischer + HannahKermes + ElkeTeich + PeterFankhauser 7–11 W17-0503 Normalizing Medieval <fixed-case>G</fixed-case>erman Texts: from rules to deep learning - Natalia Korchagina + NataliaKorchagina 12–17 W17-0504 Ambiguity in Semantically Related Word Substitutions: an investigation in historical <fixed-case>B</fixed-case>ible translations - Maria Moritz - Marco Büchler + MariaMoritz + MarcoBüchler 18–23 W17-0505 The Lemlat 3.0 Package for Morphological Analysis of <fixed-case>L</fixed-case>atin - Marco Passarotti - Marco Budassi - Eleonora Litta - Paolo Ruffolo + MarcoPassarotti + MarcoBudassi + EleonoraLitta + PaoloRuffolo 24–31 W17-0506 <fixed-case>H</fixed-case>isto<fixed-case>B</fixed-case>ank<fixed-case>V</fixed-case>is: Detecting Language Change via Data Visualization - Christin Schätzle - Michael Hund - Frederik Dennig - Miriam Butt - Daniel Keim + ChristinSchätzle + MichaelHund + FrederikDennig + MiriamButt + DanielKeim 32–39 W17-0507 Comparing Rule-based and <fixed-case>SMT</fixed-case>-based Spelling Normalisation for <fixed-case>E</fixed-case>nglish Historical Texts - Gerold Schneider - Eva Pettersson - Michael Percillier + GeroldSchneider + EvaPettersson + MichaelPercillier 40–46 W17-0508 Data-driven Morphology and Sociolinguistics for Early Modern <fixed-case>D</fixed-case>utch - Marijn Schraagen - Marjo van Koppen - Feike Dietz + MarijnSchraagen + Marjovan Koppen + FeikeDietz 47–53 W17-0509 Applying <fixed-case>BLAST</fixed-case> to Text Reuse Detection in <fixed-case>F</fixed-case>innish Newspapers and Journals, 1771-1910 - Aleksi Vesanto - Asko Nivala - Heli Rantala - Tapio Salakoski - Hannu Salmi - Filip Ginter + AleksiVesanto + AskoNivala + HeliRantala + TapioSalakoski + HannuSalmi + FilipGinter 54–58 W17-0510 @@ -966,71 +966,71 @@ Synchronized Mediawiki based analyzer dictionary development - Jack Rueter - Mika Hämäläinen + JackRueter + MikaHämäläinen 1–7 W17-0601 10.18653/v1/W17-0601 <fixed-case>DEMO</fixed-case>: Giellatekno Open-source click-in-text dictionaries for bringing closely related languages into contact. 
- Jack Rueter + JackRueter 8–9 W17-0602 10.18653/v1/W17-0602 Languages under the influence: Building a database of Uralic languages - Eszter Simon - Nikolett Mus + EszterSimon + NikolettMus 10–24 W17-0603 10.18653/v1/W17-0603 Instant Annotations – Applying <fixed-case>NLP</fixed-case> Methods to the Annotation of Spoken Language Documentation Corpora - Ciprian Gerstenberger - Niko Partanen - Michael Rießler - Joshua Wilbur + CiprianGerstenberger + NikoPartanen + MichaelRießler + JoshuaWilbur 25–36 W17-0604 10.18653/v1/W17-0604 Preliminary Experiments concerning Verbal Predicative Structure Extraction from a Large <fixed-case>F</fixed-case>innish Corpus - Guersande Chaminade - Thierry Poibeau + GuersandeChaminade + ThierryPoibeau 37–55 W17-0605 10.18653/v1/W17-0605 Language technology resources and tools for Mansi: an overview - Csilla Horváth - Norbert Szilágyi - Veronika Vincze - Ágoston Nagy + CsillaHorváth + NorbertSzilágyi + VeronikaVincze + ÁgostonNagy 56–65 W17-0606 10.18653/v1/W17-0606 Annotation schemes in North Sámi dependency parsing - Francis M. Tyers - Mariya Sheyanova + Francis M.Tyers + MariyaSheyanova 66–75 W17-0607 10.18653/v1/W17-0607 A morphological analyser for Kven - Sindre Reino Trosterud - Trond Trosterud - Anna-Kaisa Räisänen - Leena Niiranen - Mervi Haavisto - Kaisa Maliniemi + SindreReino Trosterud + TrondTrosterud + Anna-KaisaRäisänen + LeenaNiiranen + MerviHaavisto + KaisaMaliniemi 76–88 W17-0608 10.18653/v1/W17-0608 @@ -1055,10 +1055,10 @@ Entropy Reduction correlates with temporal lobe activity - Matthew Nelson - Stanislas Dehaene - Christophe Pallier - John Hale + MatthewNelson + StanislasDehaene + ChristophePallier + JohnHale 1–10 W17-0701 10.18653/v1/W17-0701 @@ -1066,9 +1066,9 @@ Learning an Input Filter for Argument Structure Acquisition - Laurel Perkins - Naomi Feldman - Jeffrey Lidz + LaurelPerkins + NaomiFeldman + JeffreyLidz 11–19 W17-0702 10.18653/v1/W17-0702 @@ -1076,8 +1076,8 @@ Grounding sound change in ideal observer models of perception - Zachary Burchill - T. Florian Jaeger + ZacharyBurchill + T. FlorianJaeger 20–28 W17-0703 10.18653/v1/W17-0703 @@ -1085,7 +1085,7 @@ “Oh, <fixed-case>I</fixed-case>’ve Heard That Before”: Modelling Own-Dialect Bias After Perceptual Learning by Weighting Training Data - Rachael Tatman + RachaelTatman 29–34 W17-0704 10.18653/v1/W17-0704 @@ -1093,7 +1093,7 @@ Inherent Biases of Recurrent Neural Networks for Phonological Assimilation and Dissimilation - Amanda Doucette + AmandaDoucette 35–40 W17-0705 10.18653/v1/W17-0705 @@ -1101,7 +1101,7 @@ Predicting <fixed-case>J</fixed-case>apanese scrambling in the wild - Naho Orita + NahoOrita 41–45 W17-0706 10.18653/v1/W17-0706 @@ -1125,8 +1125,8 @@ Readers vs. Writers vs. 
Texts: Coping with Different Perspectives of Text Understanding in Emotion Annotation - Sven Buechel - Udo Hahn + SvenBuechel + UdoHahn 1–12 W17-0801 10.18653/v1/W17-0801 @@ -1134,11 +1134,11 @@ Finding Good Conversations Online: The Yahoo News Annotated Comments Corpus - Courtney Napoles - Joel Tetreault - Aasish Pappu - Enrica Rosato - Brian Provenzale + CourtneyNapoles + JoelTetreault + AasishPappu + EnricaRosato + BrianProvenzale 13–23 W17-0802 10.18653/v1/W17-0802 @@ -1146,8 +1146,8 @@ Crowdsourcing discourse interpretations: On the influence of context and the reliability of a connective insertion task - Merel Scholman - Vera Demberg + MerelScholman + VeraDemberg 24–33 W17-0803 10.18653/v1/W17-0803 @@ -1155,7 +1155,7 @@ A Code-Switching Corpus of <fixed-case>T</fixed-case>urkish-<fixed-case>G</fixed-case>erman Conversations - Özlem Çetinoğlu + ÖzlemÇetinoğlu 34–40 W17-0804 10.18653/v1/W17-0804 @@ -1163,9 +1163,9 @@ Annotating omission in statement pairs - Héctor Martínez Alonso - Amaury Delamaire - Benoît Sagot + HéctorMartínez Alonso + AmauryDelamaire + BenoîtSagot 41–45 W17-0805 10.18653/v1/W17-0805 @@ -1173,11 +1173,11 @@ Annotating Speech, Attitude and Perception Reports - Corien Bary - Leopold Hess - Kees Thijs - Peter Berck - Iris Hendrickx + CorienBary + LeopoldHess + KeesThijs + PeterBerck + IrisHendrickx 46–56 W17-0806 10.18653/v1/W17-0806 @@ -1185,12 +1185,12 @@ Consistent Classification of Translation Revisions: A Case Study of <fixed-case>E</fixed-case>nglish-<fixed-case>J</fixed-case>apanese Student Translations - Atsushi Fujita - Kikuko Tanabe - Chiho Toyoshima - Mayuka Yamamoto - Kyo Kageura - Anthony Hartley + AtsushiFujita + KikukoTanabe + ChihoToyoshima + MayukaYamamoto + KyoKageura + AnthonyHartley 57–66 W17-0807 10.18653/v1/W17-0807 @@ -1198,13 +1198,13 @@ Representation and Interchange of Linguistic Annotation. 
An In-Depth, Side-by-Side Comparison of Three Designs - Richard Eckart de Castilho - Nancy Ide - Emanuele Lapponi - Stephan Oepen - Keith Suderman - Erik Velldal - Marc Verhagen + RichardEckart de Castilho + NancyIde + EmanueleLapponi + StephanOepen + KeithSuderman + ErikVelldal + MarcVerhagen 67–75 W17-0808 10.18653/v1/W17-0808 @@ -1212,8 +1212,8 @@ <fixed-case>TDB</fixed-case> 1.1: Extensions on <fixed-case>T</fixed-case>urkish Discourse Bank - Deniz Zeyrek - Murathan Kurfalı + DenizZeyrek + MurathanKurfalı 76–81 W17-0809 10.18653/v1/W17-0809 @@ -1221,12 +1221,12 @@ Two Layers of Annotation for Representing Event Mentions in News Stories - Maria Pia di Buono - Martin Tutek - Jan Šnajder - Goran Glavaš - Bojana Dalbelo Bašić - Nataša Milić-Frayling + Maria Piadi Buono + MartinTutek + JanŠnajder + GoranGlavaš + BojanaDalbelo Bašić + NatašaMilić-Frayling 82–90 W17-0810 10.18653/v1/W17-0810 @@ -1234,11 +1234,11 @@ Word Similarity Datasets for <fixed-case>I</fixed-case>ndian Languages: Annotation and Baseline Systems - Syed Sarfaraz Akhtar - Arihant Gupta - Avijit Vajpayee - Arjit Srivastava - Manish Shrivastava + Syed SarfarazAkhtar + ArihantGupta + AvijitVajpayee + ArjitSrivastava + ManishShrivastava 91–94 W17-0811 10.18653/v1/W17-0811 @@ -1246,9 +1246,9 @@ The <fixed-case>BEC</fixed-case>au<fixed-case>SE</fixed-case> Corpus 2.0: Annotating Causality and Overlapping Relations - Jesse Dunietz - Lori Levin - Jaime Carbonell + JesseDunietz + LoriLevin + JaimeCarbonell 95–104 W17-0812 10.18653/v1/W17-0812 @@ -1256,8 +1256,8 @@ Catching the Common Cause: Extraction and Annotation of Causal Relations and their Participants - Ines Rehbein - Josef Ruppenhofer + InesRehbein + JosefRuppenhofer 105–114 W17-0813 10.18653/v1/W17-0813 @@ -1265,11 +1265,11 @@ Assessing <fixed-case>SRL</fixed-case> Frameworks with Automatic Training Data Expansion - Silvana Hartmann - Éva Mújdricza-Maydt - Ilia Kuznetsov - Iryna Gurevych - Anette Frank + SilvanaHartmann + ÉvaMújdricza-Maydt + IliaKuznetsov + IrynaGurevych + AnetteFrank 115–121 W17-0814 10.18653/v1/W17-0814 @@ -1295,10 +1295,10 @@ Inducing Script Structure from Crowdsourced Event Descriptions via Semi-Supervised Clustering - Lilian Wanzare - Alessandra Zarcone - Stefan Thater - Manfred Pinkal + LilianWanzare + AlessandraZarcone + StefanThater + ManfredPinkal 1–11 W17-0901 10.18653/v1/W17-0901 @@ -1306,16 +1306,16 @@ A Consolidated Open Knowledge Representation for Multiple Texts - Rachel Wities - Vered Shwartz - Gabriel Stanovsky - Meni Adler - Ori Shapira - Shyam Upadhyay - Dan Roth - Eugenio Martinez Camara - Iryna Gurevych - Ido Dagan + RachelWities + VeredShwartz + GabrielStanovsky + MeniAdler + OriShapira + ShyamUpadhyay + DanRoth + EugenioMartinez Camara + IrynaGurevych + IdoDagan 12–24 W17-0902 10.18653/v1/W17-0902 @@ -1323,8 +1323,8 @@ Event-Related Features in Feedforward Neural Networks Contribute to Identifying Causal Relations in Discourse - Edoardo Maria Ponti - Anna Korhonen + Edoardo MariaPonti + AnnaKorhonen 25–30 W17-0903 10.18653/v1/W17-0903 @@ -1332,9 +1332,9 @@ Stance Detection in <fixed-case>F</fixed-case>acebook Posts of a <fixed-case>G</fixed-case>erman Right-wing Party - Manfred Klenner - Don Tuggener - Simon Clematide + ManfredKlenner + DonTuggener + SimonClematide 31–40 W17-0904 10.18653/v1/W17-0904 @@ -1342,7 +1342,7 @@ Behind the Scenes of an Evolving Event Cloze Test - Nathanael Chambers + NathanaelChambers 41–45 W17-0905 10.18653/v1/W17-0905 @@ -1350,11 +1350,11 @@ <fixed-case>LSDS</fixed-case>em 2017 Shared Task: The Story 
Cloze Test - Nasrin Mostafazadeh - Michael Roth - Annie Louis - Nathanael Chambers - James Allen + NasrinMostafazadeh + MichaelRoth + AnnieLouis + NathanaelChambers + JamesAllen 46–51 W17-0906 10.18653/v1/W17-0906 @@ -1362,12 +1362,12 @@ Story Cloze Task: <fixed-case>UW</fixed-case> <fixed-case>NLP</fixed-case> System - Roy Schwartz - Maarten Sap - Ioannis Konstas - Leila Zilles - Yejin Choi - Noah A. Smith + RoySchwartz + MaartenSap + IoannisKonstas + LeilaZilles + YejinChoi + Noah A.Smith 52–55 W17-0907 10.18653/v1/W17-0907 @@ -1375,15 +1375,15 @@ <fixed-case>LSDS</fixed-case>em 2017: Exploring Data Generation Methods for the Story Cloze Test - Michael Bugert - Yevgeniy Puzikov - Andreas Rücklé - Judith Eckle-Kohler - Teresa Martin - Eugenio Martínez-Cámara - Daniil Sorokin - Maxime Peyrard - Iryna Gurevych + MichaelBugert + YevgeniyPuzikov + AndreasRücklé + JudithEckle-Kohler + TeresaMartin + EugenioMartínez-Cámara + DaniilSorokin + MaximePeyrard + IrynaGurevych 56–61 W17-0908 10.18653/v1/W17-0908 @@ -1391,8 +1391,8 @@ Sentiment Analysis and Lexical Cohesion for the Story Cloze Task - Michael Flor - Swapna Somasundaran + MichaelFlor + SwapnaSomasundaran 62–67 W17-0909 10.18653/v1/W17-0909 @@ -1400,8 +1400,8 @@ Resource-Lean Modeling of Coherence in Commonsense Stories - Niko Schenk - Christian Chiarcos + NikoSchenk + ChristianChiarcos 68–73 W17-0910 10.18653/v1/W17-0910 @@ -1409,10 +1409,10 @@ An <fixed-case>RNN</fixed-case>-based Binary Classifier for the Story Cloze Test - Melissa Roemmele - Sosuke Kobayashi - Naoya Inoue - Andrew Gordon + MelissaRoemmele + SosukeKobayashi + NaoyaInoue + AndrewGordon 74–80 W17-0911 10.18653/v1/W17-0911 @@ -1420,8 +1420,8 @@ IIT (BHU): System Description for LSDSem’17 Shared Task - Pranav Goel - Anil Kumar Singh + PranavGoel + Anil KumarSingh 81–86 W17-0912 10.18653/v1/W17-0912 @@ -1429,8 +1429,8 @@ Story Cloze Ending Selection Baselines and Data Examination - Todor Mihaylov - Anette Frank + TodorMihaylov + AnetteFrank 87–92 W17-0913 10.18653/v1/W17-0913 @@ -1459,14 +1459,14 @@ <fixed-case>M</fixed-case>ulti<fixed-case>L</fixed-case>ing 2017 Overview - George Giannakopoulos - John Conroy - Jeff Kubina - Peter A. 
Rankel - Elena Lloret - Josef Steinberger - Marina Litvak - Benoit Favre + GeorgeGiannakopoulos + JohnConroy + JeffKubina + Peter A.Rankel + ElenaLloret + JosefSteinberger + MarinaLitvak + BenoitFavre 1–6 W17-1001 10.18653/v1/W17-1001 @@ -1474,10 +1474,10 @@ Decoupling Encoder and Decoder Networks for Abstractive Document Summarization - Ying Xu - Jey Han Lau - Timothy Baldwin - Trevor Cohn + YingXu + Jey HanLau + TimothyBaldwin + TrevorCohn 7–11 W17-1002 10.18653/v1/W17-1002 @@ -1485,9 +1485,9 @@ Centroid-based Text Summarization through Compositionality of Word Embeddings - Gaetano Rossiello - Pierpaolo Basile - Giovanni Semeraro + GaetanoRossiello + PierpaoloBasile + GiovanniSemeraro 12–21 W17-1003 10.18653/v1/W17-1003 @@ -1495,8 +1495,8 @@ Query-based summarization using <fixed-case>MDL</fixed-case> principle - Marina Litvak - Natalia Vanetik + MarinaLitvak + NataliaVanetik 22–31 W17-1004 10.18653/v1/W17-1004 @@ -1504,9 +1504,9 @@ Word Embedding and Topic Modeling Enhanced Multiple Features for Content Linking and Argument / Sentiment Labeling in Online Forums - Lei Li - Liyuan Mao - Moye Chen + LeiLi + LiyuanMao + MoyeChen 32–36 W17-1005 10.18653/v1/W17-1005 @@ -1514,10 +1514,10 @@ Ultra-Concise Multi-genre Summarisation of Web2.0: towards Intelligent Content Generation - Elena Lloret - Ester Boldrini - Patricio Martínez-Barco - Manuel Palomar + ElenaLloret + EsterBoldrini + PatricioMartínez-Barco + ManuelPalomar 37–46 W17-1006 10.18653/v1/W17-1006 @@ -1525,9 +1525,9 @@ Machine Learning Approach to Evaluate <fixed-case>M</fixed-case>ulti<fixed-case>L</fixed-case>ingual Summaries - Samira Ellouze - Maher Jaoua - Lamia Hadrich Belguith + SamiraEllouze + MaherJaoua + LamiaHadrich Belguith 47–54 W17-1007 10.18653/v1/W17-1007 @@ -1551,8 +1551,8 @@ A Survey on Hate Speech Detection using Natural Language Processing - Anna Schmidt - Michael Wiegand + AnnaSchmidt + MichaelWiegand 1–10 W17-1101 10.18653/v1/W17-1101 @@ -1560,11 +1560,11 @@ <fixed-case>F</fixed-case>acebook sentiment: Reactions and Emojis - Ye Tian - Thiago Galery - Giulio Dulcinati - Emilia Molimpakis - Chao Sun + YeTian + ThiagoGalery + GiulioDulcinati + EmiliaMolimpakis + ChaoSun 11–16 W17-1102 10.18653/v1/W17-1102 @@ -1572,10 +1572,10 @@ Potential and Limitations of Cross-Domain Sentiment Classification - Jan Milan Deriu - Martin Weilenmann - Dirk Von Gruenigen - Mark Cieliebak + Jan MilanDeriu + MartinWeilenmann + DirkVon Gruenigen + MarkCieliebak 17–24 W17-1103 10.18653/v1/W17-1103 @@ -1583,10 +1583,10 @@ Aligning Entity Names with Online Aliases on Twitter - Kevin McKelvey - Peter Goutzounis - Stephen da Cruz - Nathanael Chambers + KevinMcKelvey + PeterGoutzounis + Stephenda Cruz + NathanaelChambers 25–35 W17-1104 10.18653/v1/W17-1104 @@ -1594,9 +1594,9 @@ Character-based Neural Embeddings for Tweet Clustering - Svitlana Vakulenko - Lyndon Nixon - Mihai Lupu + SvitlanaVakulenko + LyndonNixon + MihaiLupu 36–44 W17-1105 10.18653/v1/W17-1105 @@ -1605,10 +1605,10 @@ A Twitter Corpus and Benchmark Resources for <fixed-case>G</fixed-case>erman Sentiment Analysis - Mark Cieliebak - Jan Milan Deriu - Dominic Egger - Fatih Uzdilli + MarkCieliebak + Jan MilanDeriu + DominicEgger + FatihUzdilli 45–51 W17-1106 10.18653/v1/W17-1106 @@ -1636,14 +1636,14 @@ Findings of the <fixed-case>V</fixed-case>ar<fixed-case>D</fixed-case>ial Evaluation Campaign 2017 - Marcos Zampieri - Shervin Malmasi - Nikola Ljubešić - Preslav Nakov - Ahmed Ali - Jörg Tiedemann - Yves Scherrer - Noëmi Aepli + MarcosZampieri + ShervinMalmasi + NikolaLjubešić + 
PreslavNakov + AhmedAli + JörgTiedemann + YvesScherrer + NoëmiAepli 1–15 W17-1201 10.18653/v1/W17-1201 @@ -1651,8 +1651,8 @@ Dialectometric analysis of language variation in Twitter - Gonzalo Donoso - David Sánchez + GonzaloDonoso + DavidSánchez 16–25 W17-1202 10.18653/v1/W17-1202 @@ -1660,9 +1660,9 @@ Computational analysis of <fixed-case>G</fixed-case>ondi dialects - Taraka Rama - Çağrı Çöltekin - Pavel Sofroniev + TarakaRama + ÇağrıÇöltekin + PavelSofroniev 26–35 W17-1203 10.18653/v1/W17-1203 @@ -1670,8 +1670,8 @@ Investigating Diatopic Variation in a Historical Corpus - Stefanie Dipper - Sandra Waldenberger + StefanieDipper + SandraWaldenberger 36–45 W17-1204 10.18653/v1/W17-1204 @@ -1679,7 +1679,7 @@ Author Profiling at <fixed-case>PAN</fixed-case>: from Age and Gender Identification to Language Variety Identification (invited talk) - Paolo Rosso + PaoloRosso 46 W17-1205 10.18653/v1/W17-1205 @@ -1687,7 +1687,7 @@ The similarity and Mutual Intelligibility between <fixed-case>A</fixed-case>mharic and <fixed-case>T</fixed-case>igrigna Varieties - Tekabe Legesse Feleke + Tekabe LegesseFeleke 47–54 W17-1206 10.18653/v1/W17-1206 @@ -1695,7 +1695,7 @@ Why <fixed-case>C</fixed-case>atalan-<fixed-case>S</fixed-case>panish Neural Machine Translation? Analysis, comparison and combination with standard Rule and Phrase-based technologies - Marta R. Costa-jussà + Marta R.Costa-jussà 55–62 W17-1207 10.18653/v1/W17-1207 @@ -1703,7 +1703,7 @@ <fixed-case>K</fixed-case>urdish Interdialect Machine Translation - Hossein Hassani + HosseinHassani 63–72 W17-1208 10.18653/v1/W17-1208 @@ -1711,8 +1711,8 @@ Twitter Language Identification Of Similar Languages And Dialects Without Ground Truth - Jennifer Williams - Charlie Dagli + JenniferWilliams + CharlieDagli 73–83 W17-1209 10.18653/v1/W17-1209 @@ -1720,8 +1720,8 @@ Multi-source morphosyntactic tagging for spoken <fixed-case>R</fixed-case>usyn - Yves Scherrer - Achim Rabus + YvesScherrer + AchimRabus 84–92 W17-1210 10.18653/v1/W17-1210 @@ -1729,9 +1729,9 @@ Identifying dialects with textual and acoustic cues - Abualsoud Hanani - Aziz Qaroush - Stephen Taylor + AbualsoudHanani + AzizQaroush + StephenTaylor 93–101 W17-1211 10.18653/v1/W17-1211 @@ -1739,9 +1739,9 @@ Evaluating <fixed-case>H</fixed-case>e<fixed-case>LI</fixed-case> with Non-Linear Mappings - Tommi Jauhiainen - Krister Lindén - Heidi Jauhiainen + TommiJauhiainen + KristerLindén + HeidiJauhiainen 102–108 W17-1212 10.18653/v1/W17-1212 @@ -1749,9 +1749,9 @@ A Perplexity-Based Method for Similar Languages Discrimination - Pablo Gamallo - Jose Ramom Pichel - Iñaki Alegria + PabloGamallo + Jose RamomPichel + IñakiAlegria 109–114 W17-1213 10.18653/v1/W17-1213 @@ -1759,7 +1759,7 @@ Improving the Character Ngram Model for the <fixed-case>DSL</fixed-case> Task with <fixed-case>BM</fixed-case>25 Weighting and Less Frequently Used Feature Sets - Yves Bestgen + YvesBestgen 115–123 W17-1214 10.18653/v1/W17-1214 @@ -1767,8 +1767,8 @@ Discriminating between Similar Languages with Word-level Convolutional Neural Networks - Marcelo Criscuolo - Sandra Maria Aluísio + MarceloCriscuolo + Sandra MariaAluísio 124–130 W17-1215 10.18653/v1/W17-1215 @@ -1776,7 +1776,7 @@ Cross-lingual dependency parsing for closely related languages - <fixed-case>H</fixed-case>elsinki’s submission to <fixed-case>V</fixed-case>ar<fixed-case>D</fixed-case>ial 2017 - Jörg Tiedemann + JörgTiedemann 131–136 W17-1216 10.18653/v1/W17-1216 @@ -1784,11 +1784,11 @@ Discriminating between Similar Languages Using a Combination of Typed and Untyped 
Character N-grams and Words - Helena Gomez - Ilia Markov - Jorge Baptista - Grigori Sidorov - David Pinto + HelenaGomez + IliaMarkov + JorgeBaptista + GrigoriSidorov + DavidPinto 137–145 W17-1217 10.18653/v1/W17-1217 @@ -1796,8 +1796,8 @@ Tübingen system in <fixed-case>V</fixed-case>ar<fixed-case>D</fixed-case>ial 2017 shared task: experiments with language identification and cross-lingual parsing - Çağrı Çöltekin - Taraka Rama + ÇağrıÇöltekin + TarakaRama 146–155 W17-1218 10.18653/v1/W17-1218 @@ -1805,9 +1805,9 @@ When Sparse Traditional Models Outperform Dense Neural Networks: the Curious Case of Discriminating between Similar Languages - Maria Medvedeva - Martin Kroon - Barbara Plank + MariaMedvedeva + MartinKroon + BarbaraPlank 156–163 W17-1219 10.18653/v1/W17-1219 @@ -1815,8 +1815,8 @@ <fixed-case>G</fixed-case>erman Dialect Identification in Interview Transcriptions - Shervin Malmasi - Marcos Zampieri + ShervinMalmasi + MarcosZampieri 164–169 W17-1220 10.18653/v1/W17-1220 @@ -1824,8 +1824,8 @@ <fixed-case>CLUZH</fixed-case> at <fixed-case>V</fixed-case>ar<fixed-case>D</fixed-case>ial <fixed-case>GDI</fixed-case> 2017: Testing a Variety of Machine Learning Tools for the Classification of Swiss <fixed-case>G</fixed-case>erman Dialects - Simon Clematide - Peter Makarov + SimonClematide + PeterMakarov 170–177 W17-1221 10.18653/v1/W17-1221 @@ -1833,8 +1833,8 @@ <fixed-case>A</fixed-case>rabic Dialect Identification Using i<fixed-case>V</fixed-case>ectors and <fixed-case>ASR</fixed-case> Transcripts - Shervin Malmasi - Marcos Zampieri + ShervinMalmasi + MarcosZampieri 178–183 W17-1222 10.18653/v1/W17-1222 @@ -1842,7 +1842,7 @@ Discriminating between Similar Languages using Weighted Subword Features - Adrien Barbaresi + AdrienBarbaresi 184–189 W17-1223 10.18653/v1/W17-1223 @@ -1850,8 +1850,8 @@ Exploring Lexical and Syntactic Features for Language Variety Identification - Chris van der Lee - Antal van den Bosch + Chrisvan der Lee + Antalvan den Bosch 190–199 W17-1224 10.18653/v1/W17-1224 @@ -1859,8 +1859,8 @@ Learning to Identify <fixed-case>A</fixed-case>rabic and <fixed-case>G</fixed-case>erman Dialects using Multiple Kernels - Radu Tudor Ionescu - Andrei Butnaru + Radu TudorIonescu + AndreiButnaru 200–209 W17-1225 10.18653/v1/W17-1225 @@ -1868,10 +1868,10 @@ <fixed-case>S</fixed-case>lavic Forest, <fixed-case>N</fixed-case>orwegian Wood - Rudolf Rosa - Daniel Zeman - David Mareček - Zdeněk Žabokrtský + RudolfRosa + DanielZeman + DavidMareček + ZdeněkŽabokrtský 210–219 W17-1226 10.18653/v1/W17-1226 @@ -1901,8 +1901,8 @@ Identification of Languages in <fixed-case>A</fixed-case>lgerian <fixed-case>A</fixed-case>rabic Multilingual Documents - Wafia Adouane - Simon Dobnik + WafiaAdouane + SimonDobnik 1–8 W17-1301 10.18653/v1/W17-1301 @@ -1910,9 +1910,9 @@ <fixed-case>A</fixed-case>rabic Diacritization: Stats, Rules, and Hacks - Kareem Darwish - Hamdy Mubarak - Ahmed Abdelali + KareemDarwish + HamdyMubarak + AhmedAbdelali 9–17 W17-1302 10.18653/v1/W17-1302 @@ -1920,8 +1920,8 @@ Semantic Similarity of <fixed-case>A</fixed-case>rabic Sentences with Word Embeddings - El Moatez Billah Nagoudi - Didier Schwab + El Moatez BillahNagoudi + DidierSchwab 18–24 W17-1303 10.18653/v1/W17-1303 @@ -1929,8 +1929,8 @@ Morphological Analysis for the <fixed-case>M</fixed-case>altese Language: The challenges of a hybrid system - Claudia Borg - Albert Gatt + ClaudiaBorg + AlbertGatt 25–34 W17-1304 10.18653/v1/W17-1304 @@ -1938,9 +1938,9 @@ A Morphological Analyzer for Gulf <fixed-case>A</fixed-case>rabic Verbs - 
Salam Khalifa - Sara Hassan - Nizar Habash + SalamKhalifa + SaraHassan + NizarHabash 35–45 W17-1305 10.18653/v1/W17-1305 @@ -1948,13 +1948,13 @@ A Neural Architecture for Dialectal <fixed-case>A</fixed-case>rabic Segmentation - Younes Samih - Mohammed Attia - Mohamed Eldesouki - Ahmed Abdelali - Hamdy Mubarak - Laura Kallmeyer - Kareem Darwish + YounesSamih + MohammedAttia + MohamedEldesouki + AhmedAbdelali + HamdyMubarak + LauraKallmeyer + KareemDarwish 46–54 W17-1306 10.18653/v1/W17-1306 @@ -1962,10 +1962,10 @@ Sentiment Analysis of <fixed-case>T</fixed-case>unisian Dialects: Linguistic Ressources and Experiments - Salima Medhaffar - Fethi Bougares - Yannick Estève - Lamia Hadrich-Belguith + SalimaMedhaffar + FethiBougares + YannickEstève + LamiaHadrich-Belguith 55–61 W17-1307 10.18653/v1/W17-1307 @@ -1973,12 +1973,12 @@ <fixed-case>CAT</fixed-case>: Credibility Analysis of <fixed-case>A</fixed-case>rabic Content on Twitter - Rim El Ballouli - Wassim El-Hajj - Ahmad Ghandour - Shady Elbassuoni - Hazem Hajj - Khaled Shaban + RimEl Ballouli + WassimEl-Hajj + AhmadGhandour + ShadyElbassuoni + HazemHajj + KhaledShaban 62–71 W17-1308 10.18653/v1/W17-1308 @@ -1986,8 +1986,8 @@ A New Error Annotation for Dyslexic texts in <fixed-case>A</fixed-case>rabic - Maha Alamri - William J Teahan + MahaAlamri + William JTeahan 72–78 W17-1309 10.18653/v1/W17-1309 @@ -1995,12 +1995,12 @@ An Unsupervised Speaker Clustering Technique based on <fixed-case>SOM</fixed-case> and <fixed-case>I</fixed-case>-vectors for Speech Recognition Systems - Hany Ahmed - Mohamed Elaraby - Abdullah M. Mousa - Mostafa Elhosiny - Sherif Abdou - Mohsen Rashwan + HanyAhmed + MohamedElaraby + AbdullahM. Mousa + MostafaElhosiny + SherifAbdou + MohsenRashwan 79–83 W17-1310 10.18653/v1/W17-1310 @@ -2008,8 +2008,8 @@ <fixed-case>SHAKKIL</fixed-case>: An Automatic Diacritization System for Modern Standard <fixed-case>A</fixed-case>rabic Texts - Amany Fashwan - Sameh Alansary + AmanyFashwan + SamehAlansary 84–93 W17-1311 10.18653/v1/W17-1311 @@ -2017,9 +2017,9 @@ <fixed-case>A</fixed-case>rabic Tweets Treebanking and Parsing: A Bootstrapping Approach - Fahad Albogamy - Allan Ramsay - Hanady Ahmed + FahadAlbogamy + AllanRamsay + HanadyAhmed 94–99 W17-1312 10.18653/v1/W17-1312 @@ -2027,10 +2027,10 @@ Identifying Effective Translations for Cross-lingual <fixed-case>A</fixed-case>rabic-to-<fixed-case>E</fixed-case>nglish User-generated Speech Search - Ahmad Khwileh - Haithem Afli - Gareth Jones - Andy Way + AhmadKhwileh + HaithemAfli + GarethJones + AndyWay 100–109 W17-1313 10.18653/v1/W17-1313 @@ -2038,15 +2038,15 @@ A Characterization Study of <fixed-case>A</fixed-case>rabic Twitter Data with a Benchmarking for State-of-the-Art Opinion Mining Models - Ramy Baly - Gilbert Badaro - Georges El-Khoury - Rawan Moukalled - Rita Aoun - Hazem Hajj - Wassim El-Hajj - Nizar Habash - Khaled Shaban + RamyBaly + GilbertBadaro + GeorgesEl-Khoury + RawanMoukalled + RitaAoun + HazemHajj + WassimEl-Hajj + NizarHabash + KhaledShaban 110–118 W17-1314 10.18653/v1/W17-1314 @@ -2054,9 +2054,9 @@ Robust Dictionary Lookup in Multiple Noisy Orthographies - Lingliang Zhang - Nizar Habash - Godfried Toussaint + LingliangZhang + NizarHabash + GodfriedToussaint 119–129 W17-1315 10.18653/v1/W17-1315 @@ -2064,10 +2064,10 @@ <fixed-case>A</fixed-case>rabic <fixed-case>POS</fixed-case> Tagging: Don’t Abandon Feature Engineering Just Yet - Kareem Darwish - Hamdy Mubarak - Ahmed Abdelali - Mohamed Eldesouki + KareemDarwish + HamdyMubarak + AhmedAbdelali + MohamedEldesouki 
130–137 W17-1316 10.18653/v1/W17-1316 @@ -2075,10 +2075,10 @@ Toward a Web-based Speech Corpus for <fixed-case>A</fixed-case>lgerian Dialectal <fixed-case>A</fixed-case>rabic Varieties - Soumia Bougrine - Aicha Chorana - Abdallah Lakhdari - Hadda Cherroun + SoumiaBougrine + AichaChorana + AbdallahLakhdari + HaddaCherroun 138–146 W17-1317 10.18653/v1/W17-1317 @@ -2086,7 +2086,7 @@ Not All Segments are Created Equal: Syntactically Motivated Sentiment Analysis in Lexical Space - Muhammad Abdul-Mageed + MuhammadAbdul-Mageed 147–156 W17-1318 10.18653/v1/W17-1318 @@ -2094,12 +2094,12 @@ An enhanced automatic speech recognition system for <fixed-case>A</fixed-case>rabic - Mohamed Amine Menacer - Odile Mella - Dominique Fohr - Denis Jouvet - David Langlois - Kamel Smaili + Mohamed AmineMenacer + OdileMella + DominiqueFohr + DenisJouvet + DavidLanglois + KamelSmaili 157–165 W17-1319 10.18653/v1/W17-1319 @@ -2107,9 +2107,9 @@ Universal Dependencies for <fixed-case>A</fixed-case>rabic - Dima Taji - Nizar Habash - Daniel Zeman + DimaTaji + NizarHabash + DanielZeman 166–176 W17-1320 10.18653/v1/W17-1320 @@ -2117,9 +2117,9 @@ A Layered Language Model based Hybrid Approach to Automatic Full Diacritization of <fixed-case>A</fixed-case>rabic - Mohamed Al-Badrashiny - Abdelati Hawwari - Mona Diab + MohamedAl-Badrashiny + AbdelatiHawwari + MonaDiab 177–184 W17-1321 10.18653/v1/W17-1321 @@ -2127,8 +2127,8 @@ <fixed-case>A</fixed-case>rabic Textual Entailment with Word Embeddings - Nada Almarwani - Mona Diab + NadaAlmarwani + MonaDiab 185–190 W17-1322 10.18653/v1/W17-1322 @@ -2156,7 +2156,7 @@ Toward Pan-<fixed-case>S</fixed-case>lavic <fixed-case>NLP</fixed-case>: Some Experiments with Language Adaptation - Serge Sharoff + SergeSharoff 1–2 W17-1401 10.18653/v1/W17-1401 @@ -2164,9 +2164,9 @@ Clustering of <fixed-case>R</fixed-case>ussian Adjective-Noun Constructions using Word Embeddings - Andrey Kutuzov - Elizaveta Kuzmenko - Lidia Pivovarova + AndreyKutuzov + ElizavetaKuzmenko + LidiaPivovarova 3–13 W17-1402 10.18653/v1/W17-1402 @@ -2174,8 +2174,8 @@ A Preliminary Study of <fixed-case>C</fixed-case>roatian Lexical Substitution - Domagoj Alagić - Jan Šnajder + DomagojAlagić + JanŠnajder 14–19 W17-1403 10.18653/v1/W17-1403 @@ -2183,8 +2183,8 @@ Projecting Multiword Expression Resources on a Polish Treebank - Agata Savary - Jakub Waszczuk + AgataSavary + JakubWaszczuk 20–26 W17-1404 10.18653/v1/W17-1404 @@ -2192,8 +2192,8 @@ Lexicon Induction for Spoken <fixed-case>R</fixed-case>usyn – Challenges and Results - Achim Rabus - Yves Scherrer + AchimRabus + YvesScherrer 27–32 W17-1405 10.18653/v1/W17-1405 @@ -2201,9 +2201,9 @@ The Universal Dependencies Treebank for <fixed-case>S</fixed-case>lovenian - Kaja Dobrovoljc - Tomaž Erjavec - Simon Krek + KajaDobrovoljc + TomažErjavec + SimonKrek 33–38 W17-1406 10.18653/v1/W17-1406 @@ -2211,10 +2211,10 @@ Universal Dependencies for <fixed-case>S</fixed-case>erbian in Comparison with <fixed-case>C</fixed-case>roatian and Other <fixed-case>S</fixed-case>lavic Languages - Tanja Samardžić - Mirjana Starović - Željko Agić - Nikola Ljubešić + TanjaSamardžić + MirjanaStarović + ŽeljkoAgić + NikolaLjubešić 39–44 W17-1407 10.18653/v1/W17-1407 @@ -2222,7 +2222,7 @@ Spelling Correction for Morphologically Rich Language: a Case Study of <fixed-case>R</fixed-case>ussian - Alexey Sorokin + AlexeySorokin 45–53 W17-1408 10.18653/v1/W17-1408 @@ -2230,10 +2230,10 @@ Debunking Sentiment Lexicons: A Case of Domain-Specific Sentiment Classification for <fixed-case>C</fixed-case>roatian - Paula 
Gombar - Zoran Medić - Domagoj Alagić - Jan Šnajder + PaulaGombar + ZoranMedić + DomagojAlagić + JanŠnajder 54–59 W17-1409 10.18653/v1/W17-1409 @@ -2241,9 +2241,9 @@ Adapting a State-of-the-Art Tagger for South <fixed-case>S</fixed-case>lavic Languages to Non-Standard Text - Nikola Ljubešić - Tomaž Erjavec - Darja Fišer + NikolaLjubešić + TomažErjavec + DarjaFišer 60–68 W17-1410 10.18653/v1/W17-1410 @@ -2251,8 +2251,8 @@ Comparison of Short-Text Sentiment Analysis Methods for <fixed-case>C</fixed-case>roatian - Leon Rotim - Jan Šnajder + LeonRotim + JanŠnajder 69–75 W17-1411 10.18653/v1/W17-1411 @@ -2260,11 +2260,11 @@ The First Cross-Lingual Challenge on Recognition, Normalization, and Matching of Named Entities in <fixed-case>S</fixed-case>lavic Languages - Jakub Piskorski - Lidia Pivovarova - Jan Šnajder - Josef Steinberger - Roman Yangarber + JakubPiskorski + LidiaPivovarova + JanŠnajder + JosefSteinberger + RomanYangarber 76–85 W17-1412 10.18653/v1/W17-1412 @@ -2272,9 +2272,9 @@ <fixed-case>L</fixed-case>iner2 — a Generic Framework for Named Entity Recognition - Michał Marcińczuk - Jan Kocoń - Marcin Oleksy + MichałMarcińczuk + JanKocoń + MarcinOleksy 86–91 W17-1413 10.18653/v1/W17-1413 @@ -2282,9 +2282,9 @@ Language-Independent Named Entity Analysis Using Parallel Projection and Rule-Based Disambiguation - James Mayfield - Paul McNamee - Cash Costello + JamesMayfield + PaulMcNamee + CashCostello 92–96 W17-1414 10.18653/v1/W17-1414 @@ -2292,7 +2292,7 @@ Comparison of String Similarity Measures for Obscenity Filtering - Ekaterina Chernyak + EkaterinaChernyak 97–101 W17-1415 10.18653/v1/W17-1415 @@ -2300,8 +2300,8 @@ Stylometric Analysis of Parliamentary Speeches: Gender Dimension - Justina Mandravickaitė - Tomas Krilavičius + JustinaMandravickaitė + TomasKrilavičius 102–107 W17-1416 10.18653/v1/W17-1416 @@ -2309,10 +2309,10 @@ Towards Never Ending Language Learning for Morphologically Rich Languages - Kseniya Buraya - Lidia Pivovarova - Sergey Budkov - Andrey Filchenkov + KseniyaBuraya + LidiaPivovarova + SergeyBudkov + AndreyFilchenkov 108–118 W17-1417 10.18653/v1/W17-1417 @@ -2320,9 +2320,9 @@ Gender Profiling for <fixed-case>S</fixed-case>lovene Twitter communication: the Influence of Gender Marking, Content and Style - Ben Verhoeven - Iza Škrjanec - Senja Pollak + BenVerhoeven + IzaŠkrjanec + SenjaPollak 119–125 W17-1418 10.18653/v1/W17-1418 @@ -2346,8 +2346,8 @@ Use Generalized Representations, But Do Not Forget Surface Features - Nafise Sadat Moosavi - Michael Strube + Nafise SadatMoosavi + MichaelStrube 1–7 W17-1501 10.18653/v1/W17-1501 @@ -2355,10 +2355,10 @@ Enriching Basque Coreference Resolution System using Semantic Knowledge sources - Ander Soraluze - Olatz Arregi - Xabier Arregi - Arantza Díaz de Ilarraza + AnderSoraluze + OlatzArregi + XabierArregi + ArantzaDíaz de Ilarraza 8–16 W17-1502 10.18653/v1/W17-1502 @@ -2366,8 +2366,8 @@ Improving Polish Mention Detection with Valency Dictionary - Maciej Ogrodniczuk - Bartłomiej Nitoń + MaciejOgrodniczuk + BartłomiejNitoń 17–23 W17-1503 10.18653/v1/W17-1503 @@ -2375,8 +2375,8 @@ A <fixed-case>G</fixed-case>oogle-Proof Collection of <fixed-case>F</fixed-case>rench <fixed-case>W</fixed-case>inograd Schemas - Pascal Amsili - Olga Seminck + PascalAmsili + OlgaSeminck 24–29 W17-1504 10.18653/v1/W17-1504 @@ -2384,8 +2384,8 @@ Using Coreference Links to Improve <fixed-case>S</fixed-case>panish-to-<fixed-case>E</fixed-case>nglish Machine Translation - Lesly Miculicich Werlen - Andrei Popescu-Belis + LeslyMiculicich Werlen + 
AndreiPopescu-Belis 30–40 W17-1505 10.18653/v1/W17-1505 @@ -2393,8 +2393,8 @@ Multi-source annotation projection of coreference chains: assessing strategies and testing opportunities - Yulia Grishina - Manfred Stede + YuliaGrishina + ManfredStede 41–50 W17-1506 10.18653/v1/W17-1506 @@ -2402,7 +2402,7 @@ <fixed-case>CORBON</fixed-case> 2017 Shared Task: Projection-Based Coreference Resolution - Yulia Grishina + YuliaGrishina 51–55 W17-1507 10.18653/v1/W17-1507 @@ -2410,9 +2410,9 @@ Projection-based Coreference Resolution Using Deep Syntax - Michal Novák - Anna Nedoluzhko - Zdeněk Žabokrtský + MichalNovák + AnnaNedoluzhko + ZdeněkŽabokrtský 56–64 W17-1508 10.18653/v1/W17-1508 @@ -2440,7 +2440,7 @@ Gender as a Variable in Natural-Language Processing: Ethical Considerations - Brian Larson + BrianLarson 1–11 W17-1601 10.18653/v1/W17-1601 @@ -2448,8 +2448,8 @@ These are not the Stereotypes You are Looking For: Bias and Fairness in Authorial Gender Attribution - Corina Koolen - Andreas van Cranenburgh + CorinaKoolen + Andreasvan Cranenburgh 12–22 W17-1602 10.18653/v1/W17-1602 @@ -2457,7 +2457,7 @@ A Quantitative Study of Data in the <fixed-case>NLP</fixed-case> community - Margot Mieskes + MargotMieskes 23–29 W17-1603 10.18653/v1/W17-1603 @@ -2465,8 +2465,8 @@ Ethical by Design: Ethics Best Practices for Natural Language Processing - Jochen L. Leidner - Vassilis Plachouras + Jochen L.Leidner + VassilisPlachouras 30–40 W17-1604 10.18653/v1/W17-1604 @@ -2474,11 +2474,11 @@ Building Better Open-Source Tools to Support Fairness in Automated Scoring - Nitin Madnani - Anastassia Loukina - Alina von Davier - Jill Burstein - Aoife Cahill + NitinMadnani + AnastassiaLoukina + Alinavon Davier + JillBurstein + AoifeCahill 41–52 W17-1605 10.18653/v1/W17-1605 @@ -2486,7 +2486,7 @@ Gender and Dialect Bias in <fixed-case>Y</fixed-case>ou<fixed-case>T</fixed-case>ube’s Automatic Captions - Rachael Tatman + RachaelTatman 53–59 W17-1606 10.18653/v1/W17-1606 @@ -2494,9 +2494,9 @@ Integrating the Management of Personal Data Protection and Open Science with Research Ethics - Dave Lewis - Joss Moorkens - Kaniz Fatema + DaveLewis + JossMoorkens + KanizFatema 60–65 W17-1607 10.18653/v1/W17-1607 @@ -2504,12 +2504,12 @@ Ethical Considerations in <fixed-case>NLP</fixed-case> Shared Tasks - Carla Parra Escartín - Wessel Reijers - Teresa Lynn - Joss Moorkens - Andy Way - Chao-Hong Liu + CarlaParra Escartín + WesselReijers + TeresaLynn + JossMoorkens + AndyWay + Chao-HongLiu 66–73 W17-1608 10.18653/v1/W17-1608 @@ -2517,9 +2517,9 @@ Social Bias in Elicited Natural Language Inferences - Rachel Rudinger - Chandler May - Benjamin Van Durme + RachelRudinger + ChandlerMay + BenjaminVan Durme 74–79 W17-1609 10.18653/v1/W17-1609 @@ -2527,9 +2527,9 @@ A Short Review of Ethical Challenges in Clinical Natural Language Processing - Simon Šuster - Stéphan Tulkens - Walter Daelemans + SimonŠuster + StéphanTulkens + WalterDaelemans 80–87 W17-1610 10.18653/v1/W17-1610 @@ -2537,7 +2537,7 @@ Goal-Oriented Design for Ethical Machine Learning and <fixed-case>NLP</fixed-case> - Tyler Schnoebelen + TylerSchnoebelen 88–93 W17-1611 10.18653/v1/W17-1611 @@ -2545,9 +2545,9 @@ Ethical Research Protocols for Social Media Health Research - Adrian Benton - Glen Coppersmith - Mark Dredze + AdrianBenton + GlenCoppersmith + MarkDredze 94–102 W17-1612 10.18653/v1/W17-1612 @@ -2555,10 +2555,10 @@ Say the Right Thing Right: Ethics Issues in Natural Language Generation Systems - Charese Smiley - Frank Schilder - Vassilis Plachouras - Jochen L. 
Leidner + ChareseSmiley + FrankSchilder + VassilisPlachouras + Jochen L.Leidner 103–108 W17-1613 10.18653/v1/W17-1613 @@ -2584,8 +2584,8 @@ <fixed-case>P</fixed-case>ara<fixed-case>D</fixed-case>i: Dictionary of Paraphrases of <fixed-case>C</fixed-case>zech Complex Predicates with Light Verbs - Petra Barančíková - Václava Kettnerová + PetraBarančíková + VáclavaKettnerová 1–10 W17-1701 10.18653/v1/W17-1701 @@ -2593,10 +2593,10 @@ Multi-word Entity Classification in a Highly Multilingual Environment - Sophie Chesney - Guillaume Jacquet - Ralf Steinberger - Jakub Piskorski + SophieChesney + GuillaumeJacquet + RalfSteinberger + JakubPiskorski 11–20 W17-1702 10.18653/v1/W17-1702 @@ -2604,9 +2604,9 @@ Using bilingual word-embeddings for multilingual collocation extraction - Marcos Garcia - Marcos García-Salido - Margarita Alonso-Ramos + MarcosGarcia + MarcosGarcía-Salido + MargaritaAlonso-Ramos 21–30 W17-1703 10.18653/v1/W17-1703 @@ -2614,17 +2614,17 @@ The <fixed-case>PARSEME</fixed-case> Shared Task on Automatic Identification of Verbal Multiword Expressions - Agata Savary - Carlos Ramisch - Silvio Cordeiro - Federico Sangati - Veronika Vincze - Behrang QasemiZadeh - Marie Candito - Fabienne Cap - Voula Giouli - Ivelina Stoyanova - Antoine Doucet + AgataSavary + CarlosRamisch + SilvioCordeiro + FedericoSangati + VeronikaVincze + BehrangQasemiZadeh + MarieCandito + FabienneCap + VoulaGiouli + IvelinaStoyanova + AntoineDoucet 31–47 W17-1704 10.18653/v1/W17-1704 @@ -2632,9 +2632,9 @@ <fixed-case>US</fixed-case>zeged: Identifying Verbal Multiword Expressions with <fixed-case>POS</fixed-case> Tagging and Parsing Techniques - Katalin Ilona Simkó - Viktória Kovács - Veronika Vincze + Katalin IlonaSimkó + ViktóriaKovács + VeronikaVincze 48–53 W17-1705 10.18653/v1/W17-1705 @@ -2642,9 +2642,9 @@ Parsing and <fixed-case>MWE</fixed-case> Detection: Fips at the <fixed-case>PARSEME</fixed-case> Shared Task - Luka Nerima - Vasiliki Foufi - Éric Wehrli + LukaNerima + VasilikiFoufi + ÉricWehrli 54–59 W17-1706 10.18653/v1/W17-1706 @@ -2652,9 +2652,9 @@ Neural Networks for Multi-Word Expression Detection - Natalia Klyueva - Antoine Doucet - Milan Straka + NataliaKlyueva + AntoineDoucet + MilanStraka 60–65 W17-1707 10.18653/v1/W17-1707 @@ -2662,8 +2662,8 @@ Factoring Ambiguity out of the Prediction of Compositionality for <fixed-case>G</fixed-case>erman Multi-Word Expressions - Stefan Bott - Sabine Schulte im Walde + StefanBott + SabineSchulte im Walde 66–72 W17-1708 10.18653/v1/W17-1708 @@ -2671,7 +2671,7 @@ Multiword expressions and lexicalism: the view from <fixed-case>LFG</fixed-case> - Jamie Y. Findlay + Jamie Y.Findlay 73–79 W17-1709 10.18653/v1/W17-1709 @@ -2679,9 +2679,9 @@ Understanding Idiomatic Variation - Kristina Geeraert - R. Harald Baayen - John Newman + KristinaGeeraert + R. 
HaraldBaayen + JohnNewman 80–90 W17-1710 10.18653/v1/W17-1710 @@ -2689,9 +2689,9 @@ Discovering Light Verb Constructions and their Translations from Parallel Corpora without Word Alignment - Natalie Vargas - Carlos Ramisch - Helena Caseli + NatalieVargas + CarlosRamisch + HelenaCaseli 91–96 W17-1711 10.18653/v1/W17-1711 @@ -2699,8 +2699,8 @@ Identification of Multiword Expressions for <fixed-case>L</fixed-case>atvian and <fixed-case>L</fixed-case>ithuanian: Hybrid Approach - Justina Mandravickaitė - Tomas Krilavičius + JustinaMandravickaitė + TomasKrilavičius 97–101 W17-1712 10.18653/v1/W17-1712 @@ -2708,7 +2708,7 @@ Show Me Your Variance and <fixed-case>I</fixed-case> Tell You Who You Are - Deriving Compound Compositionality from Word Alignments - Fabienne Cap + FabienneCap 102–107 W17-1713 10.18653/v1/W17-1713 @@ -2716,8 +2716,8 @@ Semantic annotation to characterize contextual variation in terminological noun compounds: a pilot study - Melania Cabezas-García - Antonio San Martín + MelaniaCabezas-García + AntonioSan Martín 108–113 W17-1714 10.18653/v1/W17-1714 @@ -2725,13 +2725,13 @@ Detection of Verbal Multi-Word Expressions via Conditional Random Fields with Syntactic Dependency Features and Semantic Re-Ranking - Alfredo Maldonado - Lifeng Han - Erwan Moreau - Ashjan Alsulaimani - Koel Dutta Chowdhury - Carl Vogel - Qun Liu + AlfredoMaldonado + LifengHan + ErwanMoreau + AshjanAlsulaimani + Koel DuttaChowdhury + CarlVogel + QunLiu 114–120 W17-1715 10.18653/v1/W17-1715 @@ -2739,10 +2739,10 @@ A data-driven approach to verbal multiword expression detection. <fixed-case>PARSEME</fixed-case> Shared Task system description paper - Tiberiu Boros - Sonia Pipa - Verginica Barbu Mititelu - Dan Tufis + TiberiuBoros + SoniaPipa + VerginicaBarbu Mititelu + DanTufis 121–126 W17-1716 10.18653/v1/W17-1716 @@ -2750,9 +2750,9 @@ The <fixed-case>ATILF</fixed-case>-<fixed-case>LLF</fixed-case> System for Parseme Shared Task: a Transition-based Verbal Multiword Expression Tagger - Hazem Al Saied - Matthieu Constant - Marie Candito + HazemAl Saied + MatthieuConstant + MarieCandito 127–132 W17-1717 10.18653/v1/W17-1717 @@ -2760,10 +2760,10 @@ Investigating the Opacity of Verb-Noun Multiword Expression Usages in Context - Shiva Taslimipoor - Omid Rohanian - Ruslan Mitkov - Afsaneh Fazly + ShivaTaslimipoor + OmidRohanian + RuslanMitkov + AfsanehFazly 133–138 W17-1718 10.18653/v1/W17-1718 @@ -2771,9 +2771,9 @@ Compositionality in Verb-Particle Constructions - Archna Bhatia - Choh Man Teng - James Allen + ArchnaBhatia + Choh ManTeng + JamesAllen 139–148 W17-1719 10.18653/v1/W17-1719 @@ -2781,11 +2781,11 @@ Rule-Based Translation of <fixed-case>S</fixed-case>panish Verb-Noun Combinations into Basque - Uxoa Iñurrieta - Itziar Aduriz - Arantza Díaz de Ilarraza - Gorka Labaka - Kepa Sarasola + UxoaIñurrieta + ItziarAduriz + ArantzaDíaz de Ilarraza + GorkaLabaka + KepaSarasola 149–154 W17-1720 10.18653/v1/W17-1720 @@ -2793,7 +2793,7 @@ Verb-Particle Constructions in Questions - Veronika Vincze + VeronikaVincze 155–160 W17-1721 10.18653/v1/W17-1721 @@ -2801,7 +2801,7 @@ Simple Compound Splitting for <fixed-case>G</fixed-case>erman - Marion Weller-Di Marco + MarionWeller-Di Marco 161–166 W17-1722 10.18653/v1/W17-1722 @@ -2809,8 +2809,8 @@ Identification of Ambiguous Multiword Expressions Using Sequence Models and Lexical Resources - Manon Scholivet - Carlos Ramisch + ManonScholivet + CarlosRamisch 167–175 W17-1723 10.18653/v1/W17-1723 @@ -2818,8 +2818,8 @@ Comparing Recurring Lexico-Syntactic Trees 
(<fixed-case>RLT</fixed-case>s) and Ngram Techniques for Extended Phraseology Extraction - Agnès Tutin - Olivier Kraif + AgnèsTutin + OlivierKraif 176–180 W17-1724 10.18653/v1/W17-1724 @@ -2827,8 +2827,8 @@ Benchmarking Joint Lexical and Syntactic Analysis on Multiword-Rich Data - Matthieu Constant - Héctor Martinez Alonso + MatthieuConstant + HéctorMartinez Alonso 181–186 W17-1725 10.18653/v1/W17-1725 @@ -2836,9 +2836,9 @@ Semi-Automated Resolution of Inconsistency for a Harmonized Multiword Expression and Dependency Parse Annotation - King Chan - Julian Brooke - Timothy Baldwin + KingChan + JulianBrooke + TimothyBaldwin 187–193 W17-1726 10.18653/v1/W17-1726 @@ -2846,8 +2846,8 @@ Combining Linguistic Features for the Detection of <fixed-case>C</fixed-case>roatian Multiword Expressions - Maja Buljan - Jan Šnajder + MajaBuljan + JanŠnajder 194–199 W17-1727 10.18653/v1/W17-1727 @@ -2855,8 +2855,8 @@ Complex Verbs are Different: Exploring the Visual Modality in Multi-Modal Models to Predict Compositionality - Maximilian Köper - Sabine Schulte im Walde + MaximilianKöper + SabineSchulte im Walde 200–206 W17-1728 10.18653/v1/W17-1728 @@ -2881,11 +2881,11 @@ Understanding the Semantics of Narratives of Interpersonal Violence through Reader Annotations and Physiological Reactions - Alexander Calderwood - Elizabeth A. Pruett - Raymond Ptucha - Christopher Homan - Cecilia Ovesdotter Alm + AlexanderCalderwood + Elizabeth A.Pruett + RaymondPtucha + ChristopherHoman + CeciliaOvesdotter Alm 1–9 W17-1801 10.18653/v1/W17-1801 @@ -2893,8 +2893,8 @@ Intension, Attitude, and Tense Annotation in a High-Fidelity Semantic Representation - Gene Kim - Lenhart Schubert + GeneKim + LenhartSchubert 10–15 W17-1802 10.18653/v1/W17-1802 @@ -2902,8 +2902,8 @@ Towards a lexicon of event-selecting predicates for a <fixed-case>F</fixed-case>rench <fixed-case>F</fixed-case>act<fixed-case>B</fixed-case>ank - Ingrid Falk - Fabienne Martin + IngridFalk + FabienneMartin 16–21 W17-1803 10.18653/v1/W17-1803 @@ -2911,10 +2911,10 @@ Universal Dependencies to Logical Form with Negation Scope - Federico Fancellu - Siva Reddy - Adam Lopez - Bonnie Webber + FedericoFancellu + SivaReddy + AdamLopez + BonnieWebber 22–32 W17-1804 10.18653/v1/W17-1804 @@ -2925,7 +2925,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Meaning Banking beyond Events and Roles - Johan Bos + JohanBos 33 W17-1805 10.18653/v1/W17-1805 @@ -2933,9 +2933,9 @@ is able to handle phenomena related to scope by means of an higher-order type th The Scope and Focus of Negation: A Complete Annotation Framework for <fixed-case>I</fixed-case>talian - Begoña Altuna - Anne-Lyse Minard - Manuela Speranza + BegoñaAltuna + Anne-LyseMinard + ManuelaSperanza 34–42 W17-1806 10.18653/v1/W17-1806 @@ -2943,9 +2943,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Annotation of negation in the <fixed-case>IULA</fixed-case> <fixed-case>S</fixed-case>panish Clinical Record Corpus - Montserrat Marimon - Jorge Vivaldi - Núria Bel + MontserratMarimon + JorgeVivaldi + NúriaBel 43–52 W17-1807 10.18653/v1/W17-1807 @@ -2953,11 +2953,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Annotating Negation in <fixed-case>S</fixed-case>panish Clinical Texts - Noa Cruz - Roser Morante - Manuel J. Maña López - Jacinto Mata Vázquez - Carlos L. 
Parra Calderón + NoaCruz + RoserMorante + Manuel J.Maña López + JacintoMata Vázquez + Carlos L.Parra Calderón 53–58 W17-1808 10.18653/v1/W17-1808 @@ -2965,9 +2965,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Neural Networks for Negation Cue Detection in <fixed-case>C</fixed-case>hinese - Hangfeng He - Federico Fancellu - Bonnie Webber + HangfengHe + FedericoFancellu + BonnieWebber 59–63 W17-1809 10.18653/v1/W17-1809 @@ -2975,9 +2975,9 @@ is able to handle phenomena related to scope by means of an higher-order type th An open-source tool for negation detection: a maximum-margin approach - Martine Enger - Erik Velldal - Lilja Øvrelid + MartineEnger + ErikVelldal + LiljaØvrelid 64–69 W17-1810 10.18653/v1/W17-1810 @@ -3001,8 +3001,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Compositional Semantics using Feature-Based Models from <fixed-case>W</fixed-case>ord<fixed-case>N</fixed-case>et - Pablo Gamallo - Martín Pereira-Fariña + PabloGamallo + MartínPereira-Fariña 1–11 W17-1901 10.18653/v1/W17-1901 @@ -3010,10 +3010,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Automated <fixed-case>W</fixed-case>ord<fixed-case>N</fixed-case>et Construction Using Word Embeddings - Mikhail Khodak - Andrej Risteski - Christiane Fellbaum - Sanjeev Arora + MikhailKhodak + AndrejRisteski + ChristianeFellbaum + SanjeevArora 12–23 W17-1902 10.18653/v1/W17-1902 @@ -3021,8 +3021,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Improving Verb Metaphor Detection by Propagating Abstractness to Words, Phrases and Individual Senses - Maximilian Köper - Sabine Schulte im Walde + MaximilianKöper + SabineSchulte im Walde 24–30 W17-1903 10.18653/v1/W17-1903 @@ -3030,9 +3030,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Improving Clinical Diagnosis Inference through Integration of Structured and Unstructured Knowledge - Yuan Ling - Yuan An - Sadid Hasan + YuanLing + YuanAn + SadidHasan 31–36 W17-1904 10.18653/v1/W17-1904 @@ -3040,9 +3040,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Classifying Lexical-semantic Relationships by Exploiting Sense/Concept Representations - Kentaro Kanada - Tetsunori Kobayashi - Yoshihiko Hayashi + KentaroKanada + TetsunoriKobayashi + YoshihikoHayashi 37–46 W17-1905 10.18653/v1/W17-1905 @@ -3050,8 +3050,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Supervised and unsupervised approaches to measuring usage similarity - Milton King - Paul Cook + MiltonKing + PaulCook 47–52 W17-1906 10.18653/v1/W17-1906 @@ -3059,9 +3059,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Lexical Disambiguation of <fixed-case>I</fixed-case>gbo using Diacritic Restoration - Ignatius Ezeani - Mark Hepple - Ikechukwu Onyenwe + IgnatiusEzeani + MarkHepple + IkechukwuOnyenwe 53–60 W17-1907 10.18653/v1/W17-1907 @@ -3069,10 +3069,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Creating and Validating Multilingual Semantic Representations for Six Languages: Expert versus Non-Expert Crowds - Mahmoud El-Haj - Paul Rayson - Scott Piao - Stephen Wattam + MahmoudEl-Haj + PaulRayson + ScottPiao + StephenWattam 61–71 W17-1908 10.18653/v1/W17-1908 @@ -3080,10 +3080,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Using Linked Disambiguated 
Distributional Networks for Word Sense Disambiguation - Alexander Panchenko - Stefano Faralli - Simone Paolo Ponzetto - Chris Biemann + AlexanderPanchenko + StefanoFaralli + Simone PaoloPonzetto + ChrisBiemann 72–78 W17-1909 10.18653/v1/W17-1909 @@ -3091,11 +3091,11 @@ is able to handle phenomena related to scope by means of an higher-order type th One Representation per Word - Does it make Sense for Composition? - Thomas Kober - Julie Weeds - John Wilkie - Jeremy Reffin - David Weir + ThomasKober + JulieWeeds + JohnWilkie + JeremyReffin + DavidWeir 79–90 W17-1910 10.18653/v1/W17-1910 @@ -3103,8 +3103,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Elucidating Conceptual Properties from Word Embeddings - Kyoung-Rok Jang - Sung-Hyon Myaeng + Kyoung-RokJang + Sung-HyonMyaeng 91–95 W17-1911 10.18653/v1/W17-1911 @@ -3112,9 +3112,9 @@ is able to handle phenomena related to scope by means of an higher-order type th TTCS<tex-math>^{\mathcal{E}}</tex-math>: a Vectorial Resource for Computing Conceptual Similarity - Enrico Mensa - Daniele P. Radicioni - Antonio Lieto + EnricoMensa + Daniele P.Radicioni + AntonioLieto 96–101 W17-1912 10.18653/v1/W17-1912 @@ -3122,8 +3122,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Measuring the <fixed-case>I</fixed-case>talian-<fixed-case>E</fixed-case>nglish lexical gap for action verbs and its impact on translation - Lorenzo Gregori - Alessandro Panunzi + LorenzoGregori + AlessandroPanunzi 102–109 W17-1913 10.18653/v1/W17-1913 @@ -3131,9 +3131,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Word Sense Filtering Improves Embedding-Based Lexical Substitution - Anne Cocos - Marianna Apidianaki - Chris Callison-Burch + AnneCocos + MariannaApidianaki + ChrisCallison-Burch 110–119 W17-1914 10.18653/v1/W17-1914 @@ -3141,8 +3141,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Supervised and Unsupervised Word Sense Disambiguation on Word Embedding Vectors of Unambigous Synonyms - Aleksander Wawer - Agnieszka Mykowiecka + AleksanderWawer + AgnieszkaMykowiecka 120–125 W17-1915 10.18653/v1/W17-1915 @@ -3168,10 +3168,10 @@ is able to handle phenomena related to scope by means of an higher-order type th The <fixed-case>BURCHAK</fixed-case> corpus: a Challenge Data Set for Interactive Learning of Visually Grounded Word Meanings - Yanchao Yu - Arash Eshghi - Gregory Mills - Oliver Lemon + YanchaoYu + ArashEshghi + GregoryMills + OliverLemon 1–10 W17-2001 10.18653/v1/W17-2001 @@ -3179,8 +3179,8 @@ is able to handle phenomena related to scope by means of an higher-order type th The Use of Object Labels and Spatial Prepositions as Keywords in a Web-Retrieval-Based Image Caption Generation System - Brandon Birmingham - Adrian Muscat + BrandonBirmingham + AdrianMuscat 11–20 W17-2002 10.18653/v1/W17-2002 @@ -3188,9 +3188,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Learning to Recognize Animals by Watching Documentaries: Using Subtitles as Weak Supervision - Aparna Nurani Venkitasubramanian - Tinne Tuytelaars - Marie-Francine Moens + AparnaNurani Venkitasubramanian + TinneTuytelaars + Marie-FrancineMoens 21–30 W17-2003 10.18653/v1/W17-2003 @@ -3198,11 +3198,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Human Evaluation of Multi-modal Neural Machine Translation: A Case-Study on E-Commerce Listing Titles - Iacer Calixto - Daniel Stein - Evgeny Matusov - 
Sheila Castilho - Andy Way + IacerCalixto + DanielStein + EvgenyMatusov + SheilaCastilho + AndyWay 31–37 W17-2004 10.18653/v1/W17-2004 @@ -3210,10 +3210,10 @@ is able to handle phenomena related to scope by means of an higher-order type th The <fixed-case>B</fixed-case>reaking<fixed-case>N</fixed-case>ews Dataset - Arnau Ramisa - Fei Yan - Francesc Moreno-Noguer - Krystian Mikolajczyk + ArnauRamisa + FeiYan + FrancescMoreno-Noguer + KrystianMikolajczyk 38–39 W17-2005 10.18653/v1/W17-2005 @@ -3221,9 +3221,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Automatic identification of head movements in video-recorded conversations: can words help? - Patrizia Paggio - Costanza Navarretta - Bart Jongejan + PatriziaPaggio + CostanzaNavarretta + BartJongejan 40–42 W17-2006 10.18653/v1/W17-2006 @@ -3231,10 +3231,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Multi-Modal Fashion Product Retrieval - Antonio Rubio Romano - LongLong Yu - Edgar Simo-Serra - Francesc Moreno-Noguer + AntonioRubio Romano + LongLongYu + EdgarSimo-Serra + FrancescMoreno-Noguer 43–45 W17-2007 10.18653/v1/W17-2007 @@ -3262,10 +3262,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Metaphor Detection in a Poetry Corpus - Vaibhav Kesarwani - Diana Inkpen - Stan Szpakowicz - Chris Tanasescu + VaibhavKesarwani + DianaInkpen + StanSzpakowicz + ChrisTanasescu 1–9 W17-2201 10.18653/v1/W17-2201 @@ -3273,10 +3273,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Machine Translation and Automated Analysis of the <fixed-case>S</fixed-case>umerian Language - Émilie Pagé-Perron - Maria Sukhareva - Ilya Khait - Christian Chiarcos + ÉmiliePagé-Perron + MariaSukhareva + IlyaKhait + ChristianChiarcos 10–16 W17-2202 10.18653/v1/W17-2202 @@ -3284,9 +3284,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Investigating the Relationship between Literary Genres and Emotional Plot Development - Evgeny Kim - Sebastian Padó - Roman Klinger + EvgenyKim + SebastianPadó + RomanKlinger 17–26 W17-2203 10.18653/v1/W17-2203 @@ -3294,10 +3294,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Enjambment Detection in a Large Diachronic Corpus of <fixed-case>S</fixed-case>panish Sonnets - Pablo Ruiz - Clara Martínez Cantón - Thierry Poibeau - Elena González-Blanco + PabloRuiz + ClaraMartínez Cantón + ThierryPoibeau + ElenaGonzález-Blanco 27–32 W17-2204 10.18653/v1/W17-2204 @@ -3305,8 +3305,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Plotting <fixed-case>M</fixed-case>arkson’s “Mistress” - Conor Kelleher - Mark Keane + ConorKelleher + MarkKeane 33–39 W17-2205 10.18653/v1/W17-2205 @@ -3314,11 +3314,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Annotation Challenges for Reconstructing the Structural Elaboration of Middle Low <fixed-case>G</fixed-case>erman - Nina Seemann - Marie-Luis Merten - Michaela Geierhos - Doris Tophinke - Eyke Hüllermeier + NinaSeemann + Marie-LuisMerten + MichaelaGeierhos + DorisTophinke + EykeHüllermeier 40–45 W17-2206 10.18653/v1/W17-2206 @@ -3326,7 +3326,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Phonological Soundscapes in Medieval Poetry - Christopher Hench + ChristopherHench 46–56 W17-2207 10.18653/v1/W17-2207 @@ -3334,10 +3334,10 @@ is able to handle phenomena related to scope by means of an 
higher-order type th An End-to-end Environment for Research Question-Driven Entity Extraction and Network Analysis - Andre Blessing - Nora Echelmeyer - Markus John - Nils Reiter + AndreBlessing + NoraEchelmeyer + MarkusJohn + NilsReiter 57–67 W17-2208 10.18653/v1/W17-2208 @@ -3347,8 +3347,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Modeling intra-textual variation with entropy and surprisal: topical vs. stylistic patterns - Stefania Degaetano-Ortlieb - Elke Teich + StefaniaDegaetano-Ortlieb + ElkeTeich 68–77 W17-2209 10.18653/v1/W17-2209 @@ -3356,8 +3356,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Finding a Character’s Voice: Stylome Classification on Literary Characters - Liviu P. Dinu - Ana Sabina Uban + Liviu P.Dinu + Ana SabinaUban 78–82 W17-2210 10.18653/v1/W17-2210 @@ -3365,7 +3365,7 @@ is able to handle phenomena related to scope by means of an higher-order type th An Ontology-Based Method for Extracting and Classifying Domain-Specific Compositional Nominal Compounds - Maria Pia di Buono + Maria Piadi Buono 83–88 W17-2211 10.18653/v1/W17-2211 @@ -3373,8 +3373,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Speeding up corpus development for linguistic research: language documentation and acquisition in <fixed-case>R</fixed-case>omansh Tuatschin - Géraldine Walther - Benoît Sagot + GéraldineWalther + BenoîtSagot 89–94 W17-2212 10.18653/v1/W17-2212 @@ -3382,12 +3382,12 @@ is able to handle phenomena related to scope by means of an higher-order type th Distantly Supervised <fixed-case>POS</fixed-case> Tagging of Low-Resource Languages under Extreme Data Sparsity: The Case of <fixed-case>H</fixed-case>ittite - Maria Sukhareva - Francesco Fuscagni - Johannes Daxenberger - Susanne Görke - Doris Prechel - Iryna Gurevych + MariaSukhareva + FrancescoFuscagni + JohannesDaxenberger + SusanneGörke + DorisPrechel + IrynaGurevych 95–104 W17-2213 10.18653/v1/W17-2213 @@ -3395,9 +3395,9 @@ is able to handle phenomena related to scope by means of an higher-order type th A Dataset for <fixed-case>S</fixed-case>anskrit Word Segmentation - Amrith Krishna - Pavan Kumar Satuluri - Pawan Goyal + AmrithKrishna + Pavan KumarSatuluri + PawanGoyal 105–114 W17-2214 10.18653/v1/W17-2214 @@ -3405,8 +3405,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Lexical Correction of Polish Twitter Political Data - Maciej Ogrodniczuk - Mateusz Kopeć + MaciejOgrodniczuk + MateuszKopeć 115–125 W17-2215 10.18653/v1/W17-2215 @@ -3432,11 +3432,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Target word prediction and paraphasia classification in spoken discourse - Joel Adams - Steven Bedrick - Gerasimos Fergadiotis - Kyle Gorman - Jan van Santen + JoelAdams + StevenBedrick + GerasimosFergadiotis + KyleGorman + Janvan Santen 1–8 W17-2301 10.18653/v1/W17-2301 @@ -3444,9 +3444,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Extracting Drug-Drug Interactions with Attention <fixed-case>CNN</fixed-case>s - Masaki Asada - Makoto Miwa - Yutaka Sasaki + MasakiAsada + MakotoMiwa + YutakaSasaki 9–18 W17-2302 10.18653/v1/W17-2302 @@ -3454,9 +3454,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Insights into Analogy Completion from the Biomedical Domain - Denis Newman-Griffis - Albert Lai - Eric Fosler-Lussier + DenisNewman-Griffis + AlbertLai + EricFosler-Lussier 19–28 
W17-2303 10.18653/v1/W17-2303 @@ -3464,8 +3464,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Deep learning for extracting protein-protein interactions from biomedical literature - Yifan Peng - Zhiyong Lu + YifanPeng + ZhiyongLu 29–38 W17-2304 10.18653/v1/W17-2304 @@ -3473,9 +3473,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Stacking With Auxiliary Features for Entity Linking in the Medical Domain - Nazneen Fatema Rajani - Mihaela Bornea - Ken Barker + Nazneen FatemaRajani + MihaelaBornea + KenBarker 39–47 W17-2305 10.18653/v1/W17-2305 @@ -3483,11 +3483,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Results of the fifth edition of the <fixed-case>B</fixed-case>io<fixed-case>ASQ</fixed-case> Challenge - Anastasios Nentidis - Konstantinos Bougiatiotis - Anastasia Krithara - Georgios Paliouras - Ioannis Kakadiaris + AnastasiosNentidis + KonstantinosBougiatiotis + AnastasiaKrithara + GeorgiosPaliouras + IoannisKakadiaris 48–57 W17-2306 10.18653/v1/W17-2306 @@ -3496,12 +3496,12 @@ is able to handle phenomena related to scope by means of an higher-order type th Tackling Biomedical Text Summarization: <fixed-case>OAQA</fixed-case> at <fixed-case>B</fixed-case>io<fixed-case>ASQ</fixed-case> 5<fixed-case>B</fixed-case> - Khyathi Chandu - Aakanksha Naik - Aditya Chandrasekar - Zi Yang - Niloy Gupta - Eric Nyberg + KhyathiChandu + AakankshaNaik + AdityaChandrasekar + ZiYang + NiloyGupta + EricNyberg 58–66 W17-2307 10.18653/v1/W17-2307 @@ -3509,7 +3509,7 @@ is able to handle phenomena related to scope by means of an higher-order type th <fixed-case>M</fixed-case>acquarie University at <fixed-case>B</fixed-case>io<fixed-case>ASQ</fixed-case> 5b – Query-based Summarisation Techniques for Selecting the Ideal Answers - Diego Mollá + DiegoMollá 67–75 W17-2308 10.18653/v1/W17-2308 @@ -3517,9 +3517,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Neural Question Answering at <fixed-case>B</fixed-case>io<fixed-case>ASQ</fixed-case> 5<fixed-case>B</fixed-case> - Georg Wiese - Dirk Weissenborn - Mariana Neves + GeorgWiese + DirkWeissenborn + MarianaNeves 76–79 W17-2309 10.18653/v1/W17-2309 @@ -3528,12 +3528,12 @@ is able to handle phenomena related to scope by means of an higher-order type th End-to-End System for Bacteria Habitat Extraction - Farrokh Mehryary - Kai Hakala - Suwisa Kaewphan - Jari Björne - Tapio Salakoski - Filip Ginter + FarrokhMehryary + KaiHakala + SuwisaKaewphan + JariBjörne + TapioSalakoski + FilipGinter 80–90 W17-2310 10.18653/v1/W17-2310 @@ -3542,11 +3542,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Creation and evaluation of a dictionary-based tagger for virus species and proteins - Helen Cook - Rūdolfs Bērziņš - Cristina Leal Rodrıguez - Juan Miguel Cejuela - Lars Juhl Jensen + HelenCook + RūdolfsBērziņš + Cristina LealRodrıguez + Juan MiguelCejuela + Lars JuhlJensen 91–98 W17-2311 10.18653/v1/W17-2311 @@ -3554,9 +3554,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Representation of complex terms in a vector space structured by an ontology for a normalization task - Arnaud Ferré - Pierre Zweigenbaum - Claire Nédellec + ArnaudFerré + PierreZweigenbaum + ClaireNédellec 99–106 W17-2312 10.18653/v1/W17-2312 @@ -3564,8 +3564,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Improving Correlation with Human Judgments by 
Integrating Semantic Similarity with Second–Order Vectors - Bridget McInnes - Ted Pedersen + BridgetMcInnes + TedPedersen 107–116 W17-2313 10.18653/v1/W17-2313 @@ -3573,9 +3573,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Proactive Learning for Named Entity Recognition - Maolin Li - Nhung Nguyen - Sophia Ananiadou + MaolinLi + NhungNguyen + SophiaAnaniadou 117–125 W17-2314 10.18653/v1/W17-2314 @@ -3583,10 +3583,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Biomedical Event Extraction using Abstract Meaning Representation - Sudha Rao - Daniel Marcu - Kevin Knight - Hal Daumé III + SudhaRao + DanielMarcu + KevinKnight + HalDaumé III 126–135 W17-2315 10.18653/v1/W17-2315 @@ -3594,11 +3594,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Detecting Personal Medication Intake in Twitter: An Annotated Corpus and Baseline Classification System - Ari Klein - Abeed Sarker - Masoud Rouhizadeh - Karen O’Connor - Graciela Gonzalez + AriKlein + AbeedSarker + MasoudRouhizadeh + KarenO’Connor + GracielaGonzalez 136–142 W17-2316 10.18653/v1/W17-2316 @@ -3606,9 +3606,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Unsupervised Context-Sensitive Spelling Correction of Clinical Free-Text with Word and Character N-Gram Embeddings - Pieter Fivez - Simon Šuster - Walter Daelemans + PieterFivez + SimonŠuster + WalterDaelemans 143–148 W17-2317 10.18653/v1/W17-2317 @@ -3616,11 +3616,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Characterization of Divergence in Impaired Speech of <fixed-case>ALS</fixed-case> Patients - Archna Bhatia - Bonnie Dorr - Kristy Hollingshead - Samuel L. 
Phillips - Barbara McKenzie + ArchnaBhatia + BonnieDorr + KristyHollingshead + Samuel L.Phillips + BarbaraMcKenzie 149–158 W17-2318 10.18653/v1/W17-2318 @@ -3628,11 +3628,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Deep Learning for Punctuation Restoration in Medical Reports - Wael Salloum - Greg Finley - Erik Edwards - Mark Miller - David Suendermann-Oeft + WaelSalloum + GregFinley + ErikEdwards + MarkMiller + DavidSuendermann-Oeft 159–164 W17-2319 10.18653/v1/W17-2319 @@ -3640,10 +3640,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Unsupervised Domain Adaptation for Clinical Negation Detection - Timothy Miller - Steven Bethard - Hadi Amiri - Guergana Savova + TimothyMiller + StevenBethard + HadiAmiri + GuerganaSavova 165–170 W17-2320 10.18653/v1/W17-2320 @@ -3651,13 +3651,13 @@ is able to handle phenomena related to scope by means of an higher-order type th <fixed-case>B</fixed-case>io<fixed-case>C</fixed-case>reative <fixed-case>VI</fixed-case> Precision Medicine Track: creating a training corpus for mining protein-protein interactions affected by mutations - Rezarta Islamaj Doğan - Andrew Chatr-aryamontri - Sun Kim - Chih-Hsuan Wei - Yifan Peng - Donald Comeau - Zhiyong Lu + RezartaIslamaj Doğan + AndrewChatr-aryamontri + SunKim + Chih-HsuanWei + YifanPeng + DonaldComeau + ZhiyongLu 171–175 W17-2321 10.18653/v1/W17-2321 @@ -3665,8 +3665,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Painless Relation Extraction with Kindred - Jake Lever - Steven Jones + JakeLever + StevenJones 176–183 W17-2322 10.18653/v1/W17-2322 @@ -3674,9 +3674,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Noise Reduction Methods for Distantly Supervised Biomedical Relation Extraction - Gang Li - Cathy Wu - K. Vijay-Shanker + GangLi + CathyWu + K.Vijay-Shanker 184–193 W17-2323 10.18653/v1/W17-2323 @@ -3684,11 +3684,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Role-Preserving Redaction of Medical Records to Enable Ontology-Driven Processing - Seth Polsley - Atif Tahir - Muppala Raju - Akintayo Akinleye - Duane Steward + SethPolsley + AtifTahir + MuppalaRaju + AkintayoAkinleye + DuaneSteward 194–199 W17-2324 10.18653/v1/W17-2324 @@ -3696,10 +3696,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Annotation of pain and anesthesia events for surgery-related processes and outcomes extraction - Wen-wai Yim - Dario Tedesco - Catherine Curtin - Tina Hernandez-Boussard + Wen-waiYim + DarioTedesco + CatherineCurtin + TinaHernandez-Boussard 200–205 W17-2325 10.18653/v1/W17-2325 @@ -3707,11 +3707,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Identifying Comparative Structures in Biomedical Text - Samir Gupta - A.S.M. Ashique Mahmood - Karen Ross - Cathy Wu - K. Vijay-Shanker + SamirGupta + A.S.M. 
AshiqueMahmood + KarenRoss + CathyWu + K.Vijay-Shanker 206–215 W17-2326 10.18653/v1/W17-2326 @@ -3719,13 +3719,13 @@ is able to handle phenomena related to scope by means of an higher-order type th Tagging Funding Agencies and Grants in Scientific Articles using Sequential Learning Models - Subhradeep Kayal - Zubair Afzal - George Tsatsaronis - Sophia Katrenko - Pascal Coupet - Marius Doornenbal - Michelle Gregory + SubhradeepKayal + ZubairAfzal + GeorgeTsatsaronis + SophiaKatrenko + PascalCoupet + MariusDoornenbal + MichelleGregory 216–221 W17-2327 10.18653/v1/W17-2327 @@ -3733,10 +3733,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Deep Learning for Biomedical Information Retrieval: Learning Textual Relevance from Click Logs - Sunil Mohan - Nicolas Fiorini - Sun Kim - Zhiyong Lu + SunilMohan + NicolasFiorini + SunKim + ZhiyongLu 222–231 W17-2328 10.18653/v1/W17-2328 @@ -3744,10 +3744,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Detecting Dementia through Retrospective Analysis of Routine Blog Posts by Bloggers with Dementia - Vaden Masrani - Gabriel Murray - Thalia Field - Giuseppe Carenini + VadenMasrani + GabrielMurray + ThaliaField + GiuseppeCarenini 232–237 W17-2329 10.18653/v1/W17-2329 @@ -3755,9 +3755,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Protein Word Detection using Text Segmentation Techniques - Devi Ganesan - Ashish V. Tendulkar - Sutanu Chakraborti + DeviGanesan + Ashish V.Tendulkar + SutanuChakraborti 238–246 W17-2330 10.18653/v1/W17-2330 @@ -3765,8 +3765,8 @@ is able to handle phenomena related to scope by means of an higher-order type th External Evaluation of Event Extraction Classifiers for Automatic Pathway Curation: An extended study of the m<fixed-case>TOR</fixed-case> pathway - Wojciech Kusa - Michael Spranger + WojciechKusa + MichaelSpranger 247–256 W17-2331 10.18653/v1/W17-2331 @@ -3774,8 +3774,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Toward Automated Early Sepsis Alerting: Identifying Infection Patients from Nursing Notes - Emilia Apostolova - Tom Velez + EmiliaApostolova + TomVelez 257–262 W17-2332 10.18653/v1/W17-2332 @@ -3783,10 +3783,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Enhancing Automatic <fixed-case>ICD</fixed-case>-9-<fixed-case>CM</fixed-case> Code Assignment for Medical Texts with <fixed-case>P</fixed-case>ub<fixed-case>M</fixed-case>ed - Danchen Zhang - Daqing He - Sanqiang Zhao - Lei Li + DanchenZhang + DaqingHe + SanqiangZhao + LeiLi 263–271 W17-2333 10.18653/v1/W17-2333 @@ -3794,9 +3794,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Evaluating Feature Extraction Methods for Knowledge-based Biomedical Word Sense Disambiguation - Sam Henry - Clint Cuffy - Bridget McInnes + SamHenry + ClintCuffy + BridgetMcInnes 272–281 W17-2334 10.18653/v1/W17-2334 @@ -3804,11 +3804,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Investigating the Documentation of Electronic Cigarette Use in the Veteran Affairs Electronic Health Record: A Pilot Study - Danielle Mowery - Brett South - Olga Patterson - Shu-Hong Zhu - Mike Conway + DanielleMowery + BrettSouth + OlgaPatterson + Shu-HongZhu + MikeConway 282–286 W17-2335 10.18653/v1/W17-2335 @@ -3816,11 +3816,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Automated Preamble Detection in 
Dictated Medical Reports - Wael Salloum - Greg Finley - Erik Edwards - Mark Miller - David Suendermann-Oeft + WaelSalloum + GregFinley + ErikEdwards + MarkMiller + DavidSuendermann-Oeft 287–295 W17-2336 10.18653/v1/W17-2336 @@ -3828,8 +3828,8 @@ is able to handle phenomena related to scope by means of an higher-order type th A Biomedical Question Answering System in <fixed-case>B</fixed-case>io<fixed-case>ASQ</fixed-case> 2017 - Mourad Sarrouti - Said Ouatik El Alaoui + MouradSarrouti + SaidOuatik El Alaoui 296–301 W17-2337 10.18653/v1/W17-2337 @@ -3837,11 +3837,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Adapting Pre-trained Word Embeddings For Use In Medical Coding - Kevin Patel - Divya Patel - Mansi Golakiya - Pushpak Bhattacharyya - Nilesh Birari + KevinPatel + DivyaPatel + MansiGolakiya + PushpakBhattacharyya + NileshBirari 302–306 W17-2338 10.18653/v1/W17-2338 @@ -3849,8 +3849,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Initializing neural networks for hierarchical multi-label text classification - Simon Baker - Anna Korhonen + SimonBaker + AnnaKorhonen 307–315 W17-2339 10.18653/v1/W17-2339 @@ -3858,9 +3858,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Biomedical Event Trigger Identification Using Bidirectional Recurrent Neural Network Based Models - Rahul V S S Patchigolla - Sunil Sahu - Ashish Anand + RahulV S S Patchigolla + SunilSahu + AshishAnand 316–321 W17-2340 10.18653/v1/W17-2340 @@ -3868,11 +3868,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Representations of Time Expressions for Temporal Relation Extraction with Convolutional Neural Networks - Chen Lin - Timothy Miller - Dmitriy Dligach - Steven Bethard - Guergana Savova + ChenLin + TimothyMiller + DmitriyDligach + StevenBethard + GuerganaSavova 322–327 W17-2341 10.18653/v1/W17-2341 @@ -3880,10 +3880,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Automatic Diagnosis Coding of Radiology Reports: A Comparison of Deep Learning and Conventional Classification Methods - Sarvnaz Karimi - Xiang Dai - Hamed Hassanzadeh - Anthony Nguyen + SarvnazKarimi + XiangDai + HamedHassanzadeh + AnthonyNguyen 328–332 W17-2342 10.18653/v1/W17-2342 @@ -3891,9 +3891,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Automatic classification of doctor-patient questions for a virtual patient record query task - Leonardo Campillos Llanos - Sophie Rosset - Pierre Zweigenbaum + LeonardoCampillos Llanos + SophieRosset + PierreZweigenbaum 333–341 W17-2343 10.18653/v1/W17-2343 @@ -3901,10 +3901,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Assessing the performance of <fixed-case>O</fixed-case>lelo, a real-time biomedical question answering application - Mariana Neves - Fabian Eckert - Hendrik Folkerts - Matthias Uflacker + MarianaNeves + FabianEckert + HendrikFolkerts + MatthiasUflacker 342–350 W17-2344 10.18653/v1/W17-2344 @@ -3913,8 +3913,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Clinical Event Detection with Hybrid Neural Architecture - Adyasha Maharana - Meliha Yetisgen + AdyashaMaharana + MelihaYetisgen 351–355 W17-2345 10.18653/v1/W17-2345 @@ -3922,9 +3922,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Extracting Personal Medical Events for User Timeline Construction using Minimal 
Supervision - Aakanksha Naik - Chris Bogart - Carolyn Rose + AakankshaNaik + ChrisBogart + CarolynRose 356–364 W17-2346 10.18653/v1/W17-2346 @@ -3932,13 +3932,13 @@ is able to handle phenomena related to scope by means of an higher-order type th Detecting mentions of pain and acute confusion in <fixed-case>F</fixed-case>innish clinical text - Hans Moen - Kai Hakala - Farrokh Mehryary - Laura-Maria Peltonen - Tapio Salakoski - Filip Ginter - Sanna Salanterä + HansMoen + KaiHakala + FarrokhMehryary + Laura-MariaPeltonen + TapioSalakoski + FilipGinter + SannaSalanterä 365–372 W17-2347 10.18653/v1/W17-2347 @@ -3946,11 +3946,11 @@ is able to handle phenomena related to scope by means of an higher-order type th A Multi-strategy Query Processing Approach for Biomedical Question Answering: <fixed-case>USTB</fixed-case>_<fixed-case>PRIR</fixed-case> at <fixed-case>B</fixed-case>io<fixed-case>ASQ</fixed-case> 2017 Task 5<fixed-case>B</fixed-case> - Zan-Xia Jin - Bo-Wen Zhang - Fan Fang - Le-Le Zhang - Xu-Cheng Yin + Zan-XiaJin + Bo-WenZhang + FanFang + Le-LeZhang + Xu-ChengYin 373–380 W17-2348 10.18653/v1/W17-2348 @@ -3976,11 +3976,11 @@ is able to handle phenomena related to scope by means of an higher-order type th On the “Calligraphy” of Books - Vanessa Queiroz Marinho - Henrique Ferraz de Arruda - Thales Sinelli - Luciano da Fontoura Costa - Diego Raphael Amancio + Vanessa QueirozMarinho + Henrique Ferrazde Arruda + ThalesSinelli + Luciano da FontouraCosta + Diego RaphaelAmancio 1–10 W17-2401 10.18653/v1/W17-2401 @@ -3988,11 +3988,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Adapting predominant and novel sense discovery algorithms for identifying corpus-specific sense differences - Binny Mathew - Suman Kalyan Maity - Pratip Sarkar - Animesh Mukherjee - Pawan Goyal + BinnyMathew + Suman KalyanMaity + PratipSarkar + AnimeshMukherjee + PawanGoyal 11–20 W17-2402 10.18653/v1/W17-2402 @@ -4000,9 +4000,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Merging knowledge bases in different languages - Jerónimo Hernández-González - Estevam R. Hruschka Jr. - Tom M. Mitchell + JerónimoHernández-González + Estevam R.Hruschka Jr. 
+ Tom M.Mitchell 21–29 W17-2403 10.18653/v1/W17-2403 @@ -4010,8 +4010,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Parameter Free Hierarchical Graph-Based Clustering for Analyzing Continuous Word Embeddings - Thomas Alexander Trost - Dietrich Klakow + Thomas AlexanderTrost + DietrichKlakow 30–38 W17-2404 10.18653/v1/W17-2404 @@ -4019,9 +4019,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Spectral Graph-Based Method of Multimodal Word Embedding - Kazuki Fukui - Takamasa Oshikiri - Hidetoshi Shimodaira + KazukiFukui + TakamasaOshikiri + HidetoshiShimodaira 39–44 W17-2405 10.18653/v1/W17-2405 @@ -4029,8 +4029,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Graph Methods for Multilingual <fixed-case>F</fixed-case>rame<fixed-case>N</fixed-case>ets - Collin Baker - Michael Ellsworth + CollinBaker + MichaelEllsworth 45–50 W17-2406 10.18653/v1/W17-2406 @@ -4039,8 +4039,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Extract with Order for Coherent Multi-Document Summarization - Mir Tafseer Nayeem - Yllias Chali + Mir TafseerNayeem + YlliasChali 51–56 W17-2407 10.18653/v1/W17-2407 @@ -4048,8 +4048,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Work Hard, Play Hard: Email Classification on the Avocado and <fixed-case>E</fixed-case>nron Corpora - Sakhar Alkhereyf - Owen Rambow + SakharAlkhereyf + OwenRambow 57–65 W17-2408 10.18653/v1/W17-2408 @@ -4057,13 +4057,13 @@ is able to handle phenomena related to scope by means of an higher-order type th A Graph Based Semi-Supervised Approach for Analysis of Derivational Nouns in <fixed-case>S</fixed-case>anskrit - Amrith Krishna - Pavankumar Satuluri - Harshavardhan Ponnada - Muneeb Ahmed - Gulab Arora - Kaustubh Hiware - Pawan Goyal + AmrithKrishna + PavankumarSatuluri + HarshavardhanPonnada + MuneebAhmed + GulabArora + KaustubhHiware + PawanGoyal 66–75 W17-2409 10.18653/v1/W17-2409 @@ -4071,8 +4071,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Evaluating text coherence based on semantic similarity graph - Jan Wira Gotama Putra - Takenobu Tokunaga + Jan Wira GotamaPutra + TakenobuTokunaga 76–85 W17-2410 10.18653/v1/W17-2410 @@ -4084,9 +4084,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Proceedings of the 10th Workshop on Building and Using Comparable Corpora W17-25 - Serge Sharoff - Pierre Zweigenbaum - Reinhard Rapp + SergeSharoff + PierreZweigenbaum + ReinhardRapp 10.18653/v1/W17-25 Association for Computational Linguistics
Vancouver, Canada
@@ -4098,7 +4098,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Users and Data: The Two Neglected Children of Bilingual Natural Language Processing Research - Phillippe Langlais + PhillippeLanglais 1–5 W17-2501 10.18653/v1/W17-2501 @@ -4106,10 +4106,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Deep Investigation of Cross-Language Plagiarism Detection Methods - Jérémy Ferrero - Laurent Besacier - Didier Schwab - Frédéric Agnès + JérémyFerrero + LaurentBesacier + DidierSchwab + FrédéricAgnès 6–15 W17-2502 10.18653/v1/W17-2502 @@ -4118,8 +4118,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Sentence Alignment using Unfolding Recursive Autoencoders - Jeenu Grover - Pabitra Mitra + JeenuGrover + PabitraMitra 16–20 W17-2503 10.18653/v1/W17-2503 @@ -4127,8 +4127,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Acquisition of Translation Lexicons for Historically Unwritten Languages via Bridging Loanwords - Michael Bloodgood - Benjamin Strauss + MichaelBloodgood + BenjaminStrauss 21–25 W17-2504 10.18653/v1/W17-2504 @@ -4137,7 +4137,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Toward a Comparable Corpus of <fixed-case>L</fixed-case>atvian, <fixed-case>R</fixed-case>ussian and <fixed-case>E</fixed-case>nglish Tweets - Dmitrijs Milajevs + DmitrijsMilajevs 26–30 W17-2505 10.18653/v1/W17-2505 @@ -4145,9 +4145,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Automatic Extraction of Parallel Speech Corpora from Dubbed Movies - Alp Öktem - Mireia Farrús - Leo Wanner + AlpÖktem + MireiaFarrús + LeoWanner 31–35 W17-2506 10.18653/v1/W17-2506 @@ -4156,7 +4156,7 @@ is able to handle phenomena related to scope by means of an higher-order type th A parallel collection of clinical trials in <fixed-case>P</fixed-case>ortuguese and <fixed-case>E</fixed-case>nglish - Mariana Neves + MarianaNeves 36–40 W17-2507 10.18653/v1/W17-2507 @@ -4165,9 +4165,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Weighted Set-Theoretic Alignment of Comparable Sentences - Andoni Azpeitia - Thierry Etchegoyhen - Eva Martínez Garcia + AndoniAzpeitia + ThierryEtchegoyhen + EvaMartínez Garcia 41–45 W17-2508 10.18653/v1/W17-2508 @@ -4175,8 +4175,8 @@ is able to handle phenomena related to scope by means of an higher-order type th <fixed-case>BUCC</fixed-case> 2017 Shared Task: a First Attempt Toward a Deep Learning Framework for Identifying Parallel Sentences in Comparable Corpora - Francis Grégoire - Philippe Langlais + FrancisGrégoire + PhilippeLanglais 46–50 W17-2509 10.18653/v1/W17-2509 @@ -4184,8 +4184,8 @@ is able to handle phenomena related to scope by means of an higher-order type th z<fixed-case>NLP</fixed-case>: Identifying Parallel Sentences in <fixed-case>C</fixed-case>hinese-<fixed-case>E</fixed-case>nglish Comparable Corpora - Zheng Zhang - Pierre Zweigenbaum + ZhengZhang + PierreZweigenbaum 51–55 W17-2510 10.18653/v1/W17-2510 @@ -4193,9 +4193,9 @@ is able to handle phenomena related to scope by means of an higher-order type th <fixed-case>BUCC</fixed-case>2017: A Hybrid Approach for Identifying Parallel Sentences in Comparable Corpora - Sainik Mahata - Dipankar Das - Sivaji Bandyopadhyay + SainikMahata + DipankarDas + SivajiBandyopadhyay 56–59 W17-2511 10.18653/v1/W17-2511 @@ -4203,9 +4203,9 @@ is able to handle phenomena related to scope by means of an 
higher-order type th Overview of the Second <fixed-case>BUCC</fixed-case> Shared Task: Spotting Parallel Sentences in Comparable Corpora - Pierre Zweigenbaum - Serge Sharoff - Reinhard Rapp + PierreZweigenbaum + SergeSharoff + ReinhardRapp 60–67 W17-2512 10.18653/v1/W17-2512 @@ -4237,7 +4237,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Sense Contextualization in a Dependency-Based Compositional Distributional Model - Pablo Gamallo + PabloGamallo 1–9 W17-2601 10.18653/v1/W17-2601 @@ -4245,7 +4245,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Context encoders as a simple but powerful extension of word2vec - Franziska Horn + FranziskaHorn 10–14 W17-2602 10.18653/v1/W17-2602 @@ -4253,14 +4253,14 @@ is able to handle phenomena related to scope by means of an higher-order type th Machine Comprehension by Text-to-Text Neural Question Generation - Xingdi Yuan - Tong Wang - Caglar Gulcehre - Alessandro Sordoni - Philip Bachman - Saizheng Zhang - Sandeep Subramanian - Adam Trischler + XingdiYuan + TongWang + CaglarGulcehre + AlessandroSordoni + PhilipBachman + SaizhengZhang + SandeepSubramanian + AdamTrischler 15–25 W17-2603 10.18653/v1/W17-2603 @@ -4268,10 +4268,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Emergent Predication Structure in Hidden State Vectors of Neural Readers - Hai Wang - Takeshi Onishi - Kevin Gimpel - David McAllester + HaiWang + TakeshiOnishi + KevinGimpel + DavidMcAllester 26–36 W17-2604 10.18653/v1/W17-2604 @@ -4279,8 +4279,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Towards Harnessing Memory Networks for Coreference Resolution - Joe Cheri - Pushpak Bhattacharyya + JoeCheri + PushpakBhattacharyya 37–42 W17-2605 10.18653/v1/W17-2605 @@ -4288,9 +4288,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Combining Word-Level and Character-Level Representations for Relation Classification of Informal Text - Dongyun Liang - Weiran Xu - Yinge Zhao + DongyunLiang + WeiranXu + YingeZhao 43–47 W17-2606 10.18653/v1/W17-2606 @@ -4298,10 +4298,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Transfer Learning for Neural Semantic Parsing - Xing Fan - Emilio Monti - Lambert Mathias - Markus Dreyer + XingFan + EmilioMonti + LambertMathias + MarkusDreyer 48–56 W17-2607 10.18653/v1/W17-2607 @@ -4309,10 +4309,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Modeling Large-Scale Structured Relationships with Shared Memory for Knowledge Base Completion - Yelong Shen - Po-Sen Huang - Ming-Wei Chang - Jianfeng Gao + YelongShen + Po-SenHuang + Ming-WeiChang + JianfengGao 57–68 W17-2608 10.18653/v1/W17-2608 @@ -4320,9 +4320,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Knowledge Base Completion: Baselines Strike Back - Rudolf Kadlec - Ondrej Bajgar - Jan Kleindienst + RudolfKadlec + OndrejBajgar + JanKleindienst 69–74 W17-2609 10.18653/v1/W17-2609 @@ -4330,9 +4330,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Sequential Attention: A Context-Aware Alignment Function for Machine Reading - Sebastian Brarda - Philip Yeres - Samuel Bowman + SebastianBrarda + PhilipYeres + SamuelBowman 75–80 W17-2610 10.18653/v1/W17-2610 @@ -4340,12 +4340,12 @@ is able to handle phenomena related to scope by means of an higher-order type th Semantic Vector Encoding and 
Similarity Search Using Fulltext Search Engines - Jan Rygl - Jan Pomikálek - Radim Řehůřek - Michal Růžička - Vít Novotný - Petr Sojka + JanRygl + JanPomikálek + RadimŘehůřek + MichalRůžička + VítNovotný + PetrSojka 81–90 W17-2611 10.18653/v1/W17-2611 @@ -4353,8 +4353,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Multi-task Domain Adaptation for Sequence Tagging - Nanyun Peng - Mark Dredze + NanyunPeng + MarkDredze 91–100 W17-2612 10.18653/v1/W17-2612 @@ -4362,11 +4362,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Beyond Bilingual: Multi-sense Word Embeddings using Multilingual Context - Shyam Upadhyay - Kai-Wei Chang - Matt Taddy - Adam Kalai - James Zou + ShyamUpadhyay + Kai-WeiChang + MattTaddy + AdamKalai + JamesZou 101–110 W17-2613 10.18653/v1/W17-2613 @@ -4374,10 +4374,10 @@ is able to handle phenomena related to scope by means of an higher-order type th <fixed-case>D</fixed-case>oc<fixed-case>T</fixed-case>ag2<fixed-case>V</fixed-case>ec: An Embedding Based Multi-label Learning Approach for Document Tagging - Sheng Chen - Akshay Soni - Aasish Pappu - Yashar Mehdad + ShengChen + AkshaySoni + AasishPappu + YasharMehdad 111–120 W17-2614 10.18653/v1/W17-2614 @@ -4385,8 +4385,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Binary Paragraph Vectors - Karol Grzegorczyk - Marcin Kurdziel + KarolGrzegorczyk + MarcinKurdziel 121–130 W17-2615 10.18653/v1/W17-2615 @@ -4394,9 +4394,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Representing Compositionality based on Multiple Timescales Gated Recurrent Neural Networks with Adaptive Temporal Hierarchy for Character-Level Language Models - Dennis Singh Moirangthem - Jegyung Son - Minho Lee + Dennis SinghMoirangthem + JegyungSon + MinhoLee 131–138 W17-2616 10.18653/v1/W17-2616 @@ -4404,8 +4404,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Learning Bilingual Projections of Embeddings for Vocabulary Expansion in Machine Translation - Pranava Swaroop Madhyastha - Cristina España-Bonet + Pranava SwaroopMadhyastha + CristinaEspaña-Bonet 139–145 W17-2617 10.18653/v1/W17-2617 @@ -4413,9 +4413,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Prediction of Frame-to-Frame Relations in the <fixed-case>F</fixed-case>rame<fixed-case>N</fixed-case>et Hierarchy with Frame Embeddings - Teresa Botschen - Hatem Mousselly-Sergieh - Iryna Gurevych + TeresaBotschen + HatemMousselly-Sergieh + IrynaGurevych 146–156 W17-2618 10.18653/v1/W17-2618 @@ -4423,8 +4423,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Learning Joint Multilingual Sentence Representations with Neural Machine Translation - Holger Schwenk - Matthijs Douze + HolgerSchwenk + MatthijsDouze 157–167 W17-2619 10.18653/v1/W17-2619 @@ -4432,12 +4432,12 @@ is able to handle phenomena related to scope by means of an higher-order type th Transfer Learning for Speech Recognition on a Budget - Julius Kunze - Louis Kirsch - Ilia Kurenkov - Andreas Krug - Jens Johannsmeier - Sebastian Stober + JuliusKunze + LouisKirsch + IliaKurenkov + AndreasKrug + JensJohannsmeier + SebastianStober 168–177 W17-2620 10.18653/v1/W17-2620 @@ -4445,8 +4445,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Gradual Learning of Matrix-Space Models of Language for Sentiment Analysis - Shima Asaadi - Sebastian Rudolph + ShimaAsaadi + 
SebastianRudolph 178–185 W17-2621 10.18653/v1/W17-2621 @@ -4454,9 +4454,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Improving Language Modeling using Densely Connected Recurrent Neural Networks - Fréderic Godin - Joni Dambre - Wesley De Neve + FrédericGodin + JoniDambre + WesleyDe Neve 186–190 W17-2622 10.18653/v1/W17-2622 @@ -4464,13 +4464,13 @@ is able to handle phenomena related to scope by means of an higher-order type th <fixed-case>N</fixed-case>ews<fixed-case>QA</fixed-case>: A Machine Comprehension Dataset - Adam Trischler - Tong Wang - Xingdi Yuan - Justin Harris - Alessandro Sordoni - Philip Bachman - Kaheer Suleman + AdamTrischler + TongWang + XingdiYuan + JustinHarris + AlessandroSordoni + PhilipBachman + KaheerSuleman 191–200 W17-2623 10.18653/v1/W17-2623 @@ -4478,11 +4478,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Intrinsic and Extrinsic Evaluation of Spatiotemporal Text Representations in Twitter Streams - Lawrence Phillips - Kyle Shaffer - Dustin Arendt - Nathan Hodas - Svitlana Volkova + LawrencePhillips + KyleShaffer + DustinArendt + NathanHodas + SvitlanaVolkova 201–210 W17-2624 10.18653/v1/W17-2624 @@ -4490,11 +4490,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Rethinking Skip-thought: A Neighborhood based Approach - Shuai Tang - Hailin Jin - Chen Fang - Zhaowen Wang - Virginia de Sa + ShuaiTang + HailinJin + ChenFang + ZhaowenWang + Virginiade Sa 211–218 W17-2625 10.18653/v1/W17-2625 @@ -4503,10 +4503,10 @@ is able to handle phenomena related to scope by means of an higher-order type th A Frame Tracking Model for Memory-Enhanced Dialogue Systems - Hannes Schulz - Jeremie Zumer - Layla El Asri - Shikhar Sharma + HannesSchulz + JeremieZumer + LaylaEl Asri + ShikharSharma 219–227 W17-2626 10.18653/v1/W17-2626 @@ -4514,10 +4514,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Plan, Attend, Generate: Character-Level Neural Machine Translation with Planning - Caglar Gulcehre - Francis Dutil - Adam Trischler - Yoshua Bengio + CaglarGulcehre + FrancisDutil + AdamTrischler + YoshuaBengio 228–234 W17-2627 10.18653/v1/W17-2627 @@ -4525,9 +4525,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Does the Geometry of Word Embeddings Help Document Classification? 
A Case Study on Persistent Homology-Based Representations - Paul Michel - Abhilasha Ravichander - Shruti Rijhwani + PaulMichel + AbhilashaRavichander + ShrutiRijhwani 235–240 W17-2628 10.18653/v1/W17-2628 @@ -4535,11 +4535,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Adversarial Generation of Natural Language - Sandeep Subramanian - Sai Rajeswar - Francis Dutil - Chris Pal - Aaron Courville + SandeepSubramanian + SaiRajeswar + FrancisDutil + ChrisPal + AaronCourville 241–251 W17-2629 10.18653/v1/W17-2629 @@ -4547,11 +4547,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Deep Active Learning for Named Entity Recognition - Yanyao Shen - Hyokun Yun - Zachary Lipton - Yakov Kronrod - Animashree Anandkumar + YanyaoShen + HyokunYun + ZacharyLipton + YakovKronrod + AnimashreeAnandkumar 252–256 W17-2630 10.18653/v1/W17-2630 @@ -4559,8 +4559,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Learning when to skim and when to read - Alexander Johansen - Richard Socher + AlexanderJohansen + RichardSocher 257–264 W17-2631 10.18653/v1/W17-2631 @@ -4568,9 +4568,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Learning to Embed Words in Context for Syntactic Tasks - Lifu Tu - Kevin Gimpel - Karen Livescu + LifuTu + KevinGimpel + KarenLivescu 265–275 W17-2632 10.18653/v1/W17-2632 @@ -4600,8 +4600,8 @@ is able to handle phenomena related to scope by means of an higher-order type th news<fixed-case>L</fixed-case>ens: building and visualizing long-ranging news stories - Philippe Laban - Marti Hearst + PhilippeLaban + MartiHearst 1–9 W17-2701 10.18653/v1/W17-2701 @@ -4609,8 +4609,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Detecting Changes in Twitter Streams using Temporal Clusters of Hashtags - Yunli Wang - Cyril Goutte + YunliWang + CyrilGoutte 10–14 W17-2702 10.18653/v1/W17-2702 @@ -4618,9 +4618,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Event Detection Using Frame-Semantic Parser - Evangelia Spiliopoulou - Eduard Hovy - Teruko Mitamura + EvangeliaSpiliopoulou + EduardHovy + TerukoMitamura 15–20 W17-2703 10.18653/v1/W17-2703 @@ -4628,8 +4628,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Improving Shared Argument Identification in <fixed-case>J</fixed-case>apanese Event Knowledge Acquisition - Yin Jou Huang - Sadao Kurohashi + Yin JouHuang + SadaoKurohashi 21–30 W17-2704 10.18653/v1/W17-2704 @@ -4637,9 +4637,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Tracing armed conflicts with diachronic word embedding models - Andrey Kutuzov - Erik Velldal - Lilja Øvrelid + AndreyKutuzov + ErikVelldal + LiljaØvrelid 31–36 W17-2705 10.18653/v1/W17-2705 @@ -4648,9 +4648,9 @@ is able to handle phenomena related to scope by means of an higher-order type th The Circumstantial Event Ontology (<fixed-case>CEO</fixed-case>) - Roxane Segers - Tommaso Caselli - Piek Vossen + RoxaneSegers + TommasoCaselli + PiekVossen 37–41 W17-2706 10.18653/v1/W17-2706 @@ -4658,15 +4658,15 @@ is able to handle phenomena related to scope by means of an higher-order type th Event Detection and Semantic Storytelling: Generating a Travelogue from a large Collection of Personal Letters - Georg Rehm - Julian Moreno Schneider - Peter Bourgonje - Ankit Srivastava - Jan Nehring - Armin Berger - Luca König - Sören Räuchle - Jens Gerth + 
GeorgRehm + JulianMoreno Schneider + PeterBourgonje + AnkitSrivastava + JanNehring + ArminBerger + LucaKönig + SörenRäuchle + JensGerth 42–51 W17-2707 10.18653/v1/W17-2707 @@ -4674,9 +4674,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Inference of Fine-Grained Event Causality from Blogs and Films - Zhichao Hu - Elahe Rahimtoroghi - Marilyn Walker + ZhichaoHu + ElaheRahimtoroghi + MarilynWalker 52–58 W17-2708 10.18653/v1/W17-2708 @@ -4684,10 +4684,10 @@ is able to handle phenomena related to scope by means of an higher-order type th On the Creation of a Security-Related Event Corpus - Martin Atkinson - Jakub Piskorski - Hristo Tanev - Vanni Zavarella + MartinAtkinson + JakubPiskorski + HristoTanev + VanniZavarella 59–65 W17-2709 10.18653/v1/W17-2709 @@ -4695,7 +4695,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Inducing Event Types and Roles in Reverse: Using Function to Discover Theme - Natalie Ahn + NatalieAhn 66–76 W17-2710 10.18653/v1/W17-2710 @@ -4703,8 +4703,8 @@ is able to handle phenomena related to scope by means of an higher-order type th The Event <fixed-case>S</fixed-case>tory<fixed-case>L</fixed-case>ine Corpus: A New Benchmark for Causal and Temporal Relation Extraction - Tommaso Caselli - Piek Vossen + TommasoCaselli + PiekVossen 77–86 W17-2711 10.18653/v1/W17-2711 @@ -4712,10 +4712,10 @@ is able to handle phenomena related to scope by means of an higher-order type th The Rich Event Ontology - Susan Brown - Claire Bonial - Leo Obrst - Martha Palmer + SusanBrown + ClaireBonial + LeoObrst + MarthaPalmer 87–97 W17-2712 10.18653/v1/W17-2712 @@ -4723,9 +4723,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Integrating Decompositional Event Structures into Storylines - William Croft - Pavlína Pešková - Michael Regan + WilliamCroft + PavlínaPešková + MichaelRegan 98–109 W17-2713 10.18653/v1/W17-2713 @@ -4752,10 +4752,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Grounding Language for Interactive Task Learning - Peter Lindes - Aaron Mininger - James R. Kirk - John E. Laird + PeterLindes + AaronMininger + James R.Kirk + John E.Laird 1–9 W17-2801 10.18653/v1/W17-2801 @@ -4765,9 +4765,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Learning how to Learn: An Adaptive Dialogue Agent for Incrementally Learning Visually Grounded Word Meanings - Yanchao Yu - Arash Eshghi - Oliver Lemon + YanchaoYu + ArashEshghi + OliverLemon 10–19 W17-2802 10.18653/v1/W17-2802 @@ -4775,9 +4775,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Guiding Interaction Behaviors for Multi-modal Grounded Language Learning - Jesse Thomason - Jivko Sinapov - Raymond Mooney + JesseThomason + JivkoSinapov + RaymondMooney 20–24 W17-2803 10.18653/v1/W17-2803 @@ -4785,10 +4785,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Structured Learning for Context-aware Spoken Language Understanding of Robotic Commands - Andrea Vanzo - Danilo Croce - Roberto Basili - Daniele Nardi + AndreaVanzo + DaniloCroce + RobertoBasili + DanieleNardi 25–34 W17-2804 10.18653/v1/W17-2804 @@ -4796,11 +4796,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Natural Language Grounding and Grammar Induction for Robotic Manipulation Commands - Muhannad Alomari - Paul Duckworth - Majd Hawasly - David C. Hogg - Anthony G. 
Cohn + MuhannadAlomari + PaulDuckworth + MajdHawasly + David C.Hogg + Anthony G.Cohn 35–43 W17-2805 10.18653/v1/W17-2805 @@ -4808,8 +4808,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Communication with Robots using Multilayer Recurrent Networks - Bedřich Pišl - David Mareček + BedřichPišl + DavidMareček 44–48 W17-2806 10.18653/v1/W17-2806 @@ -4817,10 +4817,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Grounding Symbols in Multi-Modal Instructions - Yordan Hristov - Svetlin Penkov - Alex Lascarides - Subramanian Ramamoorthy + YordanHristov + SvetlinPenkov + AlexLascarides + SubramanianRamamoorthy 49–57 W17-2807 10.18653/v1/W17-2807 @@ -4828,15 +4828,15 @@ is able to handle phenomena related to scope by means of an higher-order type th Exploring Variation of Natural Human Commands to a Robot in a Collaborative Navigation Task - Matthew Marge - Claire Bonial - Ashley Foots - Cory Hayes - Cassidy Henry - Kimberly Pollard - Ron Artstein - Clare Voss - David Traum + MatthewMarge + ClaireBonial + AshleyFoots + CoryHayes + CassidyHenry + KimberlyPollard + RonArtstein + ClareVoss + DavidTraum 58–66 W17-2808 10.18653/v1/W17-2808 @@ -4845,13 +4845,13 @@ is able to handle phenomena related to scope by means of an higher-order type th A Tale of Two <fixed-case>DRAGGN</fixed-case>s: A Hybrid Approach for Interpreting Action-Oriented and Goal-Oriented Instructions - Siddharth Karamcheti - Edward Clem Williams - Dilip Arumugam - Mina Rhee - Nakul Gopalan - Lawson L.S. Wong - Stefanie Tellex + SiddharthKaramcheti + Edward ClemWilliams + DilipArumugam + MinaRhee + NakulGopalan + Lawson L.S.Wong + StefanieTellex 67–75 W17-2809 10.18653/v1/W17-2809 @@ -4859,8 +4859,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Are Distributional Representations Ready for the Real World? Evaluating Word Vectors for Grounded Perceptual Meaning - Li Lucy - Jon Gauthier + LiLucy + JonGauthier 76–85 W17-2810 10.18653/v1/W17-2810 @@ -4868,10 +4868,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Sympathy Begins with a Smile, Intelligence Begins with a Word: Use of Multimodal Features in Spoken Human-Robot Interaction - Jekaterina Novikova - Christian Dondrup - Ioannis Papaioannou - Oliver Lemon + JekaterinaNovikova + ChristianDondrup + IoannisPapaioannou + OliverLemon 86–94 W17-2811 10.18653/v1/W17-2811 @@ -4879,16 +4879,16 @@ is able to handle phenomena related to scope by means of an higher-order type th Towards Problem Solving Agents that Communicate and Learn - Anjali Narayan-Chen - Colin Graber - Mayukh Das - Md Rakibul Islam - Soham Dan - Sriraam Natarajan - Janardhan Rao Doppa - Julia Hockenmaier - Martha Palmer - Dan Roth + AnjaliNarayan-Chen + ColinGraber + MayukhDas + Md RakibulIslam + SohamDan + SriraamNatarajan + Janardhan RaoDoppa + JuliaHockenmaier + MarthaPalmer + DanRoth 95–103 W17-2812 10.18653/v1/W17-2812 @@ -4917,9 +4917,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Language-independent Gender Prediction on Twitter - Nikola Ljubešić - Darja Fišer - Tomaž Erjavec + NikolaLjubešić + DarjaFišer + TomažErjavec 1–6 W17-2901 10.18653/v1/W17-2901 @@ -4927,8 +4927,8 @@ is able to handle phenomena related to scope by means of an higher-order type th When does a compliment become sexist? 
Analysis and classification of ambivalent sexism using twitter data - Akshita Jha - Radhika Mamidi + AkshitaJha + RadhikaMamidi 7–16 W17-2902 10.18653/v1/W17-2902 @@ -4936,9 +4936,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Personality Driven Differences in Paraphrase Preference - Daniel Preoţiuc-Pietro - Jordan Carpenter - Lyle Ungar + DanielPreoţiuc-Pietro + JordanCarpenter + LyleUngar 17–26 W17-2903 10.18653/v1/W17-2903 @@ -4947,7 +4947,7 @@ is able to handle phenomena related to scope by means of an higher-order type th community2vec: Vector representations of online communities encode semantic relationships - Trevor Martin + TrevorMartin 27–31 W17-2904 10.18653/v1/W17-2904 @@ -4955,10 +4955,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Telling Apart Tweets Associated with Controversial versus Non-Controversial Topics - Aseel Addawood - Rezvaneh Rezapour - Omid Abdar - Jana Diesner + AseelAddawood + RezvanehRezapour + OmidAbdar + JanaDiesner 32–41 W17-2905 10.18653/v1/W17-2905 @@ -4966,9 +4966,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Cross-Lingual Classification of Topics in Political Texts - Goran Glavaš - Federico Nanni - Simone Paolo Ponzetto + GoranGlavaš + FedericoNanni + Simone PaoloPonzetto 42–46 W17-2906 10.18653/v1/W17-2906 @@ -4976,8 +4976,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Mining Social Science Publications for Survey Variables - Andrea Zielinski - Peter Mutschke + AndreaZielinski + PeterMutschke 47–52 W17-2907 10.18653/v1/W17-2907 @@ -4985,12 +4985,12 @@ is able to handle phenomena related to scope by means of an higher-order type th Linguistic Markers of Influence in Informal Interactions - Shrimai Prabhumoye - Samridhi Choudhary - Evangelia Spiliopoulou - Christopher Bogart - Carolyn Rose - Alan W Black + ShrimaiPrabhumoye + SamridhiChoudhary + EvangeliaSpiliopoulou + ChristopherBogart + CarolynRose + Alan WBlack 53–62 W17-2908 10.18653/v1/W17-2908 @@ -4998,10 +4998,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Non-lexical Features Encode Political Affiliation on Twitter - Rachael Tatman - Leo Stewart - Amandalynne Paullada - Emma Spiro + RachaelTatman + LeoStewart + AmandalynnePaullada + EmmaSpiro 63–67 W17-2909 10.18653/v1/W17-2909 @@ -5009,7 +5009,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Modelling Participation in Small Group Social Sequences with <fixed-case>M</fixed-case>arkov Rewards Analysis - Gabriel Murray + GabrielMurray 68–72 W17-2910 10.18653/v1/W17-2910 @@ -5017,10 +5017,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Code-Switching as a Social Act: The Case of <fixed-case>A</fixed-case>rabic <fixed-case>W</fixed-case>ikipedia Talk Pages - Michael Yoder - Shruti Rijhwani - Carolyn Rosé - Lori Levin + MichaelYoder + ShrutiRijhwani + CarolynRosé + LoriLevin 73–82 W17-2911 10.18653/v1/W17-2911 @@ -5028,10 +5028,10 @@ is able to handle phenomena related to scope by means of an higher-order type th How Does Twitter User Behavior Vary Across Demographic Groups? 
- Zach Wood-Doughty - Michael Smith - David Broniatowski - Mark Dredze + ZachWood-Doughty + MichaelSmith + DavidBroniatowski + MarkDredze 83–89 W17-2912 10.18653/v1/W17-2912 @@ -5039,9 +5039,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Ideological Phrase Indicators for Classification of Political Discourse Framing on Twitter - Kristen Johnson - I-Ta Lee - Dan Goldwasser + KristenJohnson + I-TaLee + DanGoldwasser 90–99 W17-2913 10.18653/v1/W17-2913 @@ -5067,8 +5067,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Dimensions of Abusive Language on Twitter - Isobelle Clarke - Jack Grieve + IsobelleClarke + JackGrieve 1–10 W17-3001 10.18653/v1/W17-3001 @@ -5076,8 +5076,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Constructive Language in News Comments - Varada Kolhatkar - Maite Taboada + VaradaKolhatkar + MaiteTaboada 11–17 W17-3002 10.18653/v1/W17-3002 @@ -5085,10 +5085,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Rephrasing Profanity in <fixed-case>C</fixed-case>hinese Text - Hui-Po Su - Zhen-Jie Huang - Hao-Tsung Chang - Chuan-Jie Lin + Hui-PoSu + Zhen-JieHuang + Hao-TsungChang + Chuan-JieLin 18–24 W17-3003 10.18653/v1/W17-3003 @@ -5096,9 +5096,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Deep Learning for User Comment Moderation - John Pavlopoulos - Prodromos Malakasiotis - Ion Androutsopoulos + JohnPavlopoulos + ProdromosMalakasiotis + IonAndroutsopoulos 25–35 W17-3004 10.18653/v1/W17-3004 @@ -5106,12 +5106,12 @@ is able to handle phenomena related to scope by means of an higher-order type th Class-based Prediction Errors to Detect Hate Speech with Out-of-vocabulary Words - Joan Serrà - Ilias Leontiadis - Dimitris Spathis - Gianluca Stringhini - Jeremy Blackburn - Athena Vakali + JoanSerrà + IliasLeontiadis + DimitrisSpathis + GianlucaStringhini + JeremyBlackburn + AthenaVakali 36–40 W17-3005 10.18653/v1/W17-3005 @@ -5119,8 +5119,8 @@ is able to handle phenomena related to scope by means of an higher-order type th One-step and Two-step Classification for Abusive Language Detection on Twitter - Ji Ho Park - Pascale Fung + Ji HoPark + PascaleFung 41–45 W17-3006 10.18653/v1/W17-3006 @@ -5128,9 +5128,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Legal Framework, Dataset and Annotation Schema for Socially Unacceptable Online Discourse Practices in <fixed-case>S</fixed-case>lovene - Darja Fišer - Tomaž Erjavec - Nikola Ljubešić + DarjaFišer + TomažErjavec + NikolaLjubešić 46–51 W17-3007 10.18653/v1/W17-3007 @@ -5138,9 +5138,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Abusive Language Detection on <fixed-case>A</fixed-case>rabic Social Media - Hamdy Mubarak - Kareem Darwish - Walid Magdy + HamdyMubarak + KareemDarwish + WalidMagdy 52–56 W17-3008 10.18653/v1/W17-3008 @@ -5148,11 +5148,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Vectors for Counterspeech on Twitter - Lucas Wright - Derek Ruths - Kelly P Dillon - Haji Mohammad Saleem - Susan Benesch + LucasWright + DerekRuths + Kelly PDillon + Haji MohammadSaleem + SusanBenesch 57–62 W17-3009 10.18653/v1/W17-3009 @@ -5160,11 +5160,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Detecting Nastiness in Social Media - Niloofar Safi Samghabadi - Suraj Maharjan - Alan Sprague - Raquel 
Diaz-Sprague - Thamar Solorio + NiloofarSafi Samghabadi + SurajMaharjan + AlanSprague + RaquelDiaz-Sprague + ThamarSolorio 63–72 W17-3010 10.18653/v1/W17-3010 @@ -5172,13 +5172,13 @@ is able to handle phenomena related to scope by means of an higher-order type th Technology Solutions to Combat Online Harassment - George Kennedy - Andrew McCollough - Edward Dixon - Alexei Bastidas - John Ryan - Chris Loo - Saurav Sahay + GeorgeKennedy + AndrewMcCollough + EdwardDixon + AlexeiBastidas + JohnRyan + ChrisLoo + SauravSahay 73–77 W17-3011 10.18653/v1/W17-3011 @@ -5186,10 +5186,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Understanding Abuse: A Typology of Abusive Language Detection Subtasks - Zeerak Waseem - Thomas Davidson - Dana Warmsley - Ingmar Weber + ZeerakWaseem + ThomasDavidson + DanaWarmsley + IngmarWeber 78–84 W17-3012 10.18653/v1/W17-3012 @@ -5197,8 +5197,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Using Convolutional Neural Networks to Classify Hate-Speech - Björn Gambäck - Utpal Kumar Sikdar + BjörnGambäck + Utpal KumarSikdar 85–90 W17-3013 10.18653/v1/W17-3013 @@ -5206,9 +5206,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Illegal is not a Noun: Linguistic Form for Detection of Pejorative Nominalizations - Alexis Palmer - Melissa Robinson - Kristy K. Phillips + AlexisPalmer + MelissaRobinson + Kristy K.Phillips 91–100 W17-3014 10.18653/v1/W17-3014 @@ -5233,9 +5233,9 @@ is able to handle phenomena related to scope by means of an higher-order type th A Cross-modal Review of Indicators for Depression Detection Systems - Michelle Morales - Stefan Scherer - Rivka Levitan + MichelleMorales + StefanScherer + RivkaLevitan 1–12 W17-3101 10.18653/v1/W17-3101 @@ -5243,11 +5243,11 @@ is able to handle phenomena related to scope by means of an higher-order type th In your wildest dreams: the language and psychological features of dreams - Kate Niederhoffer - Jonathan Schler - Patrick Crutchley - Kate Loveys - Glen Coppersmith + KateNiederhoffer + JonathanSchler + PatrickCrutchley + KateLoveys + GlenCoppersmith 13–25 W17-3102 10.18653/v1/W17-3102 @@ -5255,11 +5255,11 @@ is able to handle phenomena related to scope by means of an higher-order type th A Corpus Analysis of Social Connections and Social Isolation in Adolescents Suffering from Depressive Disorders - Jia-Wen Guo - Danielle L Mowery - Djin Lai - Katherine Sward - Mike Conway + Jia-WenGuo + Danielle LMowery + DjinLai + KatherineSward + MikeConway 26–31 W17-3103 10.18653/v1/W17-3103 @@ -5267,10 +5267,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Monitoring Tweets for Depression to Detect At-risk Users - Zunaira Jamil - Diana Inkpen - Prasadith Buddhitha - Kenton White + ZunairaJamil + DianaInkpen + PrasadithBuddhitha + KentonWhite 32–40 W17-3104 10.18653/v1/W17-3104 @@ -5278,9 +5278,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Investigating Patient Attitudes Towards the use of Social Media Data to Augment Depression Diagnosis and Treatment: a Qualitative Study - Jude Mikal - Samantha Hurst - Mike Conway + JudeMikal + SamanthaHurst + MikeConway 41–47 W17-3105 10.18653/v1/W17-3105 @@ -5288,9 +5288,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Natural-language Interactive Narratives in Imaginal Exposure Therapy for Obsessive-Compulsive Disorder - Melissa Roemmele - Paola Mardo - Andrew Gordon + 
MelissaRoemmele + PaolaMardo + AndrewGordon 48–57 W17-3106 10.18653/v1/W17-3106 @@ -5298,8 +5298,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Detecting Anxiety through <fixed-case>R</fixed-case>eddit - Judy Hanwen Shen - Frank Rudzicz + Judy HanwenShen + FrankRudzicz 58–65 W17-3107 10.18653/v1/W17-3107 @@ -5307,9 +5307,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Detecting and Explaining Crisis - Rohan Kshirsagar - Robert Morris - Samuel Bowman + RohanKshirsagar + RobertMorris + SamuelBowman 66–73 W17-3108 10.18653/v1/W17-3108 @@ -5317,8 +5317,8 @@ is able to handle phenomena related to scope by means of an higher-order type th A Dictionary-Based Comparison of Autobiographies by People and Murderous Monsters - Micah Iserman - Molly Ireland + MicahIserman + MollyIreland 74–84 W17-3109 10.18653/v1/W17-3109 @@ -5326,10 +5326,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Small but Mighty: Affective Micropatterns for Quantifying Mental Health from Social Media Language - Kate Loveys - Patrick Crutchley - Emily Wyatt - Glen Coppersmith + KateLoveys + PatrickCrutchley + EmilyWyatt + GlenCoppersmith 85–95 W17-3110 10.18653/v1/W17-3110 @@ -5355,8 +5355,8 @@ is able to handle phenomena related to scope by means of an higher-order type th An Empirical Study of Adequate Vision Span for Attention-Based Neural Machine Translation - Raphael Shu - Hideki Nakayama + RaphaelShu + HidekiNakayama 1–10 W17-3201 10.18653/v1/W17-3201 @@ -5364,10 +5364,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Analyzing Neural <fixed-case>MT</fixed-case> Search and Model Performance - Jan Niehues - Eunah Cho - Thanh-Le Ha - Alex Waibel + JanNiehues + EunahCho + Thanh-LeHa + AlexWaibel 11–17 W17-3202 10.18653/v1/W17-3202 @@ -5375,8 +5375,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Stronger Baselines for Trustable Results in Neural Machine Translation - Michael Denkowski - Graham Neubig + MichaelDenkowski + GrahamNeubig 18–27 W17-3203 10.18653/v1/W17-3203 @@ -5384,8 +5384,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Six Challenges for Neural Machine Translation - Philipp Koehn - Rebecca Knowles + PhilippKoehn + RebeccaKnowles 28–39 W17-3204 10.18653/v1/W17-3204 @@ -5393,10 +5393,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Cost Weighting for Neural Machine Translation Domain Adaptation - Boxing Chen - Colin Cherry - George Foster - Samuel Larkin + BoxingChen + ColinCherry + GeorgeFoster + SamuelLarkin 40–46 W17-3205 10.18653/v1/W17-3205 @@ -5404,8 +5404,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Detecting Untranslated Content for Neural Machine Translation - Isao Goto - Hideki Tanaka + IsaoGoto + HidekiTanaka 47–55 W17-3206 10.18653/v1/W17-3206 @@ -5413,8 +5413,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Beam Search Strategies for Neural Machine Translation - Markus Freitag - Yaser Al-Onaizan + MarkusFreitag + YaserAl-Onaizan 56–60 W17-3207 10.18653/v1/W17-3207 @@ -5422,12 +5422,12 @@ is able to handle phenomena related to scope by means of an higher-order type th An Empirical Study of Mini-Batch Creation Strategies for Neural Machine Translation - Makoto Morishita - Yusuke Oda - Graham Neubig - Koichiro Yoshino - Katsuhito Sudoh - Satoshi Nakamura + 
MakotoMorishita + YusukeOda + GrahamNeubig + KoichiroYoshino + KatsuhitoSudoh + SatoshiNakamura 61–68 W17-3208 10.18653/v1/W17-3208 @@ -5435,9 +5435,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Detecting Cross-Lingual Semantic Divergence for Neural Machine Translation - Marine Carpuat - Yogarshi Vyas - Xing Niu + MarineCarpuat + YogarshiVyas + XingNiu 69–79 W17-3209 10.18653/v1/W17-3209 @@ -5448,9 +5448,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Proceedings of the 15th Meeting on the Mathematics of Language W17-34 - Makoto Kanazawa - Philippe de Groote - Mehrnoosh Sadrzadeh + MakotoKanazawa + Philippede Groote + MehrnooshSadrzadeh 10.18653/v1/W17-34 Association for Computational Linguistics
London, UK
@@ -5462,109 +5462,109 @@ is able to handle phenomena related to scope by means of an higher-order type th <fixed-case>BE</fixed-case> Is Not the Unique Homomorphism That Makes the Partee Triangle Commute - Junri Shimada + JunriShimada 1–10 W17-3401 10.18653/v1/W17-3401 How Many Stemmata with Root Degree k? - Armin Hoenen - Steffen Eger - Ralf Gehrke + ArminHoenen + SteffenEger + RalfGehrke 11–21 W17-3402 10.18653/v1/W17-3402 On the Logical Complexity of Autosegmental Representations - Adam Jardine + AdamJardine 22–35 W17-3403 10.18653/v1/W17-3403 Extracting Forbidden Factors from Regular Stringsets - James Rogers - Dakotah Lambert + JamesRogers + DakotahLambert 36–46 W17-3404 10.18653/v1/W17-3404 Latent-Variable <fixed-case>PCFG</fixed-case>s: Background and Applications - Shay Cohen + ShayCohen 47–58 W17-3405 10.18653/v1/W17-3405 A Proof-Theoretic Semantics for Transitive Verbs with an Implicit Object - Nissim Francez + NissimFrancez 59–67 W17-3406 10.18653/v1/W17-3406 Why We Speak - Rohit Parikh + RohitParikh 68–74 W17-3407 10.18653/v1/W17-3407 A Monotonicity Calculus and Its Completeness - Thomas Icard - Lawrence Moss - William Tune + ThomasIcard + LawrenceMoss + WilliamTune 75–87 W17-3408 10.18653/v1/W17-3408 <fixed-case>DAG</fixed-case> Automata for Meaning Representation - Frank Drewes + FrankDrewes 88–99 W17-3409 10.18653/v1/W17-3409 (Re)introducing Regular Graph Languages - Sorcha Gilroy - Adam Lopez - Sebastian Maneth - Pijus Simonaitis + SorchaGilroy + AdamLopez + SebastianManeth + PijusSimonaitis 100–113 W17-3410 10.18653/v1/W17-3410 Graph Transductions and Typological Gaps in Morphological Paradigms - Thomas Graf + ThomasGraf 114–126 W17-3411 10.18653/v1/W17-3411 Introducing Structure into Neural Network-Based Semantic Models - Stephen Clark + StephenClark 127 W17-3412 10.18653/v1/W17-3412 Count-Invariance Including Exponentials - Stepan Kuznetsov - Glyn Morrill - Oriol Valentín + StepanKuznetsov + GlynMorrill + OriolValentín 128–139 W17-3413 10.18653/v1/W17-3413 Conjunctive Categorial Grammars - Stepan Kuznetsov - Alexander Okhotin + StepanKuznetsov + AlexanderOkhotin 140–151 W17-3414 10.18653/v1/W17-3414 @@ -5588,10 +5588,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Linguistic realisation as machine translation: Comparing different <fixed-case>MT</fixed-case> models for <fixed-case>AMR</fixed-case>-to-text generation - Thiago Castro Ferreira - Iacer Calixto - Sander Wubben - Emiel Krahmer + ThiagoCastro Ferreira + IacerCalixto + SanderWubben + EmielKrahmer 1–10 W17-3501 10.18653/v1/W17-3501 @@ -5599,7 +5599,7 @@ is able to handle phenomena related to scope by means of an higher-order type th A Survey on Intelligent Poetry Generation: Languages, Features, Techniques, Reutilisation and Evaluation - Hugo Gonçalo Oliveira + HugoGonçalo Oliveira 11–20 W17-3502 10.18653/v1/W17-3502 @@ -5607,9 +5607,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Cross-linguistic differences and similarities in image descriptions - Emiel van Miltenburg - Desmond Elliott - Piek Vossen + Emielvan Miltenburg + DesmondElliott + PiekVossen 21–30 W17-3503 10.18653/v1/W17-3503 @@ -5617,10 +5617,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Content Selection for Real-time Sports News Construction from Commentary Texts - Jin-ge Yao - Jianmin Zhang - Xiaojun Wan - Jianguo Xiao + Jin-geYao + JianminZhang + XiaojunWan + JianguoXiao 31–40 W17-3504 10.18653/v1/W17-3504 @@ -5628,9 +5628,9 @@ is 
able to handle phenomena related to scope by means of an higher-order type th Improving the Naturalness and Expressivity of Language Generation for <fixed-case>S</fixed-case>panish - Cristina Barros - Dimitra Gkatzia - Elena Lloret + CristinaBarros + DimitraGkatzia + ElenaLloret 41–50 W17-3505 10.18653/v1/W17-3505 @@ -5638,9 +5638,9 @@ is able to handle phenomena related to scope by means of an higher-order type th What is the Role of Recurrent Neural Networks (<fixed-case>RNN</fixed-case>s) in an Image Caption Generator? - Marc Tanti - Albert Gatt - Kenneth Camilleri + MarcTanti + AlbertGatt + KennethCamilleri 51–60 W17-3506 10.18653/v1/W17-3506 @@ -5648,11 +5648,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Exploring the Behavior of Classic <fixed-case>REG</fixed-case> Algorithms in the Description of Characters in 3<fixed-case>D</fixed-case> Images - Gonzalo Méndez - Raquel Hervás - Susana Bautista - Adrián Rabadán - Teresa Rodríguez + GonzaloMéndez + RaquelHervás + SusanaBautista + AdriánRabadán + TeresaRodríguez 61–69 W17-3507 10.18653/v1/W17-3507 @@ -5660,9 +5660,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Co-<fixed-case>P</fixed-case>oe<fixed-case>T</fixed-case>ry<fixed-case>M</fixed-case>e: a Co-Creative Interface for the Composition of Poetry - Hugo Gonçalo Oliveira - Tiago Mendes - Ana Boavida + HugoGonçalo Oliveira + TiagoMendes + AnaBoavida 70–71 W17-3508 10.18653/v1/W17-3508 @@ -5670,9 +5670,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Refer-i<fixed-case>TTS</fixed-case>: A System for Referring in Spoken Installments to Objects in Real-World Images - Sina Zarrieß - M. Soledad López Gambino - David Schlangen + SinaZarrieß + M. SoledadLópez Gambino + DavidSchlangen 72–73 W17-3509 10.18653/v1/W17-3509 @@ -5680,7 +5680,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Finding the “right” answers for customers - Frank Schilder + FrankSchilder 74 W17-3510 10.18653/v1/W17-3510 @@ -5688,8 +5688,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Referring Expression Generation under Uncertainty: Algorithm and Evaluation Framework - Tom Williams - Matthias Scheutz + TomWilliams + MatthiasScheutz 75–84 W17-3511 10.18653/v1/W17-3511 @@ -5697,8 +5697,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Natural Language Descriptions for Human Activities in Video Streams - Nouf Alharbi - Yoshihiko Gotoh + NoufAlharbi + YoshihikoGotoh 85–94 W17-3512 10.18653/v1/W17-3512 @@ -5706,9 +5706,9 @@ is able to handle phenomena related to scope by means of an higher-order type th <fixed-case>PASS</fixed-case>: A <fixed-case>D</fixed-case>utch data-to-text system for soccer, targeted towards specific audiences - Chris van der Lee - Emiel Krahmer - Sander Wubben + Chrisvan der Lee + EmielKrahmer + SanderWubben 95–104 W17-3513 10.18653/v1/W17-3513 @@ -5716,9 +5716,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Evaluation of a <fixed-case>R</fixed-case>unyankore grammar engine for healthcare messages - Joan Byamugisha - C. Maria Keet - Brian DeRenzi + JoanByamugisha + C. 
MariaKeet + BrianDeRenzi 105–113 W17-3514 10.18653/v1/W17-3514 @@ -5726,7 +5726,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Talking about the world with a distributed model - Gemma Boleda + GemmaBoleda 114 W17-3515 10.18653/v1/W17-3515 @@ -5734,9 +5734,9 @@ is able to handle phenomena related to scope by means of an higher-order type th The <fixed-case>C</fixed-case>ode2<fixed-case>T</fixed-case>ext Challenge: Text Generation in Source Libraries - Kyle Richardson - Sina Zarrieß - Jonas Kuhn + KyleRichardson + SinaZarrieß + JonasKuhn 115–119 W17-3516 10.18653/v1/W17-3516 @@ -5744,10 +5744,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Shared Task Proposal: Multilingual Surface Realization Using Universal Dependency Trees - Simon Mille - Bernd Bohnet - Leo Wanner - Anja Belz + SimonMille + BerndBohnet + LeoWanner + AnjaBelz 120–123 W17-3517 10.18653/v1/W17-3517 @@ -5755,10 +5755,10 @@ is able to handle phenomena related to scope by means of an higher-order type th The <fixed-case>W</fixed-case>eb<fixed-case>NLG</fixed-case> Challenge: Generating Text from <fixed-case>RDF</fixed-case> Data - Claire Gardent - Anastasia Shimorina - Shashi Narayan - Laura Perez-Beltrachini + ClaireGardent + AnastasiaShimorina + ShashiNarayan + LauraPerez-Beltrachini 124–133 W17-3518 10.18653/v1/W17-3518 @@ -5766,7 +5766,7 @@ is able to handle phenomena related to scope by means of an higher-order type th A Commercial Perspective on Reference - Ehud Reiter + EhudReiter 134–138 W17-3519 10.18653/v1/W17-3519 @@ -5774,8 +5774,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Integrated sentence generation using charts - Alexander Koller - Nikos Engonopoulos + AlexanderKoller + NikosEngonopoulos 139–143 W17-3520 10.18653/v1/W17-3520 @@ -5783,9 +5783,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Adapting <fixed-case>S</fixed-case>imple<fixed-case>NLG</fixed-case> to <fixed-case>S</fixed-case>panish - Alejandro Ramos-Soto - Julio Janeiro-Gallardo - Alberto Bugarín Diz + AlejandroRamos-Soto + JulioJaneiro-Gallardo + AlbertoBugarín Diz 144–148 W17-3521 10.18653/v1/W17-3521 @@ -5793,9 +5793,9 @@ is able to handle phenomena related to scope by means of an higher-order type th G-<fixed-case>TUNA</fixed-case>: a corpus of referring expressions in <fixed-case>G</fixed-case>erman, including duration information - David Howcroft - Jorrig Vogels - Vera Demberg + DavidHowcroft + JorrigVogels + VeraDemberg 149–153 W17-3522 10.18653/v1/W17-3522 @@ -5803,9 +5803,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Toward an <fixed-case>NLG</fixed-case> System for <fixed-case>B</fixed-case>antu languages: first steps with <fixed-case>R</fixed-case>unyankore (demo) - Joan Byamugisha - C. Maria Keet - Brian DeRenzi + JoanByamugisha + C. 
MariaKeet + BrianDeRenzi 154–155 W17-3523 10.18653/v1/W17-3523 @@ -5813,8 +5813,8 @@ is able to handle phenomena related to scope by means of an higher-order type th A working, non-trivial, topically indifferent <fixed-case>NLG</fixed-case> System for 17 languages - Robert Weißgraeber - Andreas Madsack + RobertWeißgraeber + AndreasMadsack 156–157 W17-3524 10.18653/v1/W17-3524 @@ -5822,9 +5822,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Generating titles for millions of browse pages on an e-Commerce site - Prashant Mathur - Nicola Ueffing - Gregor Leusch + PrashantMathur + NicolaUeffing + GregorLeusch 158–167 W17-3525 10.18653/v1/W17-3525 @@ -5832,8 +5832,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Towards Automatic Generation of Product Reviews from Aspect-Sentiment Scores - Hongyu Zang - Xiaojun Wan + HongyuZang + XiaojunWan 168–177 W17-3526 10.18653/v1/W17-3526 @@ -5841,8 +5841,8 @@ is able to handle phenomena related to scope by means of an higher-order type th A model of suspense for narrative generation - Richard Doust - Paul Piwek + RichardDoust + PaulPiwek 178–187 W17-3527 10.18653/v1/W17-3527 @@ -5850,10 +5850,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Data-Driven News Generation for Automated Journalism - Leo Leppänen - Myriam Munezero - Mark Granroth-Wilding - Hannu Toivonen + LeoLeppänen + MyriamMunezero + MarkGranroth-Wilding + HannuToivonen 188–197 W17-3528 10.18653/v1/W17-3528 @@ -5861,9 +5861,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Data Augmentation for Visual Question Answering - Kushal Kafle - Mohammed Yousefhussien - Christopher Kanan + KushalKafle + MohammedYousefhussien + ChristopherKanan 198–202 W17-3529 10.18653/v1/W17-3529 @@ -5871,7 +5871,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Personalized Questions, Answers and Grammars: Aiding the Search for Relevant Web Information - Marta Gatius + MartaGatius 203–207 W17-3530 10.18653/v1/W17-3530 @@ -5879,11 +5879,11 @@ is able to handle phenomena related to scope by means of an higher-order type th A Comparison of Neural Models for Word Ordering - Eva Hasler - Felix Stahlberg - Marcus Tomalin - Adrià de Gispert - Bill Byrne + EvaHasler + FelixStahlberg + MarcusTomalin + Adriàde Gispert + BillByrne 208–212 W17-3531 10.18653/v1/W17-3531 @@ -5891,12 +5891,12 @@ is able to handle phenomena related to scope by means of an higher-order type th Investigating the content and form of referring expressions in <fixed-case>M</fixed-case>andarin: introducing the Mtuna corpus - Kees van Deemter - Le Sun - Rint Sybesma - Xiao Li - Bo Chen - Muyun Yang + Keesvan Deemter + LeSun + RintSybesma + XiaoLi + BoChen + MuyunYang 213–217 W17-3532 10.18653/v1/W17-3532 @@ -5904,8 +5904,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Realization of long sentences using chunking - Ewa Muszyńska - Ann Copestake + EwaMuszyńska + AnnCopestake 218–222 W17-3533 10.18653/v1/W17-3533 @@ -5913,10 +5913,10 @@ is able to handle phenomena related to scope by means of an higher-order type th <fixed-case>S</fixed-case>a<fixed-case>T</fixed-case>o<fixed-case>S</fixed-case>: Assessing and Summarising Terms of Services from <fixed-case>G</fixed-case>erman Webshops - Daniel Braun - Elena Scepankova - Patrick Holl - Florian Matthes + DanielBraun + ElenaScepankova + PatrickHoll + FlorianMatthes 223–227 W17-3534 
10.18653/v1/W17-3534 @@ -5924,9 +5924,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Textually Summarising Incomplete Data - Stephanie Inglis - Ehud Reiter - Somayajulu Sripada + StephanieInglis + EhudReiter + SomayajuluSripada 228–232 W17-3535 10.18653/v1/W17-3535 @@ -5934,8 +5934,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Improving the generation of personalised descriptions - Thiago Castro Ferreira - Ivandré Paraboni + ThiagoCastro Ferreira + IvandréParaboni 233–237 W17-3536 10.18653/v1/W17-3536 @@ -5943,8 +5943,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Analysing Data-To-Text Generation Benchmarks - Laura Perez-Beltrachini - Claire Gardent + LauraPerez-Beltrachini + ClaireGardent 238–242 W17-3537 10.18653/v1/W17-3537 @@ -5952,9 +5952,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Linguistic Description of Complex Phenomena with the r<fixed-case>LDCP</fixed-case> R Package - Jose Alonso - Patricia Conde-Clemente - Gracian Trivino + JoseAlonso + PatriciaConde-Clemente + GracianTrivino 243–244 W17-3538 10.18653/v1/W17-3538 @@ -5962,8 +5962,8 @@ is able to handle phenomena related to scope by means of an higher-order type th A demo of <fixed-case>FORG</fixed-case>e: the <fixed-case>P</fixed-case>ompeu <fixed-case>F</fixed-case>abra Open Rule-based Generator - Simon Mille - Leo Wanner + SimonMille + LeoWanner 245–246 W17-3539 10.18653/v1/W17-3539 @@ -5971,9 +5971,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Referential Success of Set Referring Expressions with Fuzzy Properties - Nicolás Marín - Gustavo Rivas-Gervilla - Daniel Sánchez + NicolásMarín + GustavoRivas-Gervilla + DanielSánchez 247–251 W17-3540 10.18653/v1/W17-3540 @@ -5981,10 +5981,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Neural Response Generation for Customer Service based on Personality Traits - Jonathan Herzig - Michal Shmueli-Scheuer - Tommy Sandbank - David Konopnicki + JonathanHerzig + MichalShmueli-Scheuer + TommySandbank + DavidKonopnicki 252–256 W17-3541 10.18653/v1/W17-3541 @@ -5992,8 +5992,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Neural Paraphrase Generation using Transfer Learning - Florin Brad - Traian Rebedea + FlorinBrad + TraianRebedea 257–261 W17-3542 10.18653/v1/W17-3542 @@ -6022,91 +6022,91 @@ is able to handle phenomena related to scope by means of an higher-order type th Deliberation as Genre: Mapping Argumentation through Relational Discourse Structure - Oier Imaz - Mikel Iruskieta + OierImaz + MikelIruskieta 1–10 W17-3601 10.18653/v1/W17-3601 The Good, the Bad, and the Disagreement: Complex ground truth in rhetorical structure analysis - Debopam Das - Manfred Stede - Maite Taboada + DebopamDas + ManfredStede + MaiteTaboada 11–19 W17-3602 10.18653/v1/W17-3602 A Distributional View of Discourse Encapsulation: Multifactorial Prediction of Coreference Density in <fixed-case>RST</fixed-case> - Amir Zeldes + AmirZeldes 20–28 W17-3603 10.18653/v1/W17-3603 Rhetorical relations markers in <fixed-case>R</fixed-case>ussian <fixed-case>RST</fixed-case> Treebank - Svetlana Toldova - Dina Pisarevskaya - Margarita Ananyeva - Maria Kobozeva - Alexander Nasedkin - Sofia Nikiforova - Irina Pavlova - Alexey Shelepov + SvetlanaToldova + DinaPisarevskaya + MargaritaAnanyeva + MariaKobozeva + AlexanderNasedkin + SofiaNikiforova + 
IrinaPavlova + AlexeyShelepov 29–33 W17-3604 10.18653/v1/W17-3604 Applying the Rhetorical Structure Theory in <fixed-case>A</fixed-case>lzheimer patients’ speech - Anayeli Paulino - Gerardo Sierra + AnayeliPaulino + GerardoSierra 34–38 W17-3605 10.18653/v1/W17-3605 Using lexical level information in discourse structures for Basque sentiment analysis - Jon Alkorta - Koldo Gojenola - Mikel Iruskieta - Maite Taboada + JonAlkorta + KoldoGojenola + MikelIruskieta + MaiteTaboada 39–47 W17-3606 10.18653/v1/W17-3606 Framework for the Analysis of Simplified Texts Taking Discourse into Account: the Basque Causal Relations as Case Study - Itziar Gonzalez-Dios - Arantza Diaz de Ilarraza - Mikel Iruskieta + ItziarGonzalez-Dios + ArantzaDiaz de Ilarraza + MikelIruskieta 48–57 W17-3607 10.18653/v1/W17-3607 Using Rhetorical Structure Theory for Detection of Fake Online Reviews - Olu Popoola + OluPopoola 58–63 W17-3608 10.18653/v1/W17-3608 “Haters gonna hate”: challenges for sentiment analysis of <fixed-case>F</fixed-case>acebook comments in <fixed-case>B</fixed-case>razilian <fixed-case>P</fixed-case>ortuguese - Juliano D. Antonio - Ana Carolina L. Santin + Juliano D.Antonio + Ana Carolina L.Santin 64–72 W17-3609 10.18653/v1/W17-3609 Discourse Segmentation for Building a <fixed-case>RST</fixed-case> <fixed-case>C</fixed-case>hinese Treebank - Shuyuan Cao - Nianwen Xue - Iria da Cunha - Mikel Iruskieta - Chuan Wang + ShuyuanCao + NianwenXue + Iriada Cunha + MikelIruskieta + ChuanWang 73–81 W17-3610 10.18653/v1/W17-3610 @@ -6129,30 +6129,30 @@ is able to handle phenomena related to scope by means of an higher-order type th Two Challenges for <fixed-case>CI</fixed-case> Trustworthiness and How to Address Them - Kevin Baum - Maximilian A. Köhl - Eva Schmidt + KevinBaum + Maximilian A.Köhl + EvaSchmidt W17-3701 10.18653/v1/W17-3701 A Simple Method for Clarifying Sentences with Coordination Ambiguities - Michael White - Manjuan Duan - David L. King + MichaelWhite + ManjuanDuan + David L.King W17-3702 10.18653/v1/W17-3702 Requirements for Conceptual Representations of Explanations and How Reasoning Systems Can Serve Them - Helmut Horacek + HelmutHoracek W17-3703 10.18653/v1/W17-3703 An Essay on Self-explanatory Computational Intelligence: A Linguistic Model of Data Processing Systems - Jose M. Alonso - Gracian Trivino + Jose M.Alonso + GracianTrivino W17-3704 10.18653/v1/W17-3704 @@ -6175,66 +6175,66 @@ is able to handle phenomena related to scope by means of an higher-order type th From <fixed-case>FOAF</fixed-case> to <fixed-case>E</fixed-case>nglish: Linguistic Contribution to Web Semantics - Max Silberztein + MaxSilberztein 1–9 W17-3801 10.18653/v1/W17-3801 Lexicon for Natural Language Generation in <fixed-case>S</fixed-case>panish Adapted to Alternative and Augmentative Communication - Silvia García-Méndez - Milagros Fernández-Gavilanes - Enrique Costa-Montenegro - Jonathan Juncal-Martínez - Francisco J. 
González-Castaño + SilviaGarcía-Méndez + MilagrosFernández-Gavilanes + EnriqueCosta-Montenegro + JonathanJuncal-Martínez + Francisco J.González-Castaño 11-15 W17-3802 10.18653/v1/W17-3802 Generating Answering Patterns from Factoid <fixed-case>A</fixed-case>rabic Questions - Essia Bessaies - Slim Mesfar - Henda Ben Ghezala + EssiaBessaies + SlimMesfar + HendaBen Ghezala 17–24 W17-3803 10.18653/v1/W17-3803 Language Generation from <fixed-case>DB</fixed-case> Query - Kristina Kocijan - Božo Bekavac - Krešimir Šojat + KristinaKocijan + BožoBekavac + KrešimirŠojat 25–32 W17-3804 10.18653/v1/W17-3804 Using Electronic Dictionaries and <fixed-case>N</fixed-case>oo<fixed-case>J</fixed-case> to Generate Sentences Containing <fixed-case>E</fixed-case>nglish Phrasal Verbs - Peter A. Machonis + Peter A.Machonis 33–37 W17-3805 10.18653/v1/W17-3805 Generating Text with Correct Verb Conjugation: Proposal for a New Automatic Conjugator with <fixed-case>N</fixed-case>oo<fixed-case>J</fixed-case> - Héla Fehri - Sondes Dardour + HélaFehri + SondesDardour 39–42 W17-3806 10.18653/v1/W17-3806 Formalization of Speech Verbs with <fixed-case>N</fixed-case>oo<fixed-case>J</fixed-case> for Machine Translation: the <fixed-case>F</fixed-case>rench Verb accuser - Jouda Ghorbel + JoudaGhorbel 43–47 W17-3807 10.18653/v1/W17-3807 Using Serious Games to Correct <fixed-case>F</fixed-case>rench Dictations: Proposal for a New <fixed-case>U</fixed-case>nity3<fixed-case>D</fixed-case>/<fixed-case>N</fixed-case>oo<fixed-case>J</fixed-case> Connector - Ikram Bououd - Rania Fafi + IkramBououd + RaniaFafi 49–52 W17-3808 10.18653/v1/W17-3808 @@ -6244,10 +6244,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Proceedings of the Workshop on Computational Creativity in Natural Language Generation (CC-NLG 2017) W17-39 - Hugo Gonçalo Oliveira - Ben Burtenshaw - Mike Kestemont - Tom De Smedt + HugoGonçalo Oliveira + BenBurtenshaw + MikeKestemont + TomDe Smedt Association for Computational Linguistics
Santiago de Compostela, Spain
September @@ -6259,43 +6259,43 @@ is able to handle phenomena related to scope by means of an higher-order type th Poet’s Little Helper: A methodology for computer-based poetry generation. A case study for the Basque language - Aitzol Astigarraga - José María Martínez-Otzeta - Igor Rodriguez - Basilio Sierra - Elena Lazkano + AitzolAstigarraga + JoséMaría Martínez-Otzeta + IgorRodriguez + BasilioSierra + ElenaLazkano 2–10 W17-3901 10.18653/v1/W17-3901 O Poeta Artificial 2.0: Increasing Meaningfulness in a Poetry Generation Twitter bot - Hugo Gonçalo Oliveira + HugoGonçalo Oliveira 11–20 W17-3902 10.18653/v1/W17-3902 Template-Free Construction of Rhyming Poems with Thematic Cohesion - Pablo Gervás + PabloGervás 21–28 W17-3903 10.18653/v1/W17-3903 Synthetic Literature: Writing Science Fiction in a Co-Creative Process - Enrique Manjavacas - Folgert Karsdorp - Ben Burtenshaw - Mike Kestemont + EnriqueManjavacas + FolgertKarsdorp + BenBurtenshaw + MikeKestemont 29–37 W17-3904 10.18653/v1/W17-3904 Constructing narrative using a generative model and continuous action policies - Emmanouil Theofanis Chourdakis - Joshua Reiss + EmmanouilTheofanis Chourdakis + JoshuaReiss 38–43 W17-3905 10.18653/v1/W17-3905 @@ -6418,8 +6418,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Character and Subword-Based Word Representation for Neural Language Modeling Prediction - Matthieu Labeau - Alexandre Allauzen + MatthieuLabeau + AlexandreAllauzen 1–13 W17-4101 10.18653/v1/W17-4101 @@ -6427,8 +6427,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Learning variable length units for <fixed-case>SMT</fixed-case> between related languages via Byte Pair Encoding - Anoop Kunchukuttan - Pushpak Bhattacharyya + AnoopKunchukuttan + PushpakBhattacharyya 14–24 W17-4102 10.18653/v1/W17-4102 @@ -6436,8 +6436,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Character Based Pattern Mining for Neology Detection - Gaël Lejeune - Emmanuel Cartier + GaëlLejeune + EmmanuelCartier 25–30 W17-4103 10.18653/v1/W17-4103 @@ -6445,10 +6445,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Automated Word Stress Detection in <fixed-case>R</fixed-case>ussian - Maria Ponomareva - Kirill Milintsevich - Ekaterina Chernyak - Anatoly Starostin + MariaPonomareva + KirillMilintsevich + EkaterinaChernyak + AnatolyStarostin 31–35 W17-4104 10.18653/v1/W17-4104 @@ -6456,10 +6456,10 @@ is able to handle phenomena related to scope by means of an higher-order type th A Syllable-based Technique for Word Embeddings of <fixed-case>K</fixed-case>orean Words - Sanghyuk Choi - Taeuk Kim - Jinseok Seol - Sang-goo Lee + SanghyukChoi + TaeukKim + JinseokSeol + Sang-gooLee 36–40 W17-4105 10.18653/v1/W17-4105 @@ -6467,8 +6467,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Supersense Tagging with a Combination of Character, Subword, and Word-level Representations - Youhyun Shin - Sang-goo Lee + YouhyunShin + Sang-gooLee 41–45 W17-4106 10.18653/v1/W17-4106 @@ -6476,8 +6476,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Weakly supervised learning of allomorphy - Miikka Silfverberg - Mans Hulden + MiikkaSilfverberg + MansHulden 46–56 W17-4107 10.18653/v1/W17-4107 @@ -6485,8 +6485,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Character-based recurrent neural networks for morphological relational reasoning - Olof 
Mogren - Richard Johansson + OlofMogren + RichardJohansson 57–63 W17-4108 10.18653/v1/W17-4108 @@ -6508,8 +6508,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Glyph-aware Embedding of <fixed-case>C</fixed-case>hinese Characters - Falcon Dai - Zheng Cai + FalconDai + ZhengCai 64–69 W17-4109 10.18653/v1/W17-4109 @@ -6517,8 +6517,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Exploring Cross-Lingual Transfer of Morphological Knowledge In Sequence-to-Sequence Models - Huiming Jin - Katharina Kann + HuimingJin + KatharinaKann 70–75 W17-4110 10.18653/v1/W17-4110 @@ -6526,8 +6526,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Unlabeled Data for Morphological Generation With Character-Based Sequence-to-Sequence Models - Katharina Kann - Hinrich Schütze + KatharinaKann + HinrichSchütze 76–81 W17-4111 10.18653/v1/W17-4111 @@ -6535,8 +6535,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Vowel and Consonant Classification through Spectral Decomposition - Patricia Thaine - Gerald Penn + PatriciaThaine + GeraldPenn 82–91 W17-4112 10.18653/v1/W17-4112 @@ -6545,10 +6545,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Syllable-level Neural Language Model for Agglutinative Language - Seunghak Yu - Nilesh Kulkarni - Haejun Lee - Jihie Kim + SeunghakYu + NileshKulkarni + HaejunLee + JihieKim 92–96 W17-4113 10.18653/v1/W17-4113 @@ -6557,10 +6557,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Character-based Bidirectional <fixed-case>LSTM</fixed-case>-<fixed-case>CRF</fixed-case> with words and characters for <fixed-case>J</fixed-case>apanese Named Entity Recognition - Shotaro Misawa - Motoki Taniguchi - Yasuhide Miura - Tomoko Ohkuma + ShotaroMisawa + MotokiTaniguchi + YasuhideMiura + TomokoOhkuma 97–102 W17-4114 10.18653/v1/W17-4114 @@ -6568,10 +6568,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Word Representation Models for Morphologically Rich Languages in Neural Machine Translation - Ekaterina Vylomova - Trevor Cohn - Xuanli He - Gholamreza Haffari + EkaterinaVylomova + TrevorCohn + XuanliHe + GholamrezaHaffari 103–108 W17-4115 10.18653/v1/W17-4115 @@ -6579,8 +6579,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Spell-Checking based on Syllabification and Character-level Graphs for a <fixed-case>P</fixed-case>eruvian Agglutinative Language - Carlo Alva - Arturo Oncevay + CarloAlva + ArturoOncevay 109–116 W17-4116 10.18653/v1/W17-4116 @@ -6588,9 +6588,9 @@ is able to handle phenomena related to scope by means of an higher-order type th What do we need to know about an unknown word when parsing <fixed-case>G</fixed-case>erman - Bich-Ngoc Do - Ines Rehbein - Anette Frank + Bich-NgocDo + InesRehbein + AnetteFrank 117–123 W17-4117 10.18653/v1/W17-4117 @@ -6598,9 +6598,9 @@ is able to handle phenomena related to scope by means of an higher-order type th A General-Purpose Tagger with Convolutional Neural Networks - Xiang Yu - Agnieszka Falenska - Ngoc Thang Vu + XiangYu + AgnieszkaFalenska + Ngoc ThangVu 124–129 W17-4118 10.18653/v1/W17-4118 @@ -6608,7 +6608,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Reconstruction of Word Embeddings from Sub-Word Parameters - Karl Stratos + KarlStratos 130–135 W17-4119 10.18653/v1/W17-4119 @@ -6616,9 +6616,9 @@ is able to handle 
phenomena related to scope by means of an higher-order type th Inflection Generation for <fixed-case>S</fixed-case>panish Verbs using Supervised Learning - Cristina Barros - Dimitra Gkatzia - Elena Lloret + CristinaBarros + DimitraGkatzia + ElenaLloret 136–141 W17-4120 10.18653/v1/W17-4120 @@ -6626,11 +6626,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Neural Paraphrase Identification of Questions with Noisy Pretraining - Gaurav Singh Tomar - Thyago Duque - Oscar Täckström - Jakob Uszkoreit - Dipanjan Das + Gaurav SinghTomar + ThyagoDuque + OscarTäckström + JakobUszkoreit + DipanjanDas 142–147 W17-4121 10.18653/v1/W17-4121 @@ -6638,9 +6638,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Sub-character Neural Language Modelling in <fixed-case>J</fixed-case>apanese - Viet Nguyen - Julian Brooke - Timothy Baldwin + VietNguyen + JulianBrooke + TimothyBaldwin 148–153 W17-4122 10.18653/v1/W17-4122 @@ -6648,9 +6648,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Byte-based Neural Machine Translation - Marta R. Costa-jussà - Carlos Escolano - José A. R. Fonollosa + Marta R.Costa-jussà + CarlosEscolano + José A. R.Fonollosa 154–158 W17-4123 10.18653/v1/W17-4123 @@ -6658,8 +6658,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Improving Opinion-Target Extraction with Character-Level Word Embeddings - Soufian Jebbara - Philipp Cimiano + SoufianJebbara + PhilippCimiano 159–167 W17-4124 10.18653/v1/W17-4124 @@ -6670,8 +6670,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Proceedings of the 2017 EMNLP Workshop: Natural Language Processing meets Journalism W17-42 - Octavian Popescu - Carlo Strapparava + OctavianPopescu + CarloStrapparava 10.18653/v1/W17-42 Association for Computational Linguistics
Copenhagen, Denmark
@@ -6683,12 +6683,12 @@ is able to handle phenomena related to scope by means of an higher-order type th Predicting News Values from Headline Text and Emotions - Maria Pia di Buono - Jan Šnajder - Bojana Dalbelo Bašić - Goran Glavaš - Martin Tutek - Natasa Milic-Frayling + Maria Piadi Buono + JanŠnajder + BojanaDalbelo Bašić + GoranGlavaš + MartinTutek + NatasaMilic-Frayling 1–6 W17-4201 10.18653/v1/W17-4201 @@ -6696,8 +6696,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Predicting User Views in Online News - Daniel Hardt - Owen Rambow + DanielHardt + OwenRambow 7–12 W17-4202 10.18653/v1/W17-4202 @@ -6705,11 +6705,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Tracking Bias in News Sources Using Social Media: the Russia-<fixed-case>U</fixed-case>kraine Maidan Crisis of 2013–2014 - Peter Potash - Alexey Romanov - Mikhail Gronas - Anna Rumshisky - Mikhail Gronas + PeterPotash + AlexeyRomanov + MikhailGronas + AnnaRumshisky + MikhailGronas 13–18 W17-4203 10.18653/v1/W17-4203 @@ -6717,10 +6717,10 @@ is able to handle phenomena related to scope by means of an higher-order type th What to Write? A topic recommender for journalists - Alessandro Cucchiarelli - Christian Morbidoni - Giovanni Stilo - Paola Velardi + AlessandroCucchiarelli + ChristianMorbidoni + GiovanniStilo + PaolaVelardi 19–24 W17-4204 10.18653/v1/W17-4204 @@ -6728,11 +6728,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Comparing Attitudes to Climate Change in the Media using sentiment analysis based on Latent <fixed-case>D</fixed-case>irichlet Allocation - Ye Jiang - Xingyi Song - Jackie Harrison - Shaun Quegan - Diana Maynard + YeJiang + XingyiSong + JackieHarrison + ShaunQuegan + DianaMaynard 25–30 W17-4205 10.18653/v1/W17-4205 @@ -6740,12 +6740,12 @@ is able to handle phenomena related to scope by means of an higher-order type th Language-based Construction of Explorable News Graphs for Journalists - Rémi Bois - Guillaume Gravier - Eric Jamet - Emmanuel Morin - Pascale Sébillot - Maxime Robert + RémiBois + GuillaumeGravier + EricJamet + EmmanuelMorin + PascaleSébillot + MaximeRobert 31–36 W17-4206 10.18653/v1/W17-4206 @@ -6753,13 +6753,13 @@ is able to handle phenomena related to scope by means of an higher-order type th <fixed-case>S</fixed-case>toryteller: Visual Analytics of Perspectives on Rich Text Interpretations - Maarten van Meersbergen - Piek Vossen - Janneke van der Zwaan - Antske Fokkens - Willem van Hage - Inger Leemans - Isa Maks + Maartenvan Meersbergen + PiekVossen + Jannekevan der Zwaan + AntskeFokkens + Willemvan Hage + IngerLeemans + IsaMaks 37–45 W17-4207 10.18653/v1/W17-4207 @@ -6767,10 +6767,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Analyzing the Revision Logs of a <fixed-case>J</fixed-case>apanese Newspaper for Article Quality Assessment - Hideaki Tamori - Yuta Hitomi - Naoaki Okazaki - Kentaro Inui + HideakiTamori + YutaHitomi + NaoakiOkazaki + KentaroInui 46–50 W17-4208 10.18653/v1/W17-4208 @@ -6778,10 +6778,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Improved Abusive Comment Moderation with User Embeddings - John Pavlopoulos - Prodromos Malakasiotis - Juli Bakagianni - Ion Androutsopoulos + JohnPavlopoulos + ProdromosMalakasiotis + JuliBakagianni + IonAndroutsopoulos 51–55 W17-4209 10.18653/v1/W17-4209 @@ -6789,10 +6789,10 @@ is able to handle phenomena related to scope by means of an higher-order 
type th Incongruent Headlines: Yet Another Way to Mislead Your Readers - Sophie Chesney - Maria Liakata - Massimo Poesio - Matthew Purver + SophieChesney + MariaLiakata + MassimoPoesio + MatthewPurver 56–61 W17-4210 10.18653/v1/W17-4210 @@ -6800,9 +6800,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Unsupervised Event Clustering and Aggregation from Newswire and Web Articles - Swen Ribeiro - Olivier Ferret - Xavier Tannier + SwenRibeiro + OlivierFerret + XavierTannier 62–67 W17-4211 10.18653/v1/W17-4211 @@ -6810,11 +6810,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Semantic Storytelling, Cross-lingual Event Detection and other Semantic Services for a Newsroom Content Curation Dashboard - Julian Moreno-Schneider - Ankit Srivastava - Peter Bourgonje - David Wabnitz - Georg Rehm + JulianMoreno-Schneider + AnkitSrivastava + PeterBourgonje + DavidWabnitz + GeorgRehm 68–73 W17-4212 10.18653/v1/W17-4212 @@ -6822,7 +6822,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Deception Detection in News Reports in the <fixed-case>R</fixed-case>ussian Language: Lexics and Discourse - Dina Pisarevskaya + DinaPisarevskaya 74–79 W17-4213 10.18653/v1/W17-4213 @@ -6830,12 +6830,12 @@ is able to handle phenomena related to scope by means of an higher-order type th Fake news stance detection using stacked ensemble of classifiers - James Thorne - Mingjie Chen - Giorgos Myrianthous - Jiashu Pu - Xiaoxuan Wang - Andreas Vlachos + JamesThorne + MingjieChen + GiorgosMyrianthous + JiashuPu + XiaoxuanWang + AndreasVlachos 80–83 W17-4214 10.18653/v1/W17-4214 @@ -6843,9 +6843,9 @@ is able to handle phenomena related to scope by means of an higher-order type th From Clickbait to Fake News Detection: An Approach based on Detecting the Stance of Headlines to Articles - Peter Bourgonje - Julian Moreno Schneider - Georg Rehm + PeterBourgonje + JulianMoreno Schneider + GeorgRehm 84–89 W17-4215 10.18653/v1/W17-4215 @@ -6853,8 +6853,8 @@ is able to handle phenomena related to scope by means of an higher-order type th ‘Fighting’ or ‘Conflict’? 
An Approach to Revealing Concepts of Terms in Political Discourse - Linyuan Tang - Kyo Kageura + LinyuanTang + KyoKageura 90–94 W17-4216 10.18653/v1/W17-4216 @@ -6862,9 +6862,9 @@ is able to handle phenomena related to scope by means of an higher-order type th A News Chain Evaluation Methodology along with a Lattice-based Approach for News Chain Construction - Mustafa Toprak - Özer Özkahraman - Selma Tekir + MustafaToprak + ÖzerÖzkahraman + SelmaTekir 95–99 W17-4217 10.18653/v1/W17-4217 @@ -6872,8 +6872,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Using New York Times Picks to Identify Constructive Comments - Varada Kolhatkar - Maite Taboada + VaradaKolhatkar + MaiteTaboada 100–105 W17-4218 10.18653/v1/W17-4218 @@ -6881,9 +6881,9 @@ is able to handle phenomena related to scope by means of an higher-order type th An <fixed-case>NLP</fixed-case> Analysis of Exaggerated Claims in Science News - Yingya Li - Jieke Zhang - Bei Yu + YingyaLi + JiekeZhang + BeiYu 106–111 W17-4219 10.18653/v1/W17-4219 @@ -6909,8 +6909,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Dependency Parsing with Dilated Iterated Graph <fixed-case>CNN</fixed-case>s - Emma Strubell - Andrew McCallum + EmmaStrubell + AndrewMcCallum 1–6 W17-4301 10.18653/v1/W17-4301 @@ -6918,7 +6918,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Entity Identification as Multitasking - Karl Stratos + KarlStratos 7–11 W17-4302 10.18653/v1/W17-4302 @@ -6926,8 +6926,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Towards Neural Machine Translation with Latent Tree Attention - James Bradbury - Richard Socher + JamesBradbury + RichardSocher 12–16 W17-4303 10.18653/v1/W17-4303 @@ -6936,8 +6936,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Structured Prediction via Learning to Search under Bandit Feedback - Amr Sharaf - Hal Daumé III + AmrSharaf + HalDaumé III 17–26 W17-4304 10.18653/v1/W17-4304 @@ -6945,11 +6945,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Syntax Aware <fixed-case>LSTM</fixed-case> model for Semantic Role Labeling - Feng Qian - Lei Sha - Baobao Chang - Lu-chen Liu - Ming Zhang + FengQian + LeiSha + BaobaoChang + Lu-chenLiu + MingZhang 27–32 W17-4305 10.18653/v1/W17-4305 @@ -6957,9 +6957,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Spatial Language Understanding with Multimodal Graphs using Declarative Learning based Programming - Parisa Kordjamshidi - Taher Rahgooy - Umar Manzoor + ParisaKordjamshidi + TaherRahgooy + UmarManzoor 33–43 W17-4306 10.18653/v1/W17-4306 @@ -6967,8 +6967,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Boosting Information Extraction Systems with Character-level Neural Networks and Free Noisy Supervision - Philipp Meerkamp - Zhengyi Zhou + PhilippMeerkamp + ZhengyiZhou 44–51 W17-4307 10.18653/v1/W17-4307 @@ -6976,10 +6976,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Piecewise Latent Variables for Neural Variational Text Processing - Iulian Vlad Serban - Alexander Ororbia II - Joelle Pineau - Aaron Courville + Iulian VladSerban + AlexanderOrorbia II + JoellePineau + AaronCourville 52–62 W17-4308 10.18653/v1/W17-4308 @@ -7005,7 +7005,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Boundary-based 
<fixed-case>MWE</fixed-case> segmentation with text partitioning - Jake Williams + JakeWilliams 1–10 W17-4401 10.18653/v1/W17-4401 @@ -7013,11 +7013,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Towards the Understanding of Gaming Audiences by Modeling Twitch Emotes - Francesco Barbieri - Luis Espinosa-Anke - Miguel Ballesteros - Juan Soler-Company - Horacio Saggion + FrancescoBarbieri + LuisEspinosa-Anke + MiguelBallesteros + JuanSoler-Company + HoracioSaggion 11–20 W17-4402 10.18653/v1/W17-4402 @@ -7025,9 +7025,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Churn Identification in Microblogs using Convolutional Neural Networks with Structured Logical Knowledge - Mourad Gridach - Hatem Haddad - Hala Mulki + MouradGridach + HatemHaddad + HalaMulki 21–30 W17-4403 10.18653/v1/W17-4403 @@ -7036,9 +7036,9 @@ is able to handle phenomena related to scope by means of an higher-order type th To normalize, or not to normalize: The impact of normalization on Part-of-Speech tagging - Rob van der Goot - Barbara Plank - Malvina Nissim + Robvan der Goot + BarbaraPlank + MalvinaNissim 31–39 W17-4404 10.18653/v1/W17-4404 @@ -7046,10 +7046,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Constructing an Alias List for Named Entities during an Event - Anietie Andy - Mark Dredze - Mugizi Rwebangira - Chris Callison-Burch + AnietieAndy + MarkDredze + MugiziRwebangira + ChrisCallison-Burch 40–44 W17-4405 10.18653/v1/W17-4405 @@ -7057,8 +7057,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Incorporating Metadata into Content-Based User Embeddings - Linzi Xing - Michael J. Paul + LinziXing + Michael J.Paul 45–49 W17-4406 10.18653/v1/W17-4406 @@ -7066,9 +7066,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Simple Queries as Distant Labels for Predicting Gender on Twitter - Chris Emmery - Grzegorz Chrupała - Walter Daelemans + ChrisEmmery + GrzegorzChrupała + WalterDaelemans 50–55 W17-4407 10.18653/v1/W17-4407 @@ -7076,9 +7076,9 @@ is able to handle phenomena related to scope by means of an higher-order type th A Dataset and Classifier for Recognizing Social Media <fixed-case>E</fixed-case>nglish - Su Lin Blodgett - Johnny Wei - Brendan O’Connor + Su LinBlodgett + JohnnyWei + BrendanO’Connor 56–61 W17-4408 10.18653/v1/W17-4408 @@ -7086,8 +7086,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Evaluating hypotheses in geolocation on a very large sample of Twitter - Bahar Salehi - Anders Søgaard + BaharSalehi + AndersSøgaard 62–67 W17-4409 10.18653/v1/W17-4409 @@ -7095,9 +7095,9 @@ is able to handle phenomena related to scope by means of an higher-order type th The Effect of Error Rate in Artificially Generated Data for Automatic Preposition and Determiner Correction - Fraser Bowen - Jon Dehdari - Josef van Genabith + FraserBowen + JonDehdari + Josefvan Genabith 68–76 W17-4410 10.18653/v1/W17-4410 @@ -7105,10 +7105,10 @@ is able to handle phenomena related to scope by means of an higher-order type th An Entity Resolution Approach to Isolate Instances of Human Trafficking Online - Chirag Nagpal - Kyle Miller - Benedikt Boecking - Artur Dubrawski + ChiragNagpal + KyleMiller + BenediktBoecking + ArturDubrawski 77–84 W17-4411 10.18653/v1/W17-4411 @@ -7116,8 +7116,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Noisy <fixed-case>U</fixed-case>yghur Text 
Normalization - Osman Tursun - Ruket Cakici + OsmanTursun + RuketCakici 85–93 W17-4412 10.18653/v1/W17-4412 @@ -7125,9 +7125,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Crowdsourcing Multiple Choice Science Questions - Johannes Welbl - Nelson F. Liu - Matt Gardner + JohannesWelbl + Nelson F.Liu + MattGardner 94–106 W17-4413 10.18653/v1/W17-4413 @@ -7135,11 +7135,11 @@ is able to handle phenomena related to scope by means of an higher-order type th A Text Normalisation System for Non-Standard <fixed-case>E</fixed-case>nglish Words - Emma Flint - Elliot Ford - Olivia Thomas - Andrew Caines - Paula Buttery + EmmaFlint + ElliotFord + OliviaThomas + AndrewCaines + PaulaButtery 107–115 W17-4414 10.18653/v1/W17-4414 @@ -7147,10 +7147,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Huntsville, hospitals, and hockey teams: Names can reveal your location - Bahar Salehi - Dirk Hovy - Eduard Hovy - Anders Søgaard + BaharSalehi + DirkHovy + EduardHovy + AndersSøgaard 116–121 W17-4415 10.18653/v1/W17-4415 @@ -7158,9 +7158,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Improving Document Clustering by Removing Unnatural Language - Myungha Jang - Jinho D. Choi - James Allan + MyunghaJang + Jinho D.Choi + JamesAllan 122–130 W17-4416 10.18653/v1/W17-4416 @@ -7168,9 +7168,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Lithium <fixed-case>NLP</fixed-case>: A System for Rich Information Extraction from Noisy User Generated Text on Social Media - Preeti Bhargava - Nemanja Spasojevic - Guoning Hu + PreetiBhargava + NemanjaSpasojevic + GuoningHu 131–139 W17-4417 10.18653/v1/W17-4417 @@ -7178,10 +7178,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Results of the <fixed-case>WNUT</fixed-case>2017 Shared Task on Novel and Emerging Entity Recognition - Leon Derczynski - Eric Nichols - Marieke van Erp - Nut Limsopatham + LeonDerczynski + EricNichols + Mariekevan Erp + NutLimsopatham 140–147 W17-4418 10.18653/v1/W17-4418 @@ -7189,10 +7189,10 @@ is able to handle phenomena related to scope by means of an higher-order type th A Multi-task Approach for Named Entity Recognition in Social Media Data - Gustavo Aguilar - Suraj Maharjan - Adrian Pastor López-Monroy - Thamar Solorio + GustavoAguilar + SurajMaharjan + Adrian PastorLópez-Monroy + ThamarSolorio 148–153 W17-4419 10.18653/v1/W17-4419 @@ -7200,8 +7200,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Distributed Representation, <fixed-case>LDA</fixed-case> Topic Modelling and Deep Learning for Emerging Named Entity Recognition from Social Media - Patrick Jansson - Shuhua Liu + PatrickJansson + ShuhuaLiu 154–159 W17-4420 10.18653/v1/W17-4420 @@ -7209,10 +7209,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Multi-channel <fixed-case>B</fixed-case>i<fixed-case>LSTM</fixed-case>-<fixed-case>CRF</fixed-case> Model for Emerging Named Entity Recognition in Social Media - Bill Y. 
Lin - Frank Xu - Zhiyi Luo - Kenny Zhu + Bill Y.Lin + FrankXu + ZhiyiLuo + KennyZhu 160–165 W17-4421 10.18653/v1/W17-4421 @@ -7220,8 +7220,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Transfer Learning and Sentence Level Features for Named Entity Recognition on Tweets - Pius von Däniken - Mark Cieliebak + Piusvon Däniken + MarkCieliebak 166–171 W17-4422 10.18653/v1/W17-4422 @@ -7229,8 +7229,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Context-Sensitive Recognition for Emerging and Rare Entities - Jake Williams - Giovanni Santia + JakeWilliams + GiovanniSantia 172–176 W17-4423 10.18653/v1/W17-4423 @@ -7238,8 +7238,8 @@ is able to handle phenomena related to scope by means of an higher-order type th A Feature-based Ensemble Approach to Recognition of Emerging and Rare Named Entities - Utpal Kumar Sikdar - Björn Gambäck + Utpal KumarSikdar + BjörnGambäck 177–181 W17-4424 10.18653/v1/W17-4424 @@ -7265,8 +7265,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Video Highlights Detection and Summarization with Lag-Calibration based on Concept-Emotion Mapping of Crowdsourced Time-Sync Comments - Qing Ping - Chaomei Chen + QingPing + ChaomeiChen 1–11 W17-4501 10.18653/v1/W17-4501 @@ -7274,8 +7274,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Multimedia Summary Generation from Online Conversations: Current Approaches and Future Directions - Enamul Hoque - Giuseppe Carenini + EnamulHoque + GiuseppeCarenini 12–19 W17-4502 10.18653/v1/W17-4502 @@ -7283,8 +7283,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Low-Resource Neural Headline Generation - Ottokar Tilk - Tanel Alumäe + OttokarTilk + TanelAlumäe 20–26 W17-4503 10.18653/v1/W17-4503 @@ -7292,9 +7292,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Towards Improving Abstractive Summarization via Entailment Generation - Ramakanth Pasunuru - Han Guo - Mohit Bansal + RamakanthPasunuru + HanGuo + MohitBansal 27–32 W17-4504 10.18653/v1/W17-4504 @@ -7302,8 +7302,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Coarse-to-Fine Attention Models for Document Summarization - Jeffrey Ling - Alexander Rush + JeffreyLing + AlexanderRush 33–42 W17-4505 10.18653/v1/W17-4505 @@ -7311,11 +7311,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Automatic Community Creation for Abstractive Spoken Conversations Summarization - Karan Singla - Evgeny Stepanov - Ali Orkan Bayer - Giuseppe Carenini - Giuseppe Riccardi + KaranSingla + EvgenyStepanov + Ali OrkanBayer + GiuseppeCarenini + GiuseppeRiccardi 43–47 W17-4506 10.18653/v1/W17-4506 @@ -7323,9 +7323,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Combining Graph Degeneracy and Submodularity for Unsupervised Extractive Summarization - Antoine Tixier - Polykarpos Meladianos - Michalis Vazirgiannis + AntoineTixier + PolykarposMeladianos + MichalisVazirgiannis 48–58 W17-4507 10.18653/v1/W17-4507 @@ -7333,10 +7333,10 @@ is able to handle phenomena related to scope by means of an higher-order type th <fixed-case>TL</fixed-case>;<fixed-case>DR</fixed-case>: Mining <fixed-case>R</fixed-case>eddit to Learn Automatic Summarization - Michael Völske - Martin Potthast - Shahbaz Syed - Benno Stein + MichaelVölske + MartinPotthast + ShahbazSyed + BennoStein 59–63 W17-4508 10.18653/v1/W17-4508 
@@ -7344,8 +7344,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Topic Model Stability for Hierarchical Summarization - John Miller - Kathleen McCoy + JohnMiller + KathleenMcCoy 64–73 W17-4509 10.18653/v1/W17-4509 @@ -7354,9 +7354,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Learning to Score System Summaries for Better Content Selection Evaluation. - Maxime Peyrard - Teresa Botschen - Iryna Gurevych + MaximePeyrard + TeresaBotschen + IrynaGurevych 74–84 W17-4510 10.18653/v1/W17-4510 @@ -7364,7 +7364,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Revisiting the Centroid-based Method: A Strong Baseline for Multi-Document Summarization - Demian Gholipour Ghalandari + DemianGholipour Ghalandari 85–90 W17-4511 10.18653/v1/W17-4511 @@ -7372,9 +7372,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Reader-Aware Multi-Document Summarization: An Enhanced Model and The First Dataset - Piji Li - Lidong Bing - Wai Lam + PijiLi + LidongBing + WaiLam 91–99 W17-4512 10.18653/v1/W17-4512 @@ -7382,8 +7382,8 @@ is able to handle phenomena related to scope by means of an higher-order type th A Pilot Study of Domain Adaptation Effect for Neural Abstractive Summarization - Xinyu Hua - Lu Wang + XinyuHua + LuWang 100–106 W17-4513 10.18653/v1/W17-4513 @@ -7407,10 +7407,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Functions of Silences towards Information Flow in Spoken Conversation - Shammur Absar Chowdhury - Evgeny Stepanov - Morena Danieli - Giuseppe Riccardi + Shammur AbsarChowdhury + EvgenyStepanov + MorenaDanieli + GiuseppeRiccardi 1–9 W17-4601 10.18653/v1/W17-4601 @@ -7418,8 +7418,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Encoding Word Confusion Networks with Recurrent Neural Networks for Dialog State Tracking - Glorianna Jagfeld - Ngoc Thang Vu + GloriannaJagfeld + Ngoc ThangVu 10–17 W17-4602 10.18653/v1/W17-4602 @@ -7427,8 +7427,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Analyzing Human and Machine Performance In Resolving Ambiguous Spoken Sentences - Hussein Ghaly - Michael Mandel + HusseinGhaly + MichaelMandel 18–26 W17-4603 10.18653/v1/W17-4603 @@ -7437,9 +7437,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Parsing transcripts of speech - Andrew Caines - Michael McCarthy - Paula Buttery + AndrewCaines + MichaelMcCarthy + PaulaButtery 27–36 W17-4604 10.18653/v1/W17-4604 @@ -7447,8 +7447,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Enriching <fixed-case>ASR</fixed-case> Lattices with <fixed-case>POS</fixed-case> Tags for Dependency Parsing - Moritz Stiefel - Ngoc Thang Vu + MoritzStiefel + Ngoc ThangVu 37–47 W17-4605 10.18653/v1/W17-4605 @@ -7456,10 +7456,10 @@ is able to handle phenomena related to scope by means of an higher-order type th End-to-End Information Extraction without Token-Level Supervision - Rasmus Berg Palm - Dirk Hovy - Florian Laws - Ole Winther + Rasmus BergPalm + DirkHovy + FlorianLaws + OleWinther 48–52 W17-4606 10.18653/v1/W17-4606 @@ -7467,11 +7467,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Spoken Term Discovery for Language Documentation using Translations - Antonios Anastasopoulos - Sameer Bansal - David Chiang - Sharon Goldwater - Adam Lopez + AntoniosAnastasopoulos + 
SameerBansal + DavidChiang + SharonGoldwater + AdamLopez 53–58 W17-4607 10.18653/v1/W17-4607 @@ -7479,9 +7479,9 @@ is able to handle phenomena related to scope by means of an higher-order type th <fixed-case>A</fixed-case>mharic-<fixed-case>E</fixed-case>nglish Speech Translation in Tourism Domain - Michael Melese - Laurent Besacier - Million Meshesha + MichaelMelese + LaurentBesacier + MillionMeshesha 59–66 W17-4608 10.18653/v1/W17-4608 @@ -7489,9 +7489,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Speech- and Text-driven Features for Automated Scoring of <fixed-case>E</fixed-case>nglish Speaking Tasks - Anastassia Loukina - Nitin Madnani - Aoife Cahill + AnastassiaLoukina + NitinMadnani + AoifeCahill 67–77 W17-4609 10.18653/v1/W17-4609 @@ -7499,10 +7499,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Improving coreference resolution with automatically predicted prosodic information - Ina Roesiger - Sabrina Stehwien - Arndt Riester - Ngoc Thang Vu + InaRoesiger + SabrinaStehwien + ArndtRiester + Ngoc ThangVu 78–83 W17-4610 10.18653/v1/W17-4610 @@ -7534,456 +7534,456 @@ is able to handle phenomena related to scope by means of an higher-order type th Sense-Aware Statistical Machine Translation using Adaptive Context-Dependent Clustering - Xiao Pu - Nikolaos Pappas - Andrei Popescu-Belis + XiaoPu + NikolaosPappas + AndreiPopescu-Belis 1–10 W17-4701 10.18653/v1/W17-4701 Improving Word Sense Disambiguation in Neural Machine Translation with Sense Embeddings - Annette Rios Gonzales - Laura Mascarell - Rico Sennrich + AnnetteRios Gonzales + LauraMascarell + RicoSennrich 11–19 W17-4702 10.18653/v1/W17-4702 Word Representations in Factored Neural Machine Translation - Franck Burlot - Mercedes García-Martínez - Loïc Barrault - Fethi Bougares - François Yvon + FranckBurlot + MercedesGarcía-Martínez + LoïcBarrault + FethiBougares + FrançoisYvon 20–31 W17-4703 10.18653/v1/W17-4703 Modeling Target-Side Inflection in Neural Machine Translation - Aleš Tamchyna - Marion Weller-Di Marco - Alexander Fraser + AlešTamchyna + MarionWeller-Di Marco + AlexanderFraser 32–42 W17-4704 10.18653/v1/W17-4704 Evaluating the morphological competence of Machine Translation Systems - Franck Burlot - François Yvon + FranckBurlot + FrançoisYvon 43–55 W17-4705 10.18653/v1/W17-4705 Target-side Word Segmentation Strategies for Neural Machine Translation - Matthias Huck - Simon Riess - Alexander Fraser + MatthiasHuck + SimonRiess + AlexanderFraser 56–67 W17-4706 10.18653/v1/W17-4706 Predicting Target Language <fixed-case>CCG</fixed-case> Supertags Improves Neural Machine Translation - Maria Nădejde - Siva Reddy - Rico Sennrich - Tomasz Dwojak - Marcin Junczys-Dowmunt - Philipp Koehn - Alexandra Birch + MariaNădejde + SivaReddy + RicoSennrich + TomaszDwojak + MarcinJunczys-Dowmunt + PhilippKoehn + AlexandraBirch 68–79 W17-4707 10.18653/v1/W17-4707 Exploiting Linguistic Resources for Neural Machine Translation Using Multi-task Learning - Jan Niehues - Eunah Cho + JanNiehues + EunahCho 80–89 W17-4708 10.18653/v1/W17-4708 Tree as a Pivot: Syntactic Matching Methods in Pivot Translation - Akiva Miura - Graham Neubig - Katsuhito Sudoh - Satoshi Nakamura + AkivaMiura + GrahamNeubig + KatsuhitoSudoh + SatoshiNakamura 90–98 W17-4709 10.18653/v1/W17-4709 Deep architectures for Neural Machine Translation - Antonio Valerio Miceli Barone - Jindřich Helcl - Rico Sennrich - Barry Haddow - Alexandra Birch + Antonio ValerioMiceli Barone + JindřichHelcl + RicoSennrich 
+ BarryHaddow + AlexandraBirch 99–107 W17-4710 10.18653/v1/W17-4710 Biasing Attention-Based Recurrent Neural Networks Using External Alignment Information - Tamer Alkhouli - Hermann Ney + TamerAlkhouli + HermannNey 108–117 W17-4711 10.18653/v1/W17-4711 Effective Domain Mixing for Neural Machine Translation - Denny Britz - Quoc Le - Reid Pryzant + DennyBritz + QuocLe + ReidPryzant 118–126 W17-4712 10.18653/v1/W17-4712 Multi-Domain Neural Machine Translation through Unsupervised Adaptation - M. Amin Farajian - Marco Turchi - Matteo Negri - Marcello Federico + M. AminFarajian + MarcoTurchi + MatteoNegri + MarcelloFederico 127–137 W17-4713 10.18653/v1/W17-4713 Adapting Neural Machine Translation with Parallel Synthetic Data - Mara Chinea-Ríos - Álvaro Peris - Francisco Casacuberta + MaraChinea-Ríos + ÁlvaroPeris + FranciscoCasacuberta 138–147 W17-4714 10.18653/v1/W17-4714 Copied Monolingual Data Improves Low-Resource Neural Machine Translation - Anna Currey - Antonio Valerio Miceli Barone - Kenneth Heafield + AnnaCurrey + Antonio ValerioMiceli Barone + KennethHeafield 148–156 W17-4715 10.18653/v1/W17-4715 Guiding Neural Machine Translation Decoding with External Knowledge - Rajen Chatterjee - Matteo Negri - Marco Turchi - Marcello Federico - Lucia Specia - Frédéric Blain + RajenChatterjee + MatteoNegri + MarcoTurchi + MarcelloFederico + LuciaSpecia + FrédéricBlain 157–168 W17-4716 10.18653/v1/W17-4716 Findings of the 2017 Conference on Machine Translation (<fixed-case>WMT</fixed-case>17) - Ondřej Bojar - Rajen Chatterjee - Christian Federmann - Yvette Graham - Barry Haddow - Shujian Huang - Matthias Huck - Philipp Koehn - Qun Liu - Varvara Logacheva - Christof Monz - Matteo Negri - Matt Post - Raphael Rubino - Lucia Specia - Marco Turchi + OndřejBojar + RajenChatterjee + ChristianFedermann + YvetteGraham + BarryHaddow + ShujianHuang + MatthiasHuck + PhilippKoehn + QunLiu + VarvaraLogacheva + ChristofMonz + MatteoNegri + MattPost + RaphaelRubino + LuciaSpecia + MarcoTurchi 169–214 W17-4717 10.18653/v1/W17-4717 Findings of the Second Shared Task on Multimodal Machine Translation and Multilingual Image Description - Desmond Elliott - Stella Frank - Loïc Barrault - Fethi Bougares - Lucia Specia + DesmondElliott + StellaFrank + LoïcBarrault + FethiBougares + LuciaSpecia 215–233 W17-4718 10.18653/v1/W17-4718 Findings of the <fixed-case>WMT</fixed-case> 2017 Biomedical Translation Shared Task - Antonio Jimeno Yepes - Aurélie Névéol - Mariana Neves - Karin Verspoor - Ondřej Bojar - Arthur Boyer - Cristian Grozea - Barry Haddow - Madeleine Kittner - Yvonne Lichtblau - Pavel Pecina - Roland Roller - Rudolf Rosa - Amy Siu - Philippe Thomas - Saskia Trescher + AntonioJimeno Yepes + AurélieNévéol + MarianaNeves + KarinVerspoor + OndřejBojar + ArthurBoyer + CristianGrozea + BarryHaddow + MadeleineKittner + YvonneLichtblau + PavelPecina + RolandRoller + RudolfRosa + AmySiu + PhilippeThomas + SaskiaTrescher 234–247 W17-4719 10.18653/v1/W17-4719 <fixed-case>CUNI</fixed-case> submission in <fixed-case>WMT</fixed-case>17: Chimera goes neural - Roman Sudarikov - David Mareček - Tom Kocmi - Dušan Variš - Ondřej Bojar + RomanSudarikov + DavidMareček + TomKocmi + DušanVariš + OndřejBojar 248–256 W17-4720 10.18653/v1/W17-4720 LIMSI@WMT’17 - Franck Burlot - Pooyan Safari - Matthieu Labeau - Alexandre Allauzen - François Yvon + FranckBurlot + PooyanSafari + MatthieuLabeau + AlexandreAllauzen + FrançoisYvon 257–264 W17-4721 10.18653/v1/W17-4721 <fixed-case>SYSTRAN</fixed-case> Purely Neural <fixed-case>MT</fixed-case> 
Engines for <fixed-case>WMT</fixed-case>2017 - Yongchao Deng - Jungi Kim - Guillaume Klein - Catherine Kobus - Natalia Segal - Christophe Servan - Bo Wang - Dakun Zhang - Josep Crego - Jean Senellart + YongchaoDeng + JungiKim + GuillaumeKlein + CatherineKobus + NataliaSegal + ChristopheServan + BoWang + DakunZhang + JosepCrego + JeanSenellart 265–270 W17-4722 10.18653/v1/W17-4722 <fixed-case>FBK</fixed-case>’s Participation to the <fixed-case>E</fixed-case>nglish-to-<fixed-case>G</fixed-case>erman News Translation Task of <fixed-case>WMT</fixed-case> 2017 - Mattia Antonino Di Gangi - Nicola Bertoldi - Marcello Federico + Mattia AntoninoDi Gangi + NicolaBertoldi + MarcelloFederico 271–275 W17-4723 10.18653/v1/W17-4723 The <fixed-case>JHU</fixed-case> Machine Translation Systems for <fixed-case>WMT</fixed-case> 2017 - Shuoyang Ding - Huda Khayrallah - Philipp Koehn - Matt Post - Gaurav Kumar - Kevin Duh + ShuoyangDing + HudaKhayrallah + PhilippKoehn + MattPost + GauravKumar + KevinDuh 276–282 W17-4724 10.18653/v1/W17-4724 The <fixed-case>TALP</fixed-case>-<fixed-case>UPC</fixed-case> Neural Machine Translation System for <fixed-case>G</fixed-case>erman/<fixed-case>F</fixed-case>innish-<fixed-case>E</fixed-case>nglish Using the Inverse Direction Model in Rescoring - Carlos Escolano - Marta R. Costa-jussà - José A. R. Fonollosa + CarlosEscolano + Marta R.Costa-jussà + José A. R.Fonollosa 283–287 W17-4725 10.18653/v1/W17-4725 <fixed-case>LIUM</fixed-case> Machine Translation Systems for <fixed-case>WMT</fixed-case>17 News Translation Task - Mercedes García-Martínez - Ozan Caglayan - Walid Aransa - Adrien Bardet - Fethi Bougares - Loïc Barrault + MercedesGarcía-Martínez + OzanCaglayan + WalidAransa + AdrienBardet + FethiBougares + LoïcBarrault 288–295 W17-4726 10.18653/v1/W17-4726 Extending hybrid word-character neural machine translation with multi-task learning of morphological analysis - Stig-Arne Grönroos - Sami Virpioja - Mikko Kurimo + Stig-ArneGrönroos + SamiVirpioja + MikkoKurimo 296–302 W17-4727 10.18653/v1/W17-4727 The <fixed-case>AFRL-MITLL</fixed-case> <fixed-case>WMT17</fixed-case> Systems: Old, New, Borrowed, <fixed-case>BLEU</fixed-case> - Jeremy Gwinnup - Timothy Anderson - Grant Erdmann - Katherine Young - Michaeel Kazi - Elizabeth Salesky - Brian Thompson - Jonathan Taylor + JeremyGwinnup + TimothyAnderson + GrantErdmann + KatherineYoung + MichaeelKazi + ElizabethSalesky + BrianThompson + JonathanTaylor 303–309 W17-4728 10.18653/v1/W17-4728 University of Rochester <fixed-case>WMT</fixed-case> 2017 <fixed-case>NMT</fixed-case> System Submission - Chester Holtz - Chuyang Ke - Daniel Gildea + ChesterHoltz + ChuyangKe + DanielGildea 310–314 W17-4729 10.18653/v1/W17-4729 <fixed-case>LMU</fixed-case> Munich’s Neural Machine Translation Systems for News Articles and Health Information Texts - Matthias Huck - Fabienne Braune - Alexander Fraser + MatthiasHuck + FabienneBraune + AlexanderFraser 315–322 W17-4730 10.18653/v1/W17-4730 Rule-based Machine translation from <fixed-case>E</fixed-case>nglish to <fixed-case>F</fixed-case>innish - Arvi Hurskainen - Jörg Tiedemann + ArviHurskainen + JörgTiedemann 323–329 W17-4731 10.18653/v1/W17-4731 <fixed-case>NRC</fixed-case> Machine Translation System for <fixed-case>WMT</fixed-case> 2017 - Chi-kiu Lo - Boxing Chen - Colin Cherry - George Foster - Samuel Larkin - Darlene Stewart - Roland Kuhn + Chi-kiuLo + BoxingChen + ColinCherry + GeorgeFoster + SamuelLarkin + DarleneStewart + RolandKuhn 330–337 W17-4732 10.18653/v1/W17-4732 The 
<fixed-case>H</fixed-case>elsinki Neural Machine Translation System - Robert Östling - Yves Scherrer - Jörg Tiedemann - Gongbo Tang - Tommi Nieminen + RobertÖstling + YvesScherrer + JörgTiedemann + GongboTang + TommiNieminen 338–347 W17-4733 10.18653/v1/W17-4733 The <fixed-case>QT</fixed-case>21 Combined Machine Translation System for <fixed-case>E</fixed-case>nglish to <fixed-case>L</fixed-case>atvian - Jan-Thorsten Peter - Hermann Ney - Ondřej Bojar - Ngoc-Quan Pham - Jan Niehues - Alex Waibel - Franck Burlot - François Yvon - Mārcis Pinnis - Valters Šics - Joost Bastings - Miguel Rios - Wilker Aziz - Philip Williams - Frédéric Blain - Lucia Specia + Jan-ThorstenPeter + HermannNey + OndřejBojar + Ngoc-QuanPham + JanNiehues + AlexWaibel + FranckBurlot + FrançoisYvon + MārcisPinnis + ValtersŠics + JoostBastings + MiguelRios + WilkerAziz + PhilipWilliams + FrédéricBlain + LuciaSpecia 348–357 W17-4734 10.18653/v1/W17-4734 The <fixed-case>RWTH</fixed-case> Aachen University <fixed-case>E</fixed-case>nglish-<fixed-case>G</fixed-case>erman and <fixed-case>G</fixed-case>erman-<fixed-case>E</fixed-case>nglish Machine Translation System for <fixed-case>WMT</fixed-case> 2017 - Jan-Thorsten Peter - Andreas Guta - Tamer Alkhouli - Parnia Bahar - Jan Rosendahl - Nick Rossenbach - Miguel Graça - Hermann Ney + Jan-ThorstenPeter + AndreasGuta + TamerAlkhouli + ParniaBahar + JanRosendahl + NickRossenbach + MiguelGraça + HermannNey 358–365 W17-4735 10.18653/v1/W17-4735 The Karlsruhe Institute of Technology Systems for the News Translation Task in <fixed-case>WMT</fixed-case> 2017 - Ngoc-Quan Pham - Jan Niehues - Thanh-Le Ha - Eunah Cho - Matthias Sperber - Alexander Waibel + Ngoc-QuanPham + JanNiehues + Thanh-LeHa + EunahCho + MatthiasSperber + AlexanderWaibel 366–373 W17-4736 10.18653/v1/W17-4736 Tilde’s Machine Translation Systems for <fixed-case>WMT</fixed-case> 2017 - Mārcis Pinnis - Rihards Krišlauks - Toms Miks - Daiga Deksne - Valters Šics + MārcisPinnis + RihardsKrišlauks + TomsMiks + DaigaDeksne + ValtersŠics 374–381 W17-4737 10.18653/v1/W17-4737 C-3<fixed-case>MA</fixed-case>: Tartu-<fixed-case>R</fixed-case>iga-<fixed-case>Z</fixed-case>urich Translation Systems for <fixed-case>WMT</fixed-case>17 - Matīss Rikters - Chantal Amrhein - Maksym Del - Mark Fishel + MatīssRikters + ChantalAmrhein + MaksymDel + MarkFishel 382–388 W17-4738 10.18653/v1/W17-4738 The University of <fixed-case>E</fixed-case>dinburgh’s Neural <fixed-case>MT</fixed-case> Systems for <fixed-case>WMT</fixed-case>17 - Rico Sennrich - Alexandra Birch - Anna Currey - Ulrich Germann - Barry Haddow - Kenneth Heafield - Antonio Valerio Miceli Barone - Philip Williams + RicoSennrich + AlexandraBirch + AnnaCurrey + UlrichGermann + BarryHaddow + KennethHeafield + Antonio ValerioMiceli Barone + PhilipWilliams 389–399 W17-4739 10.18653/v1/W17-4739 <fixed-case>XMU</fixed-case> Neural Machine Translation Systems for <fixed-case>WMT</fixed-case> 17 - Zhixing Tan - Boli Wang - Jinming Hu - Yidong Chen - Xiaodong Shi + ZhixingTan + BoliWang + JinmingHu + YidongChen + XiaodongShi 400–404 W17-4740 10.18653/v1/W17-4740 @@ -7991,32 +7991,32 @@ is able to handle phenomena related to scope by means of an higher-order type th The <fixed-case>JAIST</fixed-case> Machine Translation Systems for <fixed-case>WMT</fixed-case> 17 - Hai-Long Trieu - Trung-Tin Pham - Le-Minh Nguyen + Hai-LongTrieu + Trung-TinPham + Le-MinhNguyen 405–409 W17-4741 10.18653/v1/W17-4741 Sogou Neural Machine Translation Systems for <fixed-case>WMT</fixed-case>17 - Yuguang Wang - 
Shanbo Cheng - Liyang Jiang - Jiajun Yang - Wei Chen - Muze Li - Lin Shi - Yanfeng Wang - Hongtao Yang + YuguangWang + ShanboCheng + LiyangJiang + JiajunYang + WeiChen + MuzeLi + LinShi + YanfengWang + HongtaoYang 410–415 W17-4742 10.18653/v1/W17-4742 <fixed-case>PJIIT</fixed-case>’s systems for <fixed-case>WMT</fixed-case> 2017 Conference - Krzysztof Wolk - Krzysztof Marasek + KrzysztofWolk + KrzysztofMarasek 416–421 W17-4743 10.18653/v1/W17-4743 @@ -8024,296 +8024,296 @@ is able to handle phenomena related to scope by means of an higher-order type th Hunter <fixed-case>MT</fixed-case>: A Course for Young Researchers in <fixed-case>WMT</fixed-case>17 - Jia Xu - Yi Zong Kuang - Shondell Baijoo - Jacob Hyun Lee - Uman Shahzad - Mir Ahmed - Meredith Lancaster - Chris Carlan + JiaXu + Yi ZongKuang + ShondellBaijoo + Jacob HyunLee + UmanShahzad + MirAhmed + MeredithLancaster + ChrisCarlan 422–427 W17-4744 10.18653/v1/W17-4744 <fixed-case>CASICT</fixed-case>-<fixed-case>DCU</fixed-case> Neural Machine Translation Systems for <fixed-case>WMT</fixed-case>17 - Jinchao Zhang - Peerachet Porkaew - Jiawei Hu - Qiuye Zhao - Qun Liu + JinchaoZhang + PeerachetPorkaew + JiaweiHu + QiuyeZhao + QunLiu 428–431 W17-4745 10.18653/v1/W17-4745 <fixed-case>LIUM</fixed-case>-<fixed-case>CVC</fixed-case> Submissions for <fixed-case>WMT</fixed-case>17 Multimodal Translation Task - Ozan Caglayan - Walid Aransa - Adrien Bardet - Mercedes García-Martínez - Fethi Bougares - Loïc Barrault - Marc Masana - Luis Herranz - Joost van de Weijer + OzanCaglayan + WalidAransa + AdrienBardet + MercedesGarcía-Martínez + FethiBougares + LoïcBarrault + MarcMasana + LuisHerranz + Joostvan de Weijer 432–439 W17-4746 10.18653/v1/W17-4746 <fixed-case>DCU</fixed-case> System Report on the <fixed-case>WMT</fixed-case> 2017 Multi-modal Machine Translation Task - Iacer Calixto - Koel Dutta Chowdhury - Qun Liu + IacerCalixto + KoelDutta Chowdhury + QunLiu 440–444 W17-4747 10.18653/v1/W17-4747 The <fixed-case>AFRL</fixed-case>-<fixed-case>OSU</fixed-case> <fixed-case>WMT</fixed-case>17 Multimodal Translation System: An Image Processing Approach - John Duselis - Michael Hutt - Jeremy Gwinnup - James Davis - Joshua Sandvick + JohnDuselis + MichaelHutt + JeremyGwinnup + JamesDavis + JoshuaSandvick 445–449 W17-4748 10.18653/v1/W17-4748 <fixed-case>CUNI</fixed-case> System for the <fixed-case>WMT</fixed-case>17 Multimodal Translation Task - Jindřich Helcl - Jindřich Libovický + JindřichHelcl + JindřichLibovický 450–457 W17-4749 10.18653/v1/W17-4749 Generating Image Descriptions using Multilingual Data - Alan Jaffe + AlanJaffe 458–464 W17-4750 10.18653/v1/W17-4750 <fixed-case>OSU</fixed-case> Multimodal Machine Translation System Report - Mingbo Ma - Dapeng Li - Kai Zhao - Liang Huang + MingboMa + DapengLi + KaiZhao + LiangHuang 465–469 W17-4751 10.18653/v1/W17-4751 <fixed-case>S</fixed-case>heffield <fixed-case>M</fixed-case>ulti<fixed-case>MT</fixed-case>: Using Object Posterior Predictions for Multimodal Machine Translation - Pranava Swaroop Madhyastha - Josiah Wang - Lucia Specia + Pranava SwaroopMadhyastha + JosiahWang + LuciaSpecia 470–476 W17-4752 10.18653/v1/W17-4752 <fixed-case>NICT</fixed-case>-<fixed-case>NAIST</fixed-case> System for <fixed-case>WMT</fixed-case>17 Multimodal Translation Task - Jingyi Zhang - Masao Utiyama - Eiichro Sumita - Graham Neubig - Satoshi Nakamura + JingyiZhang + MasaoUtiyama + EiichroSumita + GrahamNeubig + SatoshiNakamura 477–482 W17-4753 10.18653/v1/W17-4753 Automatic Threshold Detection for Data Selection 
in Machine Translation - Mirela-Stefania Duma - Wolfgang Menzel + Mirela-StefaniaDuma + WolfgangMenzel 483–488 W17-4754 10.18653/v1/W17-4754 Results of the <fixed-case>WMT</fixed-case>17 Metrics Shared Task - Ondřej Bojar - Yvette Graham - Amir Kamran + OndřejBojar + YvetteGraham + AmirKamran 489–513 W17-4755 10.18653/v1/W17-4755 A Shared Task on Bandit Learning for Machine Translation - Artem Sokolov - Julia Kreutzer - Kellen Sunderland - Pavel Danchenko - Witold Szymaniak - Hagen Fürstenau - Stefan Riezler + ArtemSokolov + JuliaKreutzer + KellenSunderland + PavelDanchenko + WitoldSzymaniak + HagenFürstenau + StefanRiezler 514–524 W17-4756 10.18653/v1/W17-4756 Results of the <fixed-case>WMT</fixed-case>17 Neural <fixed-case>MT</fixed-case> Training Task - Ondřej Bojar - Jindřich Helcl - Tom Kocmi - Jindřich Libovický - Tomáš Musil + OndřejBojar + JindřichHelcl + TomKocmi + JindřichLibovický + TomášMusil 525–533 W17-4757 10.18653/v1/W17-4757 Sentence-level quality estimation by predicting <fixed-case>HTER</fixed-case> as a multi-component metric - Eleftherios Avramidis + EleftheriosAvramidis 534–539 W17-4758 10.18653/v1/W17-4758 Predicting Translation Performance with Referential Translation Machines - Ergun Biçici + ErgunBiçici 540–544 W17-4759 10.18653/v1/W17-4759 Bilexical Embeddings for Quality Estimation - Frédéric Blain - Carolina Scarton - Lucia Specia + FrédéricBlain + CarolinaScarton + LuciaSpecia 545–550 W17-4760 10.18653/v1/W17-4760 Improving Machine Translation Quality Estimation with Neural Network Features - Zhiming Chen - Yiming Tan - Chenlin Zhang - Qingyu Xiang - Lilin Zhang - Maoxi Li - Mingwen Wang + ZhimingChen + YimingTan + ChenlinZhang + QingyuXiang + LilinZhang + MaoxiLi + MingwenWang 551–555 W17-4761 10.18653/v1/W17-4761 <fixed-case>UHH</fixed-case> Submission to the <fixed-case>WMT</fixed-case>17 Quality Estimation Shared Task - Melania Duma - Wolfgang Menzel + MelaniaDuma + WolfgangMenzel 556–561 W17-4762 10.18653/v1/W17-4762 Predictor-Estimator using Multilevel Task Learning with Stack Propagation for Neural Quality Estimation - Hyun Kim - Jong-Hyeok Lee - Seung-Hoon Na + HyunKim + Jong-HyeokLee + Seung-HoonNa 562–568 W17-4763 10.18653/v1/W17-4763 Unbabel’s Participation in the <fixed-case>WMT</fixed-case>17 Translation Quality Estimation Shared Task - André F. T. Martins - Fabio Kepler - José Monteiro + André F. 
T.Martins + FabioKepler + JoséMonteiro 569–574 W17-4764 10.18653/v1/W17-4764 Feature-Enriched Character-Level Convolutions for Text Regression - Gustavo Paetzold - Lucia Specia + GustavoPaetzold + LuciaSpecia 575–581 W17-4765 10.18653/v1/W17-4765 <fixed-case>UHH</fixed-case> Submission to the <fixed-case>WMT</fixed-case>17 Metrics Shared Task - Melania Duma - Wolfgang Menzel + MelaniaDuma + WolfgangMenzel 582–588 W17-4766 10.18653/v1/W17-4766 <fixed-case>MEANT</fixed-case> 2.0: Accurate semantic <fixed-case>MT</fixed-case> evaluation for any output language - Chi-kiu Lo + Chi-kiuLo 589–597 W17-4767 10.18653/v1/W17-4767 <fixed-case>B</fixed-case>lend: a Novel Combined <fixed-case>MT</fixed-case> Metric Based on Direct Assessment — <fixed-case>CASICT</fixed-case>-<fixed-case>DCU</fixed-case> submission to <fixed-case>WMT</fixed-case>17 Metrics Task - Qingsong Ma - Yvette Graham - Shugen Wang - Qun Liu + QingsongMa + YvetteGraham + ShugenWang + QunLiu 598–603 W17-4768 10.18653/v1/W17-4768 <fixed-case>CUNI</fixed-case> Experiments for <fixed-case>WMT</fixed-case>17 Metrics Task - David Mareček - Ondřej Bojar - Ondřej Hübsch - Rudolf Rosa - Dušan Variš + DavidMareček + OndřejBojar + OndřejHübsch + RudolfRosa + DušanVariš 604–611 W17-4769 10.18653/v1/W17-4769 chr<fixed-case>F</fixed-case>++: words helping character n-grams - Maja Popović + MajaPopović 612–618 W17-4770 10.18653/v1/W17-4770 bleu2vec: the Painfully Familiar Metric on Continuous Vector Space Steroids - Andre Tättar - Mark Fishel + AndreTättar + MarkFishel 619–622 W17-4771 10.18653/v1/W17-4771 <fixed-case>LIG</fixed-case>-<fixed-case>CRIS</fixed-case>t<fixed-case>AL</fixed-case> Submission for the <fixed-case>WMT</fixed-case> 2017 Automatic Post-Editing Task - Alexandre Bérard - Laurent Besacier - Olivier Pietquin + AlexandreBérard + LaurentBesacier + OlivierPietquin 623–629 W17-4772 10.18653/v1/W17-4772 Multi-source Neural Automatic Post-Editing: <fixed-case>FBK</fixed-case>’s participation in the <fixed-case>WMT</fixed-case> 2017 <fixed-case>APE</fixed-case> shared task - Rajen Chatterjee - M. Amin Farajian - Matteo Negri - Marco Turchi - Ankit Srivastava - Santanu Pal + RajenChatterjee + M. 
AminFarajian + MatteoNegri + MarcoTurchi + AnkitSrivastava + SantanuPal 630–638 W17-4773 10.18653/v1/W17-4773 The <fixed-case>AMU</fixed-case>-<fixed-case>UE</fixed-case>din Submission to the <fixed-case>WMT</fixed-case> 2017 Shared Task on Automatic Post-Editing - Marcin Junczys-Dowmunt + MarcinJunczys-Dowmunt 639–646 W17-4774 10.18653/v1/W17-4774 @@ -8321,63 +8321,63 @@ is able to handle phenomena related to scope by means of an higher-order type th Ensembling Factored Neural Machine Translation Models for Automatic Post-Editing and Quality Estimation - Chris Hokamp + ChrisHokamp 647–654 W17-4775 10.18653/v1/W17-4775 Neural Post-Editing Based on Quality Estimation - Yiming Tan - Zhiming Chen - Liu Huang - Lilin Zhang - Maoxi Li - Mingwen Wang + YimingTan + ZhimingChen + LiuHuang + LilinZhang + MaoxiLi + MingwenWang 655–660 W17-4776 10.18653/v1/W17-4776 <fixed-case>CUNI</fixed-case> System for <fixed-case>WMT</fixed-case>17 Automatic Post-Editing Task - Dušan Variš - Ondřej Bojar + DušanVariš + OndřejBojar 661–666 W17-4777 10.18653/v1/W17-4777 The <fixed-case>UMD</fixed-case> Neural Machine Translation Systems at <fixed-case>WMT</fixed-case>17 Bandit Learning Task - Amr Sharaf - Shi Feng - Khanh Nguyen - Kianté Brantley - Hal Daumé III + AmrSharaf + ShiFeng + KhanhNguyen + KiantéBrantley + HalDaumé III 667–673 W17-4778 10.18653/v1/W17-4778 <fixed-case>LIMSI</fixed-case> Submission for <fixed-case>WMT</fixed-case>’17 Shared Task on Bandit Learning - Guillaume Wisniewski + GuillaumeWisniewski 674–679 W17-4779 10.18653/v1/W17-4779 Variable Mini-Batch Sizing and Pre-Trained Embeddings - Mostafa Abdou - Vladan Glončák - Ondřej Bojar + MostafaAbdou + VladanGlončák + OndřejBojar 680–686 W17-4780 10.18653/v1/W17-4780 The <fixed-case>AFRL</fixed-case> <fixed-case>WMT</fixed-case>17 Neural Machine Translation Training Task Submission - Jeremy Gwinnup - Grant Erdmann - Katherine Young + JeremyGwinnup + GrantErdmann + KatherineYoung 687–691 W17-4781 10.18653/v1/W17-4781 @@ -8401,13 +8401,13 @@ is able to handle phenomena related to scope by means of an higher-order type th Findings of the 2017 <fixed-case>D</fixed-case>isco<fixed-case>MT</fixed-case> Shared Task on Cross-lingual Pronoun Prediction - Sharid Loáiciga - Sara Stymne - Preslav Nakov - Christian Hardmeier - Jörg Tiedemann - Mauro Cettolo - Yannick Versley + SharidLoáiciga + SaraStymne + PreslavNakov + ChristianHardmeier + JörgTiedemann + MauroCettolo + YannickVersley 1–16 W17-4801 10.18653/v1/W17-4801 @@ -8416,8 +8416,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Validation of an Automatic Metric for the Accuracy of Pronoun Translation (<fixed-case>APT</fixed-case>) - Lesly Miculicich Werlen - Andrei Popescu-Belis + LeslyMiculicich Werlen + AndreiPopescu-Belis 17–25 W17-4802 10.18653/v1/W17-4802 @@ -8425,9 +8425,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Using a Graph-based Coherence Model in Document-Level Machine Translation - Leo Born - Mohsen Mesgar - Michael Strube + LeoBorn + MohsenMesgar + MichaelStrube 26–35 W17-4803 10.18653/v1/W17-4803 @@ -8435,7 +8435,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Treatment of Markup in Statistical Machine Translation - Mathias Müller + MathiasMüller 36–46 W17-4804 10.18653/v1/W17-4804 @@ -8443,9 +8443,9 @@ is able to handle phenomena related to scope by means of an higher-order type th A
<fixed-case>B</fixed-case>i<fixed-case>LSTM</fixed-case>-based System for Cross-lingual Pronoun Prediction - Sara Stymne - Sharid Loáiciga - Fabienne Cap + SaraStymne + SharidLoáiciga + FabienneCap 47–53 W17-4805 10.18653/v1/W17-4805 @@ -8453,10 +8453,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Neural Machine Translation for Cross-Lingual Pronoun Prediction - Sebastien Jean - Stanislas Lauly - Orhan Firat - Kyunghyun Cho + SebastienJean + StanislasLauly + OrhanFirat + KyunghyunCho 54–57 W17-4806 10.18653/v1/W17-4806 @@ -8464,7 +8464,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Predicting Pronouns with a Convolutional Network and an N-gram Model - Christian Hardmeier + ChristianHardmeier 58–62 W17-4807 10.18653/v1/W17-4807 @@ -8472,9 +8472,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Cross-Lingual Pronoun Prediction with Deep Recurrent Neural Networks v2.0 - Juhani Luotolahti - Jenna Kanerva - Filip Ginter + JuhaniLuotolahti + JennaKanerva + FilipGinter 63–66 W17-4808 10.18653/v1/W17-4808 @@ -8482,7 +8482,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Combining the output of two coreference resolution systems for two source languages to improve annotation projection - Yulia Grishina + YuliaGrishina 67–72 W17-4809 10.18653/v1/W17-4809 @@ -8490,8 +8490,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Discovery of Discourse-Related Language Contrasts through Alignment Discrepancies in <fixed-case>E</fixed-case>nglish-<fixed-case>G</fixed-case>erman Translation - Ekaterina Lapshinova-Koltunski - Christian Hardmeier + EkaterinaLapshinova-Koltunski + ChristianHardmeier 73–81 W17-4810 10.18653/v1/W17-4810 @@ -8499,8 +8499,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Neural Machine Translation with Extended Context - Jörg Tiedemann - Yves Scherrer + JörgTiedemann + YvesScherrer 82–92 W17-4811 10.18653/v1/W17-4811 @@ -8508,9 +8508,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Translating Implicit Discourse Connectives Based on Cross-lingual Annotation and Alignment - Hongzheng Li - Philippe Langlais - Yaohong Jin + HongzhengLi + PhilippeLanglais + YaohongJin 93–98 W17-4812 10.18653/v1/W17-4812 @@ -8518,7 +8518,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Lexical Chains meet Word Embeddings in Document-level Statistical Machine Translation - Laura Mascarell + LauraMascarell 99–109 W17-4813 10.18653/v1/W17-4813 @@ -8526,7 +8526,7 @@ is able to handle phenomena related to scope by means of an higher-order type th On Integrating Discourse in Machine Translation - Karin Sim Smith + KarinSim Smith 110–121 W17-4814 10.18653/v1/W17-4814 @@ -8551,7 +8551,7 @@ is able to handle phenomena related to scope by means of an higher-order type th From Shakespeare to Twitter: What are Language Styles all about? 
- Wei Xu + WeiXu 1–9 W17-4901 10.18653/v1/W17-4901 @@ -8559,10 +8559,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Shakespearizing Modern Language Using Copy-Enriched Sequence to Sequence Models - Harsh Jhamtani - Varun Gangal - Eduard Hovy - Eric Nyberg + HarshJhamtani + VarunGangal + EduardHovy + EricNyberg 10–19 W17-4902 10.18653/v1/W17-4902 @@ -8571,8 +8571,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Discovering Stylistic Variations in Distributional Vector Space Models via Lexical Paraphrases - Xing Niu - Marine Carpuat + XingNiu + MarineCarpuat 20–27 W17-4903 10.18653/v1/W17-4903 @@ -8580,9 +8580,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Harvesting Creative Templates for Generating Stylistically Varied Restaurant Reviews - Shereen Oraby - Sheideh Homayon - Marilyn Walker + ShereenOraby + SheidehHomayon + MarilynWalker 28–36 W17-4904 10.18653/v1/W17-4904 @@ -8590,8 +8590,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Is writing style predictive of scientific fraud? - Chloé Braud - Anders Søgaard + ChloéBraud + AndersSøgaard 37–42 W17-4905 10.18653/v1/W17-4905 @@ -8599,9 +8599,9 @@ is able to handle phenomena related to scope by means of an higher-order type th “Deep” Learning : Detecting Metaphoricity in Adjective-Noun Pairs - Yuri Bizzoni - Stergios Chatzikyriakidis - Mehdi Ghanimifard + YuriBizzoni + StergiosChatzikyriakidis + MehdiGhanimifard 43–52 W17-4906 10.18653/v1/W17-4906 @@ -8609,9 +8609,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Authorship Attribution with Convolutional Neural Networks and <fixed-case>POS</fixed-case>-Eliding - Julian Hitschler - Esther van den Berg - Ines Rehbein + JulianHitschler + Esthervan den Berg + InesRehbein 53–58 W17-4907 10.18653/v1/W17-4907 @@ -8619,9 +8619,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Topic and audience effects on distinctively <fixed-case>S</fixed-case>cottish vocabulary usage in Twitter data - Philippa Shoemark - James Kirby - Sharon Goldwater + PhilippaShoemark + JamesKirby + SharonGoldwater 59–68 W17-4908 10.18653/v1/W17-4908 @@ -8629,10 +8629,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Differences in type-token ratio and part-of-speech frequencies in male and female <fixed-case>R</fixed-case>ussian written texts - Tatiana Litvinova - Pavel Seredin - Olga Litvinova - Olga Zagorovskaya + TatianaLitvinova + PavelSeredin + OlgaLitvinova + OlgaZagorovskaya 69–73 W17-4909 10.18653/v1/W17-4909 @@ -8640,8 +8640,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Modeling Communicative Purpose with Functional Style: Corpus and Features for <fixed-case>G</fixed-case>erman Genre and Register Analysis - Thomas Haider - Alexis Palmer + ThomasHaider + AlexisPalmer 74–84 W17-4910 10.18653/v1/W17-4910 @@ -8649,8 +8649,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Stylistic Variation in Television Dialogue for Natural Language Generation - Grace Lin - Marilyn Walker + GraceLin + MarilynWalker 85–93 W17-4911 10.18653/v1/W17-4911 @@ -8658,8 +8658,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Controlling Linguistic Style Aspects in Neural Language Generation - Jessica Ficler - Yoav Goldberg + JessicaFicler + YoavGoldberg 94–104 W17-4912 
10.18653/v1/W17-4912 @@ -8667,8 +8667,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Approximating Style by N-gram-based Annotation - Melanie Andresen - Heike Zinsmeister + MelanieAndresen + HeikeZinsmeister 105–115 W17-4913 10.18653/v1/W17-4913 @@ -8676,10 +8676,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Assessing the Stylistic Properties of Neurally Generated Text in Authorship Attribution - Enrique Manjavacas - Jeroen De Gussem - Walter Daelemans - Mike Kestemont + EnriqueManjavacas + JeroenDe Gussem + WalterDaelemans + MikeKestemont 116–125 W17-4914 10.18653/v1/W17-4914 @@ -8705,7 +8705,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Question Difficulty – How to Estimate Without Norming, How to Use for Automated Grading - Ulrike Padó + UlrikePadó 1–10 W17-5001 10.18653/v1/W17-5001 @@ -8713,11 +8713,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Combining <fixed-case>CNN</fixed-case>s and Pattern Matching for Question Interpretation in a Virtual Patient Dialogue System - Lifeng Jin - Michael White - Evan Jaffe - Laura Zimmerman - Douglas Danforth + LifengJin + MichaelWhite + EvanJaffe + LauraZimmerman + DouglasDanforth 11–21 W17-5002 10.18653/v1/W17-5002 @@ -8725,10 +8725,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Continuous fluency tracking and the challenges of varying text complexity - Beata Beigman Klebanov - Anastassia Loukina - John Sabatini - Tenaha O’Reilly + BeataBeigman Klebanov + AnastassiaLoukina + JohnSabatini + TenahaO’Reilly 22–32 W17-5003 10.18653/v1/W17-5003 @@ -8736,8 +8736,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Auxiliary Objectives for Neural Error Detection Models - Marek Rei - Helen Yannakoudakis + MarekRei + HelenYannakoudakis 33–43 W17-5004 10.18653/v1/W17-5004 @@ -8745,10 +8745,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Linked Data for Language-Learning Applications - Robyn Loughnane - Kate McCurdy - Peter Kolb - Stefan Selent + RobynLoughnane + KateMcCurdy + PeterKolb + StefanSelent 44–51 W17-5005 10.18653/v1/W17-5005 @@ -8756,8 +8756,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Predicting Specificity in Classroom Discussion - Luca Lugini - Diane Litman + LucaLugini + DianeLitman 52–61 W17-5006 10.18653/v1/W17-5006 @@ -8765,14 +8765,14 @@ is able to handle phenomena related to scope by means of an higher-order type th A Report on the 2017 Native Language Identification Shared Task - Shervin Malmasi - Keelan Evanini - Aoife Cahill - Joel Tetreault - Robert Pugh - Christopher Hamill - Diane Napolitano - Yao Qian + ShervinMalmasi + KeelanEvanini + AoifeCahill + JoelTetreault + RobertPugh + ChristopherHamill + DianeNapolitano + YaoQian 62–75 W17-5007 10.18653/v1/W17-5007 @@ -8780,8 +8780,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Evaluation of Automatically Generated Pronoun Reference Questions - Arief Yudha Satria - Takenobu Tokunaga + Arief YudhaSatria + TakenobuTokunaga 76–85 W17-5008 10.18653/v1/W17-5008 @@ -8789,8 +8789,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Predicting Audience’s Laughter During Presentations Using Convolutional Neural Network - Lei Chen - Chong Min Lee + LeiChen + Chong MinLee 86–90 W17-5009 10.18653/v1/W17-5009 @@ -8798,9 
+8798,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Collecting fluency corrections for spoken learner <fixed-case>E</fixed-case>nglish - Andrew Caines - Emma Flint - Paula Buttery + AndrewCaines + EmmaFlint + PaulaButtery 91–100 W17-5010 10.18653/v1/W17-5010 @@ -8808,10 +8808,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Exploring Relationships Between Writing & Broader Outcomes With Automated Writing Evaluation - Jill Burstein - Dan McCaffrey - Beata Beigman Klebanov - Guangming Ling + JillBurstein + DanMcCaffrey + BeataBeigman Klebanov + GuangmingLing 101–108 W17-5011 10.18653/v1/W17-5011 @@ -8819,10 +8819,10 @@ is able to handle phenomena related to scope by means of an higher-order type th An Investigation into the Pedagogical Features of Documents - Emily Sheng - Prem Natarajan - Jonathan Gordon - Gully Burns + EmilySheng + PremNatarajan + JonathanGordon + GullyBurns 109–120 W17-5012 10.18653/v1/W17-5012 @@ -8830,10 +8830,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Combining Multiple Corpora for Readability Assessment for People with Cognitive Disabilities - Victoria Yaneva - Constantin Orăsan - Richard Evans - Omid Rohanian + VictoriaYaneva + ConstantinOrăsan + RichardEvans + OmidRohanian 121–132 W17-5013 10.18653/v1/W17-5013 @@ -8841,8 +8841,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Automatic Extraction of High-Quality Example Sentences for Word Learning Using a Determinantal Point Process - Arseny Tolmachev - Sadao Kurohashi + ArsenyTolmachev + SadaoKurohashi 133–142 W17-5014 10.18653/v1/W17-5014 @@ -8851,8 +8851,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Distractor Generation for <fixed-case>C</fixed-case>hinese Fill-in-the-blank Items - Shu Jiang - John Lee + ShuJiang + JohnLee 143–148 W17-5015 10.18653/v1/W17-5015 @@ -8860,9 +8860,9 @@ is able to handle phenomena related to scope by means of an higher-order type th An Error-Oriented Approach to Word Embedding Pre-Training - Youmna Farag - Marek Rei - Ted Briscoe + YoumnaFarag + MarekRei + TedBriscoe 149–158 W17-5016 10.18653/v1/W17-5016 @@ -8870,11 +8870,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Investigating neural architectures for short answer scoring - Brian Riordan - Andrea Horbach - Aoife Cahill - Torsten Zesch - Chong Min Lee + BrianRiordan + AndreaHorbach + AoifeCahill + TorstenZesch + Chong MinLee 159–168 W17-5017 10.18653/v1/W17-5017 @@ -8882,10 +8882,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Human and Automated <fixed-case>CEFR</fixed-case>-based Grading of Short Answers - Anaïs Tack - Thomas François - Sophie Roekhaut - Cédrick Fairon + AnaïsTack + ThomasFrançois + SophieRoekhaut + CédrickFairon 169–179 W17-5018 10.18653/v1/W17-5018 @@ -8893,9 +8893,9 @@ is able to handle phenomena related to scope by means of an higher-order type th <fixed-case>GEC</fixed-case> into the future: Where are we going and how do we get there? 
- Keisuke Sakaguchi - Courtney Napoles - Joel Tetreault + KeisukeSakaguchi + CourtneyNapoles + JoelTetreault 180–187 W17-5019 10.18653/v1/W17-5019 @@ -8903,7 +8903,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Detecting Off-topic Responses to Visual Prompts - Marek Rei + MarekRei 188–197 W17-5020 10.18653/v1/W17-5020 @@ -8911,11 +8911,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Combining Textual and Speech Features in the <fixed-case>NLI</fixed-case> Task Using State-of-the-Art Machine Learning Techniques - Pavel Ircing - Jan Švec - Zbyněk Zajíc - Barbora Hladká - Martin Holub + PavelIrcing + JanŠvec + ZbyněkZajíc + BarboraHladká + MartinHolub 198–209 W17-5021 10.18653/v1/W17-5021 @@ -8923,9 +8923,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Native Language Identification Using a Mixture of Character and Word N-grams - Elham Mohammadi - Hadi Veisi - Hessam Amini + ElhamMohammadi + HadiVeisi + HessamAmini 210–216 W17-5022 10.18653/v1/W17-5022 @@ -8934,11 +8934,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Ensemble Methods for Native Language Identification - Sophia Chan - Maryam Honari Jahromi - Benjamin Benetti - Aazim Lakhani - Alona Fyshe + SophiaChan + MaryamHonari Jahromi + BenjaminBenetti + AazimLakhani + AlonaFyshe 217–223 W17-5023 10.18653/v1/W17-5023 @@ -8946,8 +8946,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Can string kernels pass the test of time in Native Language Identification? - Radu Tudor Ionescu - Marius Popescu + Radu TudorIonescu + MariusPopescu 224–234 W17-5024 10.18653/v1/W17-5024 @@ -8955,10 +8955,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Neural Networks and Spelling Features for Native Language Identification - Johannes Bjerva - Gintarė Grigonytė - Robert Östling - Barbara Plank + JohannesBjerva + GintarėGrigonytė + RobertÖstling + BarbaraPlank 235–239 W17-5025 10.18653/v1/W17-5025 @@ -8966,8 +8966,8 @@ is able to handle phenomena related to scope by means of an higher-order type th A study of N-gram and Embedding Representations for Native Language Identification - Sowmya Vajjala - Sagnik Banerjee + SowmyaVajjala + SagnikBanerjee 240–248 W17-5026 10.18653/v1/W17-5026 @@ -8975,9 +8975,9 @@ is able to handle phenomena related to scope by means of an higher-order type th A Shallow Neural Network for Native Language Identification with Character N-grams - Yunita Sari - Muhammad Rifqi Fatchurrahman - Meisyarah Dwiastuti + YunitaSari + MuhammadRifqi Fatchurrahman + MeisyarahDwiastuti 249–254 W17-5027 10.18653/v1/W17-5027 @@ -8985,8 +8985,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Fewer features perform well at Native Language Identification task - Taraka Rama - Çağrı Çöltekin + TarakaRama + ÇağrıÇöltekin 255–260 W17-5028 10.18653/v1/W17-5028 @@ -8994,10 +8994,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Structured Generation of Technical Reading Lists - Jonathan Gordon - Stephen Aguilar - Emily Sheng - Gully Burns + JonathanGordon + StephenAguilar + EmilySheng + GullyBurns 261–270 W17-5029 10.18653/v1/W17-5029 @@ -9005,10 +9005,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Effects of Lexical Properties on Viewing Time per Word in Autistic and Neurotypical Readers - Sanja Štajner - Victoria Yaneva - 
Ruslan Mitkov - Simone Paolo Ponzetto + SanjaŠtajner + VictoriaYaneva + RuslanMitkov + Simone PaoloPonzetto 271–281 W17-5030 10.18653/v1/W17-5030 @@ -9016,8 +9016,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Transparent text quality assessment with convolutional neural networks - Robert Östling - Gintare Grigonyte + RobertÖstling + GintareGrigonyte 282–286 W17-5031 10.18653/v1/W17-5031 @@ -9025,10 +9025,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Artificial Error Generation with Machine Translation and Syntactic Patterns - Marek Rei - Mariano Felice - Zheng Yuan - Ted Briscoe + MarekRei + MarianoFelice + ZhengYuan + TedBriscoe 287–292 W17-5032 10.18653/v1/W17-5032 @@ -9036,8 +9036,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Modelling semantic acquisition in second language learning - Ekaterina Kochmar - Ekaterina Shutova + EkaterinaKochmar + EkaterinaShutova 293–302 W17-5033 10.18653/v1/W17-5033 @@ -9045,8 +9045,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Multiple Choice Question Generation Utilizing An Ontology - Katherine Stasaski - Marti A. Hearst + KatherineStasaski + Marti A.Hearst 303–312 W17-5034 10.18653/v1/W17-5034 @@ -9055,8 +9055,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Simplifying metaphorical language for young readers: A corpus study on news text - Magdalena Wolska - Yulia Clausen + MagdalenaWolska + YuliaClausen 313–318 W17-5035 10.18653/v1/W17-5035 @@ -9064,8 +9064,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Language Based Mapping of Science Assessment Items to Skills - Farah Nadeem - Mari Ostendorf + FarahNadeem + MariOstendorf 319–326 W17-5036 10.18653/v1/W17-5036 @@ -9074,8 +9074,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Connecting the Dots: Towards Human-Level Grammatical Error Correction - Shamil Chollampatt - Hwee Tou Ng + ShamilChollampatt + Hwee TouNg 327–333 W17-5037 10.18653/v1/W17-5037 @@ -9083,8 +9083,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Question Generation for Language Learning: From ensuring texts are read to supporting learning - Maria Chinkina - Detmar Meurers + MariaChinkina + DetmarMeurers 334–344 W17-5038 10.18653/v1/W17-5038 @@ -9092,8 +9092,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Systematically Adapting Machine Translation for Grammatical Error Correction - Courtney Napoles - Chris Callison-Burch + CourtneyNapoles + ChrisCallison-Burch 345–356 W17-5039 10.18653/v1/W17-5039 @@ -9101,10 +9101,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Fine-grained essay scoring of a complex writing task for native speakers - Andrea Horbach - Dirk Scholten-Akoun - Yuning Ding - Torsten Zesch + AndreaHorbach + DirkScholten-Akoun + YuningDing + TorstenZesch 357–366 W17-5040 10.18653/v1/W17-5040 @@ -9112,8 +9112,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Exploring Optimal Voting in Native Language Identification - Cyril Goutte - Serge Léger + CyrilGoutte + SergeLéger 367–373 W17-5041 10.18653/v1/W17-5041 @@ -9121,10 +9121,10 @@ is able to handle phenomena related to scope by means of an higher-order type th <fixed-case>CIC</fixed-case>-<fixed-case>FBK</fixed-case> Approach to Native Language 
Identification - Ilia Markov - Lingzhen Chen - Carlo Strapparava - Grigori Sidorov + IliaMarkov + LingzhenChen + CarloStrapparava + GrigoriSidorov 374–381 W17-5042 10.18653/v1/W17-5042 @@ -9132,13 +9132,13 @@ is able to handle phenomena related to scope by means of an higher-order type th The Power of Character N-grams in Native Language Identification - Artur Kulmizev - Bo Blankers - Johannes Bjerva - Malvina Nissim - Gertjan van Noord - Barbara Plank - Martijn Wieling + ArturKulmizev + BoBlankers + JohannesBjerva + MalvinaNissim + Gertjanvan Noord + BarbaraPlank + MartijnWieling 382–389 W17-5043 10.18653/v1/W17-5043 @@ -9146,8 +9146,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Classifier Stacking for Native Language Identification - Wen Li - Liang Zou + WenLi + LiangZou 390–397 W17-5044 10.18653/v1/W17-5044 @@ -9155,9 +9155,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Native Language Identification on Text and Speech - Marcos Zampieri - Alina Maria Ciobanu - Liviu P. Dinu + MarcosZampieri + Alina MariaCiobanu + Liviu P.Dinu 398–404 W17-5045 10.18653/v1/W17-5045 @@ -9165,8 +9165,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Native Language Identification using Phonetic Algorithms - Charese Smiley - Sandra Kübler + ChareseSmiley + SandraKübler 405–412 W17-5046 10.18653/v1/W17-5046 @@ -9174,12 +9174,12 @@ is able to handle phenomena related to scope by means of an higher-order type th A deep-learning based native-language classification by using a latent semantic analysis for the <fixed-case>NLI</fixed-case> Shared Task 2017 - Yoo Rhee Oh - Hyung-Bae Jeon - Hwa Jeon Song - Yun-Kyung Lee - Jeon-Gue Park - Yun-Keun Lee + Yoo RheeOh + Hyung-BaeJeon + Hwa JeonSong + Yun-KyungLee + Jeon-GuePark + Yun-KeunLee 413–422 W17-5047 10.18653/v1/W17-5047 @@ -9187,9 +9187,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Fusion of Simple Models for Native Language Identification - Fabio Kepler - Ramon Astudillo - Alberto Abad + FabioKepler + RamonAstudillo + AlbertoAbad 423–429 W17-5048 10.18653/v1/W17-5048 @@ -9197,8 +9197,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Stacked Sentence-Document Classifier Approach for Improving Native Language Identification - Andrea Cimino - Felice Dell’Orletta + AndreaCimino + FeliceDell’Orletta 430–437 W17-5049 10.18653/v1/W17-5049 @@ -9206,8 +9206,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Using Gaze to Predict Text Readability - Ana Valeria González-Garduño - Anders Søgaard + Ana ValeriaGonzález-Garduño + AndersSøgaard 438–443 W17-5050 10.18653/v1/W17-5050 @@ -9215,11 +9215,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Annotating Orthographic Target Hypotheses in a <fixed-case>G</fixed-case>erman <fixed-case>L</fixed-case>1 Learner Corpus - Ronja Laarmann-Quante - Katrin Ortmann - Anna Ehlert - Maurice Vogel - Stefanie Dipper + RonjaLaarmann-Quante + KatrinOrtmann + AnnaEhlert + MauriceVogel + StefanieDipper 444–456 W17-5051 10.18653/v1/W17-5051 @@ -9227,9 +9227,9 @@ is able to handle phenomena related to scope by means of an higher-order type th A Large Scale Quantitative Exploration of Modeling Strategies for Content Scoring - Nitin Madnani - Anastassia Loukina - Aoife Cahill + NitinMadnani + AnastassiaLoukina + AoifeCahill 457–467 W17-5052 10.18653/v1/W17-5052 @@ -9261,10 +9261,10 @@ is 
able to handle phenomena related to scope by means of an higher-order type th 200<fixed-case>K</fixed-case>+ Crowdsourced Political Arguments for a New <fixed-case>C</fixed-case>hilean Constitution - Constanza Fierro - Claudio Fuentes - Jorge Pérez - Mauricio Quezada + ConstanzaFierro + ClaudioFuentes + JorgePérez + MauricioQuezada 1–10 W17-5101 10.18653/v1/W17-5101 @@ -9272,11 +9272,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Analyzing the Semantic Types of Claims and Premises in an Online Persuasive Forum - Christopher Hidey - Elena Musi - Alyssa Hwang - Smaranda Muresan - Kathy McKeown + ChristopherHidey + ElenaMusi + AlyssaHwang + SmarandaMuresan + KathyMcKeown 11–21 W17-5102 10.18653/v1/W17-5102 @@ -9284,9 +9284,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Annotation of argument structure in <fixed-case>J</fixed-case>apanese legal documents - Hiroaki Yamada - Simone Teufel - Takenobu Tokunaga + HiroakiYamada + SimoneTeufel + TakenobuTokunaga 22–31 W17-5103 10.18653/v1/W17-5103 @@ -9294,10 +9294,10 @@ is able to handle phenomena related to scope by means of an higher-order type th Improving Claim Stance Classification with Lexical Knowledge Expansion and Context Utilization - Roy Bar-Haim - Lilach Edelstein - Charles Jochim - Noam Slonim + RoyBar-Haim + LilachEdelstein + CharlesJochim + NoamSlonim 32–38 W17-5104 10.18653/v1/W17-5104 @@ -9305,8 +9305,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Mining Argumentative Structure from Natural Language text using Automatically Generated Premise-Conclusion Topic Models - John Lawrence - Chris Reed + JohnLawrence + ChrisReed 39–48 W17-5105 10.18653/v1/W17-5105 @@ -9314,16 +9314,16 @@ is able to handle phenomena related to scope by means of an higher-order type th Building an Argument Search Engine for the Web - Henning Wachsmuth - Martin Potthast - Khalid Al-Khatib - Yamen Ajjour - Jana Puschmann - Jiani Qu - Jonas Dorsch - Viorel Morari - Janek Bevendorff - Benno Stein + HenningWachsmuth + MartinPotthast + KhalidAl-Khatib + YamenAjjour + JanaPuschmann + JianiQu + JonasDorsch + ViorelMorari + JanekBevendorff + BennoStein 49–59 W17-5106 10.18653/v1/W17-5106 @@ -9331,8 +9331,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Argument Relation Classification Using a Joint Inference Model - Yufang Hou - Charles Jochim + YufangHou + CharlesJochim 60–66 W17-5107 10.18653/v1/W17-5107 @@ -9340,8 +9340,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Projection of Argumentative Corpora from Source to Target Languages - Ahmet Aker - Huangpan Zhang + AhmetAker + HuangpanZhang 67–72 W17-5108 10.18653/v1/W17-5108 @@ -9350,7 +9350,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Manual Identification of Arguments with Implicit Conclusions Using Semantic Rules for Argument Mining - Nancy Green + NancyGreen 73–78 W17-5109 10.18653/v1/W17-5109 @@ -9358,12 +9358,12 @@ is able to handle phenomena related to scope by means of an higher-order type th Unsupervised corpus–wide claim detection - Ran Levy - Shai Gretz - Benjamin Sznajder - Shay Hummel - Ranit Aharonov - Noam Slonim + RanLevy + ShaiGretz + BenjaminSznajder + ShayHummel + RanitAharonov + NoamSlonim 79–84 W17-5110 10.18653/v1/W17-5110 @@ -9372,7 +9372,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Using Question-Answering Techniques to 
Implement a Knowledge-Driven Argument Mining Approach - Patrick Saint-Dizier + PatrickSaint-Dizier 85–90 W17-5111 10.18653/v1/W17-5111 @@ -9380,13 +9380,13 @@ is able to handle phenomena related to scope by means of an higher-order type th What works and what does not: Classifier and feature analysis for argument mining - Ahmet Aker - Alfred Sliwa - Yuan Ma - Ruishen Lui - Niravkumar Borad - Seyedeh Ziyaei - Mina Ghobadi + AhmetAker + AlfredSliwa + YuanMa + RuishenLui + NiravkumarBorad + SeyedehZiyaei + MinaGhobadi 91–96 W17-5112 10.18653/v1/W17-5112 @@ -9394,9 +9394,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Unsupervised Detection of Argumentative Units though Topic Modeling Techniques - Alfio Ferrara - Stefano Montanelli - Georgios Petasis + AlfioFerrara + StefanoMontanelli + GeorgiosPetasis 97–107 W17-5113 10.18653/v1/W17-5113 @@ -9404,8 +9404,8 @@ is able to handle phenomena related to scope by means of an higher-order type th Using Complex Argumentative Interactions to Reconstruct the Argumentative Structure of Large-Scale Debates - John Lawrence - Chris Reed + JohnLawrence + ChrisReed 108–117 W17-5114 10.18653/v1/W17-5114 @@ -9413,11 +9413,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Unit Segmentation of Argumentative Texts - Yamen Ajjour - Wei-Fan Chen - Johannes Kiesel - Henning Wachsmuth - Benno Stein + YamenAjjour + Wei-FanChen + JohannesKiesel + HenningWachsmuth + BennoStein 118–128 W17-5115 10.18653/v1/W17-5115 @@ -9442,7 +9442,7 @@ is able to handle phenomena related to scope by means of an higher-order type th Detecting Sarcasm Using Different Forms Of Incongruity - Aditya Joshi + AdityaJoshi 1 W17-5201 10.18653/v1/W17-5201 @@ -9450,9 +9450,9 @@ is able to handle phenomena related to scope by means of an higher-order type th Assessing State-of-the-Art Sentiment Models on State-of-the-Art Sentiment Datasets - Jeremy Barnes - Roman Klinger - Sabine Schulte im Walde + JeremyBarnes + RomanKlinger + SabineSchulte im Walde 2–12 W17-5202 10.18653/v1/W17-5202 @@ -9467,11 +9467,11 @@ is able to handle phenomena related to scope by means of an higher-order type th Annotation, Modelling and Analysis of Fine-Grained Emotions on a Stance and Sentiment Detection Corpus - Hendrik Schuff - Jeremy Barnes - Julian Mohme - Sebastian Padó - Roman Klinger + HendrikSchuff + JeremyBarnes + JulianMohme + SebastianPadó + RomanKlinger 13–23 W17-5203 10.18653/v1/W17-5203 @@ -9482,10 +9482,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Ranking Right-Wing Extremist Social Media Profiles by Similarity to Democratic and Extremist Groups - Matthias Hartung - Roman Klinger - Franziska Schmidtke - Lars Vogel + MatthiasHartung + RomanKlinger + FranziskaSchmidtke + LarsVogel 24–33 W17-5204 10.18653/v1/W17-5204 @@ -9493,8 +9493,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>WASSA</fixed-case>-2017 Shared Task on Emotion Intensity - Saif Mohammad - Felipe Bravo-Marquez + SaifMohammad + FelipeBravo-Marquez 34–49 W17-5205 10.18653/v1/W17-5205 @@ -9502,9 +9502,9 @@ with emotion annotation. 
We (a) analyse annotation reliability and annotation me <fixed-case>IMS</fixed-case> at <fixed-case>E</fixed-case>mo<fixed-case>I</fixed-case>nt-2017: Emotion Intensity Prediction with Affective Norms, Automatically Extended Resources and Deep Learning - Maximilian Köper - Evgeny Kim - Roman Klinger + MaximilianKöper + EvgenyKim + RomanKlinger 50–57 W17-5206 10.18653/v1/W17-5206 @@ -9513,10 +9513,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Prayas at <fixed-case>E</fixed-case>mo<fixed-case>I</fixed-case>nt 2017: An Ensemble of Deep Neural Architectures for Emotion Intensity Prediction in Tweets - Pranav Goel - Devang Kulshreshtha - Prayas Jain - Kaushal Kumar Shukla + PranavGoel + DevangKulshreshtha + PrayasJain + Kaushal KumarShukla 58–65 W17-5207 10.18653/v1/W17-5207 @@ -9524,7 +9524,7 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Latest News in Computational Argumentation: Surfing on the Deep Learning Wave, Scuba Diving in the Abyss of Fundamental Questions - Iryna Gurevych + IrynaGurevych 66 W17-5208 10.18653/v1/W17-5208 @@ -9532,10 +9532,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Towards Syntactic <fixed-case>I</fixed-case>berian Polarity Classification - David Vilares - Marcos Garcia - Miguel A. Alonso - Carlos Gómez-Rodríguez + DavidVilares + MarcosGarcia + Miguel A.Alonso + CarlosGómez-Rodríguez 67–73 W17-5209 10.18653/v1/W17-5209 @@ -9543,8 +9543,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Toward Stance Classification Based on Claim Microstructures - Filip Boltužić - Jan Šnajder + FilipBoltužić + JanŠnajder 74–80 W17-5210 10.18653/v1/W17-5210 @@ -9552,10 +9552,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Linguistic Reflexes of Well-Being and Happiness in Echo - Jiaqi Wu - Marilyn Walker - Pranav Anand - Steve Whittaker + JiaqiWu + MarilynWalker + PranavAnand + SteveWhittaker 81–91 W17-5211 10.18653/v1/W17-5211 @@ -9563,8 +9563,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Forecasting Consumer Spending from Purchase Intentions Expressed on Social Media - Viktor Pekar - Jane Binner + ViktorPekar + JaneBinner 92–101 W17-5212 10.18653/v1/W17-5212 @@ -9572,9 +9572,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Mining fine-grained opinions on closed captions of <fixed-case>Y</fixed-case>ou<fixed-case>T</fixed-case>ube videos with an attention-<fixed-case>RNN</fixed-case> - Edison Marrese-Taylor - Jorge Balazs - Yutaka Matsuo + EdisonMarrese-Taylor + JorgeBalazs + YutakaMatsuo 102–111 W17-5213 10.18653/v1/W17-5213 @@ -9582,7 +9582,7 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Understanding human values and their emotional effect - Alexandra Balahur + AlexandraBalahur 112 W17-5214 10.18653/v1/W17-5214 @@ -9590,8 +9590,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Did you ever read about Frogs drinking Coffee? Investigating the Compositionality of Multi-Emoji Expressions - Rebeca Padilla López - Fabienne Cap + RebecaPadilla López + FabienneCap 113–117 W17-5215 10.18653/v1/W17-5215 @@ -9599,8 +9599,8 @@ with emotion annotation. 
We (a) analyse annotation reliability and annotation me Investigating Redundancy in Emoji Use: Study on a Twitter Based Corpus - Giulia Donato - Patrizia Paggio + GiuliaDonato + PatriziaPaggio 118–126 W17-5216 10.18653/v1/W17-5216 @@ -9608,9 +9608,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Modeling Temporal Progression of Emotional Status in Mental Health Forum: A Recurrent Neural Net Approach - Kishaloy Halder - Lahari Poddar - Min-Yen Kan + KishaloyHalder + LahariPoddar + Min-YenKan 127–135 W17-5217 10.18653/v1/W17-5217 @@ -9618,11 +9618,11 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Towards an integrated pipeline for aspect-based sentiment analysis in various domains - Orphée De Clercq - Els Lefever - Gilles Jacobs - Tijl Carpels - Véronique Hoste + OrphéeDe Clercq + ElsLefever + GillesJacobs + TijlCarpels + VéroniqueHoste 136–142 W17-5218 10.18653/v1/W17-5218 @@ -9630,9 +9630,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Building a <fixed-case>S</fixed-case>enti<fixed-case>W</fixed-case>ord<fixed-case>N</fixed-case>et for <fixed-case>O</fixed-case>dia - Gaurav Mohanty - Abishek Kannan - Radhika Mamidi + GauravMohanty + AbishekKannan + RadhikaMamidi 143–148 W17-5219 10.18653/v1/W17-5219 @@ -9640,9 +9640,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Lexicon Integrated <fixed-case>CNN</fixed-case> Models with Attention for Sentiment Analysis - Bonggun Shin - Timothy Lee - Jinho D. Choi + BonggunShin + TimothyLee + Jinho D.Choi 149–158 W17-5220 10.18653/v1/W17-5220 @@ -9650,10 +9650,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Explaining Recurrent Neural Network Predictions in Sentiment Analysis - Leila Arras - Grégoire Montavon - Klaus-Robert Müller - Wojciech Samek + LeilaArras + GrégoireMontavon + Klaus-RobertMüller + WojciechSamek 159–168 W17-5221 10.18653/v1/W17-5221 @@ -9661,9 +9661,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>G</fixed-case>rad<fixed-case>A</fixed-case>scent at <fixed-case>E</fixed-case>mo<fixed-case>I</fixed-case>nt-2017: Character and Word Level Recurrent Neural Network Models for Tweet Emotion Intensity Detection - Egor Lakomkin - Chandrakant Bothe - Stefan Wermter + EgorLakomkin + ChandrakantBothe + StefanWermter 169–174 W17-5222 10.18653/v1/W17-5222 @@ -9671,9 +9671,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>NUIG</fixed-case> at <fixed-case>E</fixed-case>mo<fixed-case>I</fixed-case>nt-2017: <fixed-case>B</fixed-case>i<fixed-case>LSTM</fixed-case> and <fixed-case>SVR</fixed-case> Ensemble to Detect Emotion Intensity - Vladimir Andryushechkin - Ian Wood - James O’ Neill + VladimirAndryushechkin + IanWood + JamesO’ Neill 175–179 W17-5223 10.18653/v1/W17-5223 @@ -9681,10 +9681,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Unsupervised Aspect Term Extraction with B-<fixed-case>LSTM</fixed-case> & <fixed-case>CRF</fixed-case> using Automatically Labelled Datasets - Athanasios Giannakopoulos - Claudiu Musat - Andreea Hossmann - Michael Baeriswyl + AthanasiosGiannakopoulos + ClaudiuMusat + AndreeaHossmann + MichaelBaeriswyl 180–188 W17-5224 10.18653/v1/W17-5224 @@ -9692,8 +9692,8 @@ with emotion annotation. 
We (a) analyse annotation reliability and annotation me <fixed-case>PLN</fixed-case>-<fixed-case>PUCRS</fixed-case> at <fixed-case>E</fixed-case>mo<fixed-case>I</fixed-case>nt-2017: Psycholinguistic features for emotion intensity prediction in tweets - Henrique Santos - Renata Vieira + HenriqueSantos + RenataVieira 189–192 W17-5225 10.18653/v1/W17-5225 @@ -9701,10 +9701,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Textmining at <fixed-case>E</fixed-case>mo<fixed-case>I</fixed-case>nt-2017: A Deep Learning Approach to Sentiment Intensity Scoring of <fixed-case>E</fixed-case>nglish Tweets - Hardik Meisheri - Rupsa Saha - Priyanka Sinha - Lipika Dey + HardikMeisheri + RupsaSaha + PriyankaSinha + LipikaDey 193–199 W17-5226 10.18653/v1/W17-5226 @@ -9712,10 +9712,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>YNU</fixed-case>-<fixed-case>HPCC</fixed-case> at <fixed-case>E</fixed-case>mo<fixed-case>I</fixed-case>nt-2017: Using a <fixed-case>CNN</fixed-case>-<fixed-case>LSTM</fixed-case> Model for Sentiment Intensity Prediction - You Zhang - Hang Yuan - Jin Wang - Xuejie Zhang + YouZhang + HangYuan + JinWang + XuejieZhang 200–204 W17-5227 10.18653/v1/W17-5227 @@ -9723,8 +9723,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Seernet at <fixed-case>E</fixed-case>mo<fixed-case>I</fixed-case>nt-2017: Tweet Emotion Intensity Estimator - Venkatesh Duppada - Sushant Hiray + VenkateshDuppada + SushantHiray 205–211 W17-5228 10.18653/v1/W17-5228 @@ -9733,11 +9733,11 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>IITP</fixed-case> at <fixed-case>E</fixed-case>mo<fixed-case>I</fixed-case>nt-2017: Measuring Intensity of Emotions using Sentence Embeddings and Optimized Features - Md Shad Akhtar - Palaash Sawant - Asif Ekbal - Jyoti Pawar - Pushpak Bhattacharyya + Md ShadAkhtar + PalaashSawant + AsifEkbal + JyotiPawar + PushpakBhattacharyya 212–218 W17-5229 10.18653/v1/W17-5229 @@ -9745,8 +9745,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>NSE</fixed-case>mo at <fixed-case>E</fixed-case>mo<fixed-case>I</fixed-case>nt-2017: An Ensemble to Predict Emotion Intensity in Tweets - Sreekanth Madisetty - Maunendra Sankar Desarkar + SreekanthMadisetty + Maunendra SankarDesarkar 219–224 W17-5230 10.18653/v1/W17-5230 @@ -9754,7 +9754,7 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>T</fixed-case>ecnolengua <fixed-case>L</fixed-case>ingmotif at <fixed-case>E</fixed-case>mo<fixed-case>I</fixed-case>nt-2017: A lexicon-based approach - Antonio Moreno-Ortiz + AntonioMoreno-Ortiz 225–232 W17-5231 10.18653/v1/W17-5231 @@ -9762,8 +9762,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>E</fixed-case>mo<fixed-case>A</fixed-case>tt at <fixed-case>E</fixed-case>mo<fixed-case>I</fixed-case>nt-2017: Inner attention sentence embedding for Emotion Intensity - Edison Marrese-Taylor - Yutaka Matsuo + EdisonMarrese-Taylor + YutakaMatsuo 233–237 W17-5232 10.18653/v1/W17-5232 @@ -9771,10 +9771,10 @@ with emotion annotation. 
We (a) analyse annotation reliability and annotation me <fixed-case>YZU</fixed-case>-<fixed-case>NLP</fixed-case> at <fixed-case>E</fixed-case>mo<fixed-case>I</fixed-case>nt-2017: Determining Emotion Intensity Using a Bi-directional <fixed-case>LSTM</fixed-case>-<fixed-case>CNN</fixed-case> Model - Yuanye He - Liang-Chih Yu - K. Robert Lai - Weiyi Liu + YuanyeHe + Liang-ChihYu + K. RobertLai + WeiyiLiu 238–242 W17-5233 10.18653/v1/W17-5233 @@ -9782,8 +9782,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>DMG</fixed-case>roup at <fixed-case>E</fixed-case>mo<fixed-case>I</fixed-case>nt-2017: Emotion Intensity Using Ensemble Method - Song Jiang - Xiaotian Han + SongJiang + XiaotianHan 243–248 W17-5234 10.18653/v1/W17-5234 @@ -9791,8 +9791,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>UW</fixed-case>at-Emote at <fixed-case>E</fixed-case>mo<fixed-case>I</fixed-case>nt-2017: Emotion Intensity Detection using Affect Clues, Sentiment Polarity and Word Embeddings - Vineet John - Olga Vechtomova + VineetJohn + OlgaVechtomova 249–254 W17-5235 10.18653/v1/W17-5235 @@ -9800,8 +9800,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>LIPN</fixed-case>-<fixed-case>UAM</fixed-case> at <fixed-case>E</fixed-case>mo<fixed-case>I</fixed-case>nt-2017:Combination of Lexicon-based features and Sentence-level Vector Representations for Emotion Intensity Determination - Davide Buscaldi - Belem Priego + DavideBuscaldi + BelemPriego 255–258 W17-5236 10.18653/v1/W17-5236 @@ -9809,11 +9809,11 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me deep<fixed-case>C</fixed-case>yb<fixed-case>E</fixed-case>r<fixed-case>N</fixed-case>et at <fixed-case>E</fixed-case>mo<fixed-case>I</fixed-case>nt-2017: Deep Emotion Intensities in Tweets - Vinayakumar R - Premjith B - Sachin Kumar S - Soman KP - Prabaharan Poornachandran + VinayakumarR + PremjithB + Sachin KumarS + SomanKP + PrabaharanPoornachandran 259–263 W17-5237 10.18653/v1/W17-5237 @@ -9842,10 +9842,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me The <fixed-case>R</fixed-case>ep<fixed-case>E</fixed-case>val 2017 Shared Task: Multi-Genre Natural Language Inference with Sentence Representations - Nikita Nangia - Adina Williams - Angeliki Lazaridou - Samuel Bowman + NikitaNangia + AdinaWilliams + AngelikiLazaridou + SamuelBowman 1–10 W17-5301 10.18653/v1/W17-5301 @@ -9853,11 +9853,11 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Traversal-Free Word Vector Evaluation in Analogy Space - Xiaoyin Che - Nico Ring - Willi Raschkowski - Haojin Yang - Christoph Meinel + XiaoyinChe + NicoRing + WilliRaschkowski + HaojinYang + ChristophMeinel 11–15 W17-5302 10.18653/v1/W17-5302 @@ -9865,7 +9865,7 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Hypothesis Testing based Intrinsic Evaluation of Word Embeddings - Nishant Gurnani + NishantGurnani 16–20 W17-5303 10.18653/v1/W17-5303 @@ -9873,9 +9873,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Evaluation of word embeddings against cognitive processes: primed reaction times in lexical decision and naming tasks - Jeremy Auguste - Arnaud Rey - Benoit Favre + JeremyAuguste + ArnaudRey + BenoitFavre 21–26 W17-5304 10.18653/v1/W17-5304 @@ -9883,8 +9883,8 @@ with emotion annotation. 
We (a) analyse annotation reliability and annotation me Playing with Embeddings : Evaluating embeddings for Robot Language Learning through <fixed-case>MUD</fixed-case> Games - Anmol Gulati - Kumar Krishna Agrawal + AnmolGulati + Kumar KrishnaAgrawal 27–30 W17-5305 10.18653/v1/W17-5305 @@ -9892,7 +9892,7 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Recognizing Textual Entailment in Twitter Using Word Embeddings - Octavia-Maria Şulea + Octavia-MariaŞulea 31–35 W17-5306 10.18653/v1/W17-5306 @@ -9900,12 +9900,12 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Recurrent Neural Network-Based Sentence Encoder with Gated Attention for Natural Language Inference - Qian Chen - Xiaodan Zhu - Zhen-Hua Ling - Si Wei - Hui Jiang - Diana Inkpen + QianChen + XiaodanZhu + Zhen-HuaLing + SiWei + HuiJiang + DianaInkpen 36–40 W17-5307 10.18653/v1/W17-5307 @@ -9913,8 +9913,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Shortcut-Stacked Sentence Encoders for Multi-Domain Inference - Yixin Nie - Mohit Bansal + YixinNie + MohitBansal 41–45 W17-5308 10.18653/v1/W17-5308 @@ -9922,9 +9922,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Character-level Intra Attention Network for Natural Language Inference - Han Yang - Marta R. Costa-jussà - José A. R. Fonollosa + HanYang + Marta R.Costa-jussà + José A. R.Fonollosa 46–50 W17-5309 10.18653/v1/W17-5309 @@ -9932,10 +9932,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Refining Raw Sentence Representations for Textual Entailment Recognition via Attention - Jorge Balazs - Edison Marrese-Taylor - Pablo Loyola - Yutaka Matsuo + JorgeBalazs + EdisonMarrese-Taylor + PabloLoyola + YutakaMatsuo 51–55 W17-5310 10.18653/v1/W17-5310 @@ -9974,10 +9974,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Towards Linguistically Generalizable <fixed-case>NLP</fixed-case> Systems: A Workshop and Shared Task - Allyson Ettinger - Sudha Rao - Hal Daumé III - Emily M. Bender + AllysonEttinger + SudhaRao + HalDaumé III + Emily M.Bender 1–10 W17-5401 10.18653/v1/W17-5401 @@ -9985,11 +9985,11 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Analysing Errors of Open Information Extraction Systems - Rudolf Schneider - Tom Oberhauser - Tobias Klatt - Felix A. Gers - Alexander Löser + RudolfSchneider + TomOberhauser + TobiasKlatt + Felix A.Gers + AlexanderLöser 11–18 W17-5402 10.18653/v1/W17-5402 @@ -9997,9 +9997,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Massively Multilingual Neural Grapheme-to-Phoneme Conversion - Ben Peters - Jon Dehdari - Josef van Genabith + BenPeters + JonDehdari + Josefvan Genabith 19–26 W17-5403 10.18653/v1/W17-5403 @@ -10007,9 +10007,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>BIBI</fixed-case> System Description: Building with <fixed-case>CNN</fixed-case>s and Breaking with Deep Reinforcement Learning - Yitong Li - Trevor Cohn - Timothy Baldwin + YitongLi + TrevorCohn + TimothyBaldwin 27–32 W17-5404 10.18653/v1/W17-5404 @@ -10017,14 +10017,14 @@ with emotion annotation. 
We (a) analyse annotation reliability and annotation me Breaking <fixed-case>NLP</fixed-case>: Using Morphosyntax, Semantics, Pragmatics and World Knowledge to Fool Sentiment Analysis Systems - Taylor Mahler - Willy Cheung - Micha Elsner - David King - Marie-Catherine de Marneffe - Cory Shain - Symon Stevens-Guille - Michael White + TaylorMahler + WillyCheung + MichaElsner + DavidKing + Marie-Catherinede Marneffe + CoryShain + SymonStevens-Guille + MichaelWhite 33–39 W17-5405 10.18653/v1/W17-5405 @@ -10033,9 +10033,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me An Adaptable Lexical Simplification Architecture for Major <fixed-case>I</fixed-case>bero-Romance Languages - Daniel Ferrés - Horacio Saggion - Xavier Gómez Guinovart + DanielFerrés + HoracioSaggion + XavierGómez Guinovart 40–47 W17-5406 10.18653/v1/W17-5406 @@ -10043,8 +10043,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Cross-genre Document Retrieval: Matching between Conversational and Formal Writings - Tomasz Jurczyk - Jinho D. Choi + TomaszJurczyk + Jinho D.Choi 48–53 W17-5407 10.18653/v1/W17-5407 @@ -10052,8 +10052,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>ACTSA</fixed-case>: Annotated Corpus for <fixed-case>T</fixed-case>elugu Sentiment Analysis - Sandeep Sricharan Mukku - Radhika Mamidi + Sandeep SricharanMukku + RadhikaMamidi 54–58 W17-5408 10.18653/v1/W17-5408 @@ -10061,7 +10061,7 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>S</fixed-case>trawman: An Ensemble of Deep Bag-of-Ngrams for Sentiment Analysis - Kyunghyun Cho + KyunghyunCho 59–60 W17-5409 10.18653/v1/W17-5409 @@ -10069,8 +10069,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Breaking Sentiment Analysis of Movie Reviews - Ieva Staliūnaitė - Ben Bonfil + IevaStaliūnaitė + BenBonfil 61–64 W17-5410 10.18653/v1/W17-5410 @@ -10096,8 +10096,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Automatic Mapping of <fixed-case>F</fixed-case>rench Discourse Connectives to <fixed-case>PDTB</fixed-case> Discourse Relations - Majid Laali - Leila Kosseim + MajidLaali + LeilaKosseim 1–6 W17-5501 10.18653/v1/W17-5501 @@ -10105,9 +10105,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Towards Full Text Shallow Discourse Relation Annotation: Experiments with Cross-Paragraph Implicit Relations in the <fixed-case>PDTB</fixed-case> - Rashmi Prasad - Katherine Forbes Riley - Alan Lee + RashmiPrasad + KatherineForbes Riley + AlanLee 7–16 W17-5502 10.18653/v1/W17-5502 @@ -10115,7 +10115,7 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me User-initiated Sub-dialogues in State-of-the-art Dialogue Systems - Staffan Larsson + StaffanLarsson 17–22 W17-5503 10.18653/v1/W17-5503 @@ -10123,11 +10123,11 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me A Multimodal Dialogue System for Medical Decision Support inside Virtual Reality - Alexander Prange - Margarita Chikobava - Peter Poller - Michael Barz - Daniel Sonntag + AlexanderPrange + MargaritaChikobava + PeterPoller + MichaelBarz + DanielSonntag 23–26 W17-5504 10.18653/v1/W17-5504 @@ -10135,10 +10135,10 @@ with emotion annotation. 
We (a) analyse annotation reliability and annotation me Generative Encoder-Decoder Models for Task-Oriented Spoken Dialog Systems with Chatting Capability - Tiancheng Zhao - Allen Lu - Kyusong Lee - Maxine Eskenazi + TianchengZhao + AllenLu + KyusongLee + MaxineEskenazi 27–36 W17-5505 10.18653/v1/W17-5505 @@ -10146,10 +10146,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Key-Value Retrieval Networks for Task-Oriented Dialogue - Mihail Eric - Lakshmi Krishnan - Francois Charette - Christopher D. Manning + MihailEric + LakshmiKrishnan + FrancoisCharette + Christopher D.Manning 37–49 W17-5506 10.18653/v1/W17-5506 @@ -10157,11 +10157,11 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Lexical Acquisition through Implicit Confirmations over Multiple Dialogues - Kohei Ono - Ryu Takeda - Eric Nichols - Mikio Nakano - Kazunori Komatani + KoheiOno + RyuTakeda + EricNichols + MikioNakano + KazunoriKomatani 50–59 W17-5507 10.18653/v1/W17-5507 @@ -10169,12 +10169,12 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Utterance Intent Classification of a Spoken Dialogue System with Efficiently Untied Recursive Autoencoders - Tsuneo Kato - Atsushi Nagai - Naoki Noda - Ryosuke Sumitomo - Jianming Wu - Seiichi Yamamoto + TsuneoKato + AtsushiNagai + NaokiNoda + RyosukeSumitomo + JianmingWu + SeiichiYamamoto 60–64 W17-5508 10.18653/v1/W17-5508 @@ -10182,15 +10182,15 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Reward-Balancing for Statistical Spoken Dialogue Systems using Multi-objective Reinforcement Learning - Stefan Ultes - Paweł Budzianowski - Iñigo Casanueva - Nikola Mrkšić - Lina M. Rojas-Barahona - Pei-Hao Su - Tsung-Hsien Wen - Milica Gašić - Steve Young + StefanUltes + PawełBudzianowski + IñigoCasanueva + NikolaMrkšić + Lina M.Rojas-Barahona + Pei-HaoSu + Tsung-HsienWen + MilicaGašić + SteveYoung 65–70 W17-5509 10.18653/v1/W17-5509 @@ -10198,9 +10198,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Automatic Measures to Characterise Verbal Alignment in Human-Agent Interaction - Guillaume Dubuisson Duplessis - Chloé Clavel - Frédéric Landragin + GuillaumeDubuisson Duplessis + ChloéClavel + FrédéricLandragin 71–81 W17-5510 10.18653/v1/W17-5510 @@ -10208,8 +10208,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Demonstration of interactive teaching for end-to-end dialog control with hybrid code networks - Jason D. Williams - Lars Liden + Jason D.Williams + LarsLiden 82–85 W17-5511 10.18653/v1/W17-5511 @@ -10217,14 +10217,14 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Sub-domain Modelling for Dialogue Management with Hierarchical Reinforcement Learning - Paweł Budzianowski - Stefan Ultes - Pei-Hao Su - Nikola Mrkšić - Tsung-Hsien Wen - Iñigo Casanueva - Lina M. Rojas-Barahona - Milica Gašić + PawełBudzianowski + StefanUltes + Pei-HaoSu + NikolaMrkšić + Tsung-HsienWen + IñigoCasanueva + Lina M.Rojas-Barahona + MilicaGašić 86–92 W17-5512 10.18653/v1/W17-5512 @@ -10232,9 +10232,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>MACA</fixed-case>: A Modular Architecture for Conversational Agents - Hoai Phuoc Truong - Prasanna Parthasarathi - Joelle Pineau + Hoai PhuocTruong + PrasannaParthasarathi + JoellePineau 93–102 W17-5513 10.18653/v1/W17-5513 @@ -10242,10 +10242,10 @@ with emotion annotation. 
We (a) analyse annotation reliability and annotation me Sequential Dialogue Context Modeling for Spoken Language Understanding - Ankur Bapna - Gokhan Tür - Dilek Hakkani-Tür - Larry Heck + AnkurBapna + GokhanTür + DilekHakkani-Tür + LarryHeck 103–114 W17-5514 10.18653/v1/W17-5514 @@ -10253,10 +10253,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Redundancy Localization for the Conversationalization of Unstructured Responses - Sebastian Krause - Mikhail Kozhevnikov - Eric Malmi - Daniele Pighin + SebastianKrause + MikhailKozhevnikov + EricMalmi + DanielePighin 115–126 W17-5515 10.18653/v1/W17-5515 @@ -10264,12 +10264,12 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Attentive listening system with backchanneling, response generation and flexible turn-taking - Divesh Lala - Pierrick Milhorat - Koji Inoue - Masanari Ishida - Katsuya Takanashi - Tatsuya Kawahara + DiveshLala + PierrickMilhorat + KojiInoue + MasanariIshida + KatsuyaTakanashi + TatsuyaKawahara 127–136 W17-5516 10.18653/v1/W17-5516 @@ -10277,8 +10277,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Natural Language Input for In-Car Spoken Dialog Systems: How Natural is Natural? - Patricia Braunger - Wolfgang Maier + PatriciaBraunger + WolfgangMaier 137–146 W17-5517 10.18653/v1/W17-5517 @@ -10286,11 +10286,11 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Sample-efficient Actor-Critic Reinforcement Learning with Supervised Data for Dialogue Management - Pei-Hao Su - Paweł Budzianowski - Stefan Ultes - Milica Gašić - Steve Young + Pei-HaoSu + PawełBudzianowski + StefanUltes + MilicaGašić + SteveYoung 147–157 W17-5518 10.18653/v1/W17-5518 @@ -10298,8 +10298,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me A surprisingly effective out-of-the-box char2char model on the <fixed-case>E</fixed-case>2<fixed-case>E</fixed-case> <fixed-case>NLG</fixed-case> Challenge dataset - Shubham Agarwal - Marc Dymetman + ShubhamAgarwal + MarcDymetman 158–163 W17-5519 10.18653/v1/W17-5519 @@ -10307,9 +10307,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Interaction Quality Estimation Using Long Short-Term Memories - Niklas Rach - Wolfgang Minker - Stefan Ultes + NiklasRach + WolfgangMinker + StefanUltes 164–169 W17-5520 10.18653/v1/W17-5520 @@ -10317,18 +10317,18 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>D</fixed-case>ial<fixed-case>P</fixed-case>ort, Gone Live: An Update After A Year of Development - Kyusong Lee - Tiancheng Zhao - Yulun Du - Edward Cai - Allen Lu - Eli Pincus - David Traum - Stefan Ultes - Lina M. Rojas-Barahona - Milica Gasic - Steve Young - Maxine Eskenazi + KyusongLee + TianchengZhao + YulunDu + EdwardCai + AllenLu + EliPincus + DavidTraum + StefanUltes + Lina M.Rojas-Barahona + MilicaGasic + SteveYoung + MaxineEskenazi 170–173 W17-5521 10.18653/v1/W17-5521 @@ -10336,10 +10336,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Evaluating Natural Language Understanding Services for Conversational Question Answering Systems - Daniel Braun - Adrian Hernandez Mendez - Florian Matthes - Manfred Langen + DanielBraun + AdrianHernandez Mendez + FlorianMatthes + ManfredLangen 174–185 W17-5522 10.18653/v1/W17-5522 @@ -10347,9 +10347,9 @@ with emotion annotation. 
We (a) analyse annotation reliability and annotation me The Role of Conversation Context for Sarcasm Detection in Online Interactions - Debanjan Ghosh - Alexander Richard Fabbri - Smaranda Muresan + DebanjanGhosh + AlexanderRichard Fabbri + SmarandaMuresan 186–196 W17-5523 10.18653/v1/W17-5523 @@ -10357,9 +10357,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>VOILA</fixed-case>: An Optimised Dialogue System for Interactively Learning Visually-Grounded Word Meanings (Demonstration System) - Yanchao Yu - Arash Eshghi - Oliver Lemon + YanchaoYu + ArashEshghi + OliverLemon 197–200 W17-5524 10.18653/v1/W17-5524 @@ -10367,9 +10367,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me The <fixed-case>E</fixed-case>2<fixed-case>E</fixed-case> Dataset: New Challenges For End-to-End Generation - Jekaterina Novikova - Ondřej Dušek - Verena Rieser + JekaterinaNovikova + OndřejDušek + VerenaRieser 201–206 W17-5525 10.18653/v1/W17-5525 @@ -10377,14 +10377,14 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>F</fixed-case>rames: a corpus for adding memory to goal-oriented dialogue systems - Layla El Asri - Hannes Schulz - Shikhar Sharma - Jeremie Zumer - Justin Harris - Emery Fine - Rahul Mehrotra - Kaheer Suleman + LaylaEl Asri + HannesSchulz + ShikharSharma + JeremieZumer + JustinHarris + EmeryFine + RahulMehrotra + KaheerSuleman 207–219 W17-5526 10.18653/v1/W17-5526 @@ -10393,7 +10393,7 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Towards a General, Continuous Model of Turn-taking in Spoken Dialogue using <fixed-case>LSTM</fixed-case> Recurrent Neural Networks - Gabriel Skantze + GabrielSkantze 220–230 W17-5527 10.18653/v1/W17-5527 @@ -10401,9 +10401,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Neural-based Natural Language Generation in Dialogue using <fixed-case>RNN</fixed-case> Encoder-Decoder with Semantic Aggregation - Van-Khanh Tran - Le-Minh Nguyen - Satoshi Tojo + Van-KhanhTran + Le-MinhNguyen + SatoshiTojo 231–240 W17-5528 10.18653/v1/W17-5528 @@ -10411,9 +10411,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Beyond On-hold Messages: Conversational Time-buying in Task-oriented Dialogue - Soledad López Gambino - Sina Zarrieß - David Schlangen + SoledadLópez Gambino + SinaZarrieß + DavidSchlangen 241–246 W17-5529 10.18653/v1/W17-5529 @@ -10421,8 +10421,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Neural-based Context Representation Learning for Dialog Act Classification - Daniel Ortega - Ngoc Thang Vu + DanielOrtega + Ngoc ThangVu 247–252 W17-5530 10.18653/v1/W17-5530 @@ -10430,9 +10430,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Predicting Success in Goal-Driven Human-Human Dialogues - Michael Noseworthy - Jackie Chi Kit Cheung - Joelle Pineau + MichaelNoseworthy + Jackie Chi KitCheung + JoellePineau 253–262 W17-5531 10.18653/v1/W17-5531 @@ -10440,10 +10440,10 @@ with emotion annotation. 
We (a) analyse annotation reliability and annotation me Generating and Evaluating Summaries for Partial Email Threads: Conversational <fixed-case>B</fixed-case>ayesian Surprise and Silver Standards - Jordon Johnson - Vaden Masrani - Giuseppe Carenini - Raymond Ng + JordonJohnson + VadenMasrani + GiuseppeCarenini + RaymondNg 263–272 W17-5532 10.18653/v1/W17-5532 @@ -10451,8 +10451,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Enabling robust and fluid spoken dialogue with cognitively impaired users - Ramin Yaghoubzadeh - Stefan Kopp + RaminYaghoubzadeh + StefanKopp 273–283 W17-5533 10.18653/v1/W17-5533 @@ -10460,8 +10460,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Adversarial evaluation for open-domain dialogue generation - Elia Bruni - Raquel Fernández + EliaBruni + RaquelFernández 284–288 W17-5534 10.18653/v1/W17-5534 @@ -10469,9 +10469,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Exploring Joint Neural Model for Sentence Level Discourse Parsing and Sentiment Analysis - Bita Nejat - Giuseppe Carenini - Raymond Ng + BitaNejat + GiuseppeCarenini + RaymondNg 289–298 W17-5535 10.18653/v1/W17-5535 @@ -10479,9 +10479,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Predicting Causes of Reformulation in Intelligent Assistants - Shumpei Sano - Nobuhiro Kaji - Manabu Sassano + ShumpeiSano + NobuhiroKaji + ManabuSassano 299–309 W17-5536 10.18653/v1/W17-5536 @@ -10489,11 +10489,11 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Are you serious?: Rhetorical Questions and Sarcasm in Social Media Dialog - Shereen Oraby - Vrindavan Harrison - Amita Misra - Ellen Riloff - Marilyn Walker + ShereenOraby + VrindavanHarrison + AmitaMisra + EllenRiloff + MarilynWalker 310–319 W17-5537 10.18653/v1/W17-5537 @@ -10502,10 +10502,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Finding Structure in Figurative Language: Metaphor Detection with Topic-based Frames - Hyeju Jang - Keith Maki - Eduard Hovy - Carolyn Rosé + HyejuJang + KeithMaki + EduardHovy + CarolynRosé 320–330 W17-5538 10.18653/v1/W17-5538 @@ -10513,9 +10513,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Using Reinforcement Learning to Model Incrementality in a Fast-Paced Dialogue Game - Ramesh Manuvinakurike - David DeVault - Kallirroi Georgila + RameshManuvinakurike + DavidDeVault + KallirroiGeorgila 331–341 W17-5539 10.18653/v1/W17-5539 @@ -10523,8 +10523,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Inferring Narrative Causality between Event Pairs in Films - Zhichao Hu - Marilyn Walker + ZhichaoHu + MarilynWalker 342–351 W17-5540 10.18653/v1/W17-5540 @@ -10533,8 +10533,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Lessons in Dialogue System Deployment - Anton Leuski - Ron Artstein + AntonLeuski + RonArtstein 352–355 W17-5541 10.18653/v1/W17-5541 @@ -10542,9 +10542,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Information Navigation System with Discovering User Interests - Koichiro Yoshino - Yu Suzuki - Satoshi Nakamura + KoichiroYoshino + YuSuzuki + SatoshiNakamura 356–359 W17-5542 10.18653/v1/W17-5542 @@ -10552,11 +10552,11 @@ with emotion annotation. 
We (a) analyse annotation reliability and annotation me Modelling Protagonist Goals and Desires in First-Person Narrative - Elahe Rahimtoroghi - Jiaqi Wu - Ruimin Wang - Pranav Anand - Marilyn Walker + ElaheRahimtoroghi + JiaqiWu + RuiminWang + PranavAnand + MarilynWalker 360–369 W17-5543 10.18653/v1/W17-5543 @@ -10565,14 +10565,14 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>SHIH</fixed-case>bot: A <fixed-case>F</fixed-case>acebook chatbot for Sexual Health Information on <fixed-case>HIV</fixed-case>/<fixed-case>AIDS</fixed-case> - Jacqueline Brixey - Rens Hoegen - Wei Lan - Joshua Rusow - Karan Singla - Xusen Yin - Ron Artstein - Anton Leuski + JacquelineBrixey + RensHoegen + WeiLan + JoshuaRusow + KaranSingla + XusenYin + RonArtstein + AntonLeuski 370–373 W17-5544 10.18653/v1/W17-5544 @@ -10580,12 +10580,12 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me How Would You Say It? Eliciting Lexically Diverse Dialogue for Supervised Semantic Parsing - Abhilasha Ravichander - Thomas Manzini - Matthias Grabmair - Graham Neubig - Jonathan Francis - Eric Nyberg + AbhilashaRavichander + ThomasManzini + MatthiasGrabmair + GrahamNeubig + JonathanFrancis + EricNyberg 374–383 W17-5545 10.18653/v1/W17-5545 @@ -10593,8 +10593,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Not All Dialogues are Created Equal: Instance Weighting for Neural Conversational Models - Pierre Lison - Serge Bibauw + PierreLison + SergeBibauw 384–394 W17-5546 10.18653/v1/W17-5546 @@ -10602,7 +10602,7 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me A data-driven model of explanations for a chatbot that helps to practice conversation in a foreign language - Sviatlana Höhn + SviatlanaHöhn 395–405 W17-5547 10.18653/v1/W17-5547 @@ -10625,29 +10625,29 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Building a Better Bitext for Structurally Different Languages through Self-training - Jungyeul Park - Loïc Dugast - Jeen-Pyo Hong - Chang-Uk Shin - Jeong-Won Cha + JungyeulPark + LoïcDugast + Jeen-PyoHong + Chang-UkShin + Jeong-WonCha 1–10 W17-5601 We propose a novel method to bootstrap the construction of parallel corpora for new pairs of structurally different languages. We do so by combining the use of a pivot language and self-training. A pivot language enables the use of existing translation models to bootstrap the alignment and a self-training procedure enables to achieve better alignment, both at the document and sentence level. We also propose several evaluation methods for the resulting alignment. <fixed-case>M</fixed-case>ulti<fixed-case>N</fixed-case>ews: A Web collection of an Aligned Multimodal and Multilingual Corpus - Haithem Afli - Pintu Lohar - Andy Way + HaithemAfli + PintuLohar + AndyWay 11–15 W17-5602 Integrating Natural Language Processing (NLP) and computer vision is a promising effort. However, the applicability of these methods directly depends on the availability of a specific multimodal data that includes images and texts. In this paper, we present a collection of a Multimodal corpus of comparable texts and their images in 9 languages from the web news articles of Euronews website. This corpus has found widespread use in the NLP community in Multilingual and multimodal tasks. Here, we focus on its acquisition of the images and text data and their multilingual alignment. 
Learning Phrase Embeddings from Paraphrases with <fixed-case>GRU</fixed-case>s - Zhihao Zhou - Lifu Huang - Heng Ji + ZhihaoZhou + LifuHuang + HengJi 16–23 W17-5603 Learning phrase representations has been widely explored in many Natural Language Processing tasks (e.g., Sentiment Analysis, Machine Translation) and has shown promising improvements. Previous studies either learn non-compositional phrase representations with general word embedding learning techniques or learn compositional phrase representations based on syntactic structures, which either require huge amounts of human annotations or cannot be easily generalized to all phrases. In this work, we propose to take advantage of large-scaled paraphrase database and present a pairwise-GRU framework to generate compositional phrase representations. Our framework can be re-used to generate representations for any phrases. Experimental results show that our framework achieves state-of-the-art results on several phrase similarity tasks. @@ -10669,24 +10669,24 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Overview of the 4th Workshop on <fixed-case>A</fixed-case>sian Translation - Toshiaki Nakazawa - Shohei Higashiyama - Chenchen Ding - Hideya Mino - Isao Goto - Hideto Kazawa - Yusuke Oda - Graham Neubig - Sadao Kurohashi + ToshiakiNakazawa + ShoheiHigashiyama + ChenchenDing + HideyaMino + IsaoGoto + HidetoKazawa + YusukeOda + GrahamNeubig + SadaoKurohashi 1–54 W17-5701 This paper presents the results of the shared tasks from the 4th workshop on Asian translation (WAT2017) including J↔E, J↔C scientific paper translation subtasks, C↔J, K↔J, E↔J patent translation subtasks, H↔E mixed domain subtasks, J↔E newswire subtasks and J↔E recipe subtasks. For the WAT2017, 12 institutions participated in the shared tasks. About 300 translation results have been submitted to the automatic evaluation server, and selected submissions were manually evaluated. Controlling Target Features in Neural Machine Translation via Prefix Constraints - Shunsuke Takeno - Masaaki Nagata - Kazuhide Yamamoto + ShunsukeTakeno + MasaakiNagata + KazuhideYamamoto 55–63 W17-5702 We propose prefix constraints, a novel method to enforce constraints on @@ -10705,141 +10705,141 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Improving <fixed-case>J</fixed-case>apanese-to-<fixed-case>E</fixed-case>nglish Neural Machine Translation by Paraphrasing the Target Language - Yuuki Sekizawa - Tomoyuki Kajiwara - Mamoru Komachi + YuukiSekizawa + TomoyukiKajiwara + MamoruKomachi 64–69 W17-5703 Neural machine translation (NMT) produces sentences that are more fluent than those produced by statistical machine translation (SMT). However, NMT has a very high computational cost because of the high dimensionality of the output layer. Generally, NMT restricts the size of vocabulary, which results in infrequent words being treated as out-of-vocabulary (OOV) and degrades the performance of the translation. In evaluation, we achieved a statistically significant BLEU score improvement of 0.55-0.77 over the baselines including the state-of-the-art method. Improving Low-Resource Neural Machine Translation with Filtered Pseudo-Parallel Corpus - Aizhan Imankulova - Takayuki Sato - Mamoru Komachi + AizhanImankulova + TakayukiSato + MamoruKomachi 70–78 W17-5704 Large-scale parallel corpora are indispensable to train highly accurate machine translators. 
However, manually constructed large-scale parallel corpora are not freely available in many language pairs. In previous studies, training data have been expanded using a pseudo-parallel corpus obtained using machine translation of the monolingual corpus in the target language. However, in low-resource language pairs in which only low-accuracy machine translation systems can be used, translation quality is reduced when a pseudo-parallel corpus is used naively. To improve machine translation performance with low-resource language pairs, we propose a method to expand the training data effectively via filtering the pseudo-parallel corpus using a quality estimation based on back-translation. As a result of experiments with three language pairs using small, medium, and large parallel corpora, language pairs with fewer training data filtered out more sentence pairs and improved BLEU scores more significantly.

<fixed-case>J</fixed-case>apanese to <fixed-case>E</fixed-case>nglish/<fixed-case>C</fixed-case>hinese/<fixed-case>K</fixed-case>orean Datasets for Translation Quality Estimation and Automatic Post-Editing
- Atsushi Fujita
- Eiichiro Sumita
+ AtsushiFujita
+ EiichiroSumita
79–88
W17-5705
Aiming at facilitating the research on quality estimation (QE) and automatic post-editing (APE) of machine translation (MT) outputs, especially for those among Asian languages, we have created new datasets for Japanese to English, Chinese, and Korean translations. As the source text, actual utterances in Japanese were extracted from the log data of our speech translation service. MT outputs were then given by phrase-based statistical MT systems. Finally, human evaluators were employed to grade the quality of MT outputs and to post-edit them. This paper describes the characteristics of the created datasets and reports on our benchmarking experiments on word-level QE, sentence-level QE, and APE conducted using the created datasets.

<fixed-case>NTT</fixed-case> Neural Machine Translation Systems at <fixed-case>WAT</fixed-case> 2017
- Makoto Morishita
- Jun Suzuki
- Masaaki Nagata
+ MakotoMorishita
+ JunSuzuki
+ MasaakiNagata
89–94
W17-5706
In this year, we participated in four translation subtasks at WAT 2017. Our model structure is quite simple but we used it with well-tuned hyper-parameters, leading to a significant improvement compared to the previous state-of-the-art system. We also tried to make use of the unreliable part of the provided parallel corpus by back-translating and making a synthetic corpus. Our submitted system achieved the new state-of-the-art performance in terms of the BLEU score, as well as human evaluation.

<fixed-case>XMU</fixed-case> Neural Machine Translation Systems for <fixed-case>WAT</fixed-case> 2017
- Boli Wang
- Zhixing Tan
- Jinming Hu
- Yidong Chen
- Xiaodong Shi
+ BoliWang
+ ZhixingTan
+ JinmingHu
+ YidongChen
+ XiaodongShi
95–98
W17-5707
This paper describes the Neural Machine Translation systems of Xiamen University for the shared translation tasks of WAT 2017. Our systems are based on the Encoder-Decoder framework with attention. We participated in three subtasks. We experimented subword segmentation, synthetic training data and model ensembling. Experiments show that all these methods can give substantial improvements.
A Bag of Useful Tricks for Practical Neural Machine Translation: Embedding Layer Initialization and Large Batch Size - Masato Neishi - Jin Sakuma - Satoshi Tohda - Shonosuke Ishiwatari - Naoki Yoshinaga - Masashi Toyoda + MasatoNeishi + JinSakuma + SatoshiTohda + ShonosukeIshiwatari + NaokiYoshinaga + MasashiToyoda 99–109 W17-5708 In this paper, we describe the team UT-IIS’s system and results for the WAT 2017 translation tasks. We further investigated several tricks including a novel technique for initializing embedding layers using only the parallel corpus, which increased the BLEU score by 1.28, found a practical large batch size of 256, and gained insights regarding hyperparameter settings. Ultimately, our system obtained a better result than the state-of-the-art system of WAT 2016. Our code is available on https://github.com/nem6ishi/wat17. Patent <fixed-case>NMT</fixed-case> integrated with Large Vocabulary Phrase Translation by <fixed-case>SMT</fixed-case> at <fixed-case>WAT</fixed-case> 2017 - Zi Long - Ryuichiro Kimura - Takehito Utsuro - Tomoharu Mitsuhashi - Mikio Yamamoto + ZiLong + RyuichiroKimura + TakehitoUtsuro + TomoharuMitsuhashi + MikioYamamoto 110–118 W17-5709 Neural machine translation (NMT) cannot handle a larger vocabulary because the training complexity and decoding complexity proportionally increase with the number of target words. This problem becomes even more serious when translating patent documents, which contain many technical terms that are observed infrequently. Long et al.(2017) proposed to select phrases that contain out-of-vocabulary words using the statistical approach of branching entropy. The selected phrases are then replaced with tokens during training and post-translated by the phrase translation table of SMT. In this paper, we apply the method proposed by Long et al. (2017) to the WAT 2017 Japanese-Chinese and Japanese-English patent datasets. Evaluation on Japanese-to-Chinese, Chinese-to-Japanese, Japanese-to-English and English-to-Japanese patent sentence translation proved the effectiveness of phrases selected with branching entropy, where the NMT model of Long et al.(2017) achieves a substantial improvement over a baseline NMT model without the technique proposed by Long et al.(2017). SMT reranked NMT - Terumasa Ehara + TerumasaEhara 119–126 W17-5710 System architecture, experimental settings and experimental results of the EHR team for the WAT2017 tasks are described. We participate in three tasks: JPCen-ja, JPCzh-ja and JPCko-ja. Although the basic architecture of our system is NMT, reranking technique is conducted using SMT results. One of the major drawback of NMT is under-translation and over-translation. On the other hand, SMT infrequently makes such translations. So, using reranking of n-best NMT outputs by the SMT output, discarding such translations can be expected. We can improve BLEU score from 46.03 to 47.08 by this technique in JPCzh-ja task. Ensemble and Reranking: Using Multiple Models in the <fixed-case>NICT</fixed-case>-2 Neural Machine Translation System at <fixed-case>WAT</fixed-case>2017 - Kenji Imamura - Eiichiro Sumita + KenjiImamura + EiichiroSumita 127–134 W17-5711 In this paper, we describe the NICT-2 neural machine translation system evaluated at WAT2017. This system uses multiple models as an ensemble and combines models with opposite decoding directions by reranking (called bi-directional reranking). 
In our experimental results on small data sets, the translation quality improved when the number of models was increased to 32 in total and did not saturate. In the experiments on large data sets, improvements of 1.59-3.32 BLEU points were achieved when six-model ensembles were combined by the bi-directional reranking.

A Simple and Strong Baseline: <fixed-case>NAIST</fixed-case>-<fixed-case>NICT</fixed-case> Neural Machine Translation System for <fixed-case>WAT</fixed-case>2017 <fixed-case>E</fixed-case>nglish-<fixed-case>J</fixed-case>apanese Translation Task
- Yusuke Oda
- Katsuhito Sudoh
- Satoshi Nakamura
- Masao Utiyama
- Eiichiro Sumita
+ YusukeOda
+ KatsuhitoSudoh
+ SatoshiNakamura
+ MasaoUtiyama
+ EiichiroSumita
135–139
W17-5712
This paper describes the details about the NAIST-NICT machine translation system for WAT2017 English-Japanese Scientific Paper Translation Task. The system consists of a language-independent tokenizer and an attentional encoder-decoder style neural machine translation model. According to the official results, our system achieves higher translation accuracy than any system submitted in previous campaigns, despite its simple model architecture.

Comparison of <fixed-case>SMT</fixed-case> and <fixed-case>NMT</fixed-case> trained with large Patent Corpora: <fixed-case>J</fixed-case>apio at <fixed-case>WAT</fixed-case>2017
- Satoshi Kinoshita
- Tadaaki Oshio
- Tomoharu Mitsuhashi
+ SatoshiKinoshita
+ TadaakiOshio
+ TomoharuMitsuhashi
140–145
W17-5713
Japio participates in patent subtasks (JPC-EJ/JE/CJ/KJ) with phrase-based statistical machine translation (SMT) and neural machine translation (NMT) systems which are trained with its own patent corpora in addition to the subtask corpora provided by organizers of WAT2017. In EJ and CJ subtasks, SMT and NMT systems whose sizes of training corpora are about 50 million and 10 million sentence pairs respectively achieved comparable scores for automatic evaluations, but NMT systems were superior to SMT systems for both official and in-house human evaluations.

<fixed-case>K</fixed-case>yoto University Participation to <fixed-case>WAT</fixed-case> 2017
- Fabien Cromieres
- Raj Dabre
- Toshiaki Nakazawa
- Sadao Kurohashi
+ FabienCromieres
+ RajDabre
+ ToshiakiNakazawa
+ SadaoKurohashi
146–153
W17-5714
We describe here our approaches and results on the WAT 2017 shared translation tasks. Following our good results with Neural Machine Translation in the previous shared task, we continue this approach this year, with incremental improvements in models and training methods. We focused on the ASPEC dataset and could improve the state-of-the-art results for Chinese-to-Japanese and Japanese-to-Chinese translations.

<fixed-case>CUNI</fixed-case> <fixed-case>NMT</fixed-case> System for <fixed-case>WAT</fixed-case> 2017 Translation Tasks
- Tom Kocmi
- Dušan Variš
- Ondřej Bojar
+ TomKocmi
+ DušanVariš
+ OndřejBojar
154–159
W17-5715
The paper presents this year’s CUNI submissions to the WAT 2017 Translation Task focusing on the Japanese-English translation, namely Scientific papers subtask, Patents subtask and Newswire subtask. We compare two neural network architectures, the standard sequence-to-sequence with attention (Seq2Seq) and an architecture using convolutional sentence encoder (FBConv2Seq), both implemented in the NMT framework Neural Monkey that we currently participate in developing. We also compare various types of preprocessing of the source Japanese sentences and their impact on the overall results.
Furthermore, we include the results of our experiments with out-of-domain data obtained by combining the corpora provided for each subtask.

<fixed-case>T</fixed-case>okyo Metropolitan University Neural Machine Translation System for <fixed-case>WAT</fixed-case> 2017
- Yukio Matsumura
- Mamoru Komachi
+ YukioMatsumura
+ MamoruKomachi
160–166
W17-5716
In this paper, we describe our neural machine translation (NMT) system, which is based on the attention-based NMT and uses long short-term memories (LSTM) as RNN. We implemented beam search and ensemble decoding in the NMT system. The system was tested on the 4th Workshop on Asian Translation (WAT 2017) shared tasks. In our experiments, we participated in the scientific paper subtasks and attempted Japanese-English, English-Japanese, and Japanese-Chinese translation tasks. The experimental results showed that implementation of beam search and ensemble decoding can effectively improve the translation quality.

Comparing Recurrent and Convolutional Architectures for <fixed-case>E</fixed-case>nglish-<fixed-case>H</fixed-case>indi Neural Machine Translation
- Sandhya Singh
- Ritesh Panjwani
- Anoop Kunchukuttan
- Pushpak Bhattacharyya
+ SandhyaSingh
+ RiteshPanjwani
+ AnoopKunchukuttan
+ PushpakBhattacharyya
167–170
W17-5717
In this paper, we empirically compare the two encoder-decoder neural machine translation architectures: convolutional sequence to sequence model (ConvS2S) and recurrent sequence to sequence model (RNNS2S) for English-Hindi language pair as part of IIT Bombay’s submission to WAT2017 shared task. We report the results for both English-Hindi and Hindi-English direction of language pair.

@@ -10862,100 +10862,100 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me

Automatic detection of stance towards vaccination in online discussion forums
- Maria Skeppstedt
- Andreas Kerren
- Manfred Stede
+ MariaSkeppstedt
+ AndreasKerren
+ ManfredStede
1–8
W17-5801
A classifier for automatic detection of stance towards vaccination in online forums was trained and evaluated. Debate posts from six discussion threads on the British parental website Mumsnet were manually annotated for stance ‘against’ or ‘for’ vaccination, or as ‘undecided’. A support vector machine, trained to detect the three classes, achieved a macro F-score of 0.44, while a macro F-score of 0.62 was obtained by the same type of classifier on the binary classification task of distinguishing stance ‘against’ vaccination from stance ‘for’ vaccination. These results show that vaccine stance detection in online forums is a difficult task, at least for the type of model investigated and for the relatively small training corpus that was used. Future work will therefore include an expansion of the training data and an evaluation of other types of classifiers and features.

Analysing the Causes of Depressed Mood from Depression Vulnerable Individuals
- Noor Fazilla Abd Yusof
- Chenghua Lin
- Frank Guerin
+ Noor FazillaAbd Yusof
+ ChenghuaLin
+ FrankGuerin
9–17
W17-5802
We develop a computational model to discover the potential causes of depression by analysing the topics in a user-generated text. We show the most prominent causes, and how these causes evolve over time. Also, we highlight the differences in causes between students with low and high neuroticism. Our studies demonstrate that the topics reveal valuable clues about the causes contributing to depressed mood.
Identifying causes can have a significant impact on improving the quality of depression care; thereby providing greater insights into a patient’s state for pertinent treatment recommendations. Hence, this study significantly expands the ability to discover the potential factors that trigger depression, making it possible to increase the efficiency of depression treatment. Multivariate Linear Regression of Symptoms-related Tweets for Infectious Gastroenteritis Scale Estimation - Ryo Takeuchi - Hayate Iso - Kaoru Ito - Shoko Wakamiya - Eiji Aramaki + RyoTakeuchi + HayateIso + KaoruIto + ShokoWakamiya + EijiAramaki 18–25 W17-5803 To date, various Twitter-based event detection systems have been proposed. Most of their targets, however, share common characteristics. They are seasonal or global events such as earthquakes and flu pandemics. In contrast, this study targets unseasonal and local disease events. Our system investigates the frequencies of disease-related words such as “nausea”,“chill”,and “diarrhea” and estimates the number of patients using regression of these word frequencies. Experiments conducted using Japanese 47 areas from January 2017 to April 2017 revealed that the detection of small and unseasonal event is extremely difficult (overall performance: 0.13). However, we found that the event scale and the detection performance show high correlation in the specified cases (in the phase of patient increasing or decreasing). The results also suggest that when 150 and more patients appear in a high population area, we can expect that our social sensors detect this outbreak. Based on these results, we can infer that social sensors can reliably detect unseasonal and local disease events under certain conditions, just as they can for seasonal or global events. Incorporating Dependency Trees Improve Identification of Pregnant Women on Social Media Platforms - Yi-Jie Huang - Chu Hsien Su - Yi-Chun Chang - Tseng-Hsin Ting - Tzu-Yuan Fu - Rou-Min Wang - Hong-Jie Dai - Yung-Chun Chang - Jitendra Jonnagaddala - Wen-Lian Hsu + Yi-JieHuang + Chu HsienSu + Yi-ChunChang + Tseng-HsinTing + Tzu-YuanFu + Rou-MinWang + Hong-JieDai + Yung-ChunChang + JitendraJonnagaddala + Wen-LianHsu 26–32 W17-5804 The increasing popularity of social media lead users to share enormous information on the internet. This information has various application like, it can be used to develop models to understand or predict user behavior on social media platforms. For example, few online retailers have studied the shopping patterns to predict shopper’s pregnancy stage. Another interesting application is to use the social media platforms to analyze users’ health-related information. In this study, we developed a tree kernel-based model to classify tweets conveying pregnancy related information using this corpus. The developed pregnancy classification model achieved an accuracy of 0.847 and an F-score of 0.565. A new corpus from popular social media platform Twitter was developed for the purpose of this study. In future, we would like to improve this corpus by reducing noise such as retweets. Using a Recurrent Neural Network Model for Classification of Tweets Conveyed Influenza-related Information - Chen-Kai Wang - Onkar Singh - Zhao-Li Tang - Hong-Jie Dai + Chen-KaiWang + OnkarSingh + Zhao-LiTang + Hong-JieDai 33–38 W17-5805 Traditional disease surveillance systems depend on outpatient reporting and virological test results released by hospitals. 
These data have valid and accurate information about emerging outbreaks but it’s often not timely. In recent years the exponential growth of users getting connected to social media provides immense knowledge about epidemics by sharing related information. Social media can now flag more immediate concerns related to outbreaks in real time. In this paper we apply the long short-term memory recurrent neural network (RNN) architecture to classify tweets conveyed influenza-related information and compare its performance with baseline algorithms including support vector machine (SVM), decision tree, naive Bayes, simple logistics, and naive Bayes multinomial. The developed RNN model achieved an F-score of 0.845 on the MedWeb task test set, which outperforms the F-score of SVM without applying the synthetic minority oversampling technique by 0.08. The F-score of the RNN model is within 1% of the highest score achieved by SVM with oversampling technique.

<fixed-case>Z</fixed-case>ika<fixed-case>H</fixed-case>ack 2016: A digital disease detection competition
- Dillon C Adam
- Jitendra Jonnagaddala
- Daniel Han-Chen
- Sean Batongbacal
- Luan Almeida
- Jing Z Zhu
- Jenny J Yang
- Jumail M Mundekkat
- Steven Badman
- Abrar Chughtai
- C Raina MacIntyre
+ Dillon CAdam
+ JitendraJonnagaddala
+ DanielHan-Chen
+ SeanBatongbacal
+ LuanAlmeida
+ Jing ZZhu
+ Jenny JYang
+ Jumail MMundekkat
+ StevenBadman
+ AbrarChughtai
+ C RainaMacIntyre
39–46
W17-5806
Effective response to infectious diseases outbreaks relies on the rapid and early detection of those outbreaks. Invalidated, yet timely and openly available digital information can be used for the early detection of outbreaks. Public health surveillance authorities can exploit these early warnings to plan and co-ordinate rapid surveillance and emergency response programs. In 2016, a digital disease detection competition named ZikaHack was launched. The objective of the competition was for multidisciplinary teams to design, develop and demonstrate innovative digital disease detection solutions to retrospectively detect the 2015-16 Brazilian Zika virus outbreak earlier than traditional surveillance methods. In this paper, an overview of the ZikaHack competition is provided. The challenges and lessons learned in organizing this competition are also discussed for use by other researchers interested in organizing similar competitions.

A Method to Generate a Machine-Labeled Data for Biomedical Named Entity Recognition with Various Sub-Domains
- Juae Kim
- Sunjae Kwon
- Youngjoong Ko
- Jungyun Seo
+ JuaeKim
+ SunjaeKwon
+ YoungjoongKo
+ JungyunSeo
47–51
W17-5807
Biomedical Named Entity (NE) recognition is a core technique for various works in the biomedical domain. In previous studies, using machine learning algorithm shows better performance than dictionary-based and rule-based approaches because there are too many terminological variations of biomedical NEs and new biomedical NEs are constantly generated. To achieve the high performance with a machine-learning algorithm, good-quality corpora are required. However, it is difficult to obtain the good-quality corpora because annotating a biomedical corpus for machine-learning is extremely time-consuming and costly. In addition, most previous corpora are insufficient for high-level tasks because they cannot cover various domains. Therefore, we propose a method for generating a large amount of machine-labeled data that covers various domains.
To generate a large amount of machine-labeled data, firstly we generate an initial machine-labeled data by using a chunker and MetaMap. The chunker is developed to extract only biomedical NEs with manually annotated data. MetaMap is used to annotate the category of bio-medical NE. Then we apply the self-training approach to bootstrap the performance of initial machine-labeled data. In our experiments, the biomedical NE recognition system that is trained with our proposed machine-labeled data achieves much high performance. As a result, our system outperforms biomedical NE recognition system that using MetaMap only with 26.03%p improvements on F1-score. Enhancing Drug-Drug Interaction Classification with Corpus-level Feature and Classifier Ensemble - Jing Cyun Tu - Po-Ting Lai - Richard Tzong-Han Tsai + Jing CyunTu + Po-TingLai + Richard Tzong-HanTsai 52–56 W17-5808 The study of drug-drug interaction (DDI) is important in the drug discovering. Both PubMed and DrugBank are rich resources to retrieve DDI information which is usually represented in plain text. Automatically extracting DDI pairs from text improves the quality of drug discov-ering. In this paper, we presented a study that focuses on the DDI classification. We normalized the drug names, and developed both sentence-level and corpus-level features for DDI classification. A classifier ensemble approach is used for the unbalance DDI labels problem. Our approach achieved an F-score of 65.4% on SemEval 2013 DDI test set. The experimental results also show the effects of proposed corpus-level features in the DDI task. Chemical-Induced Disease Detection Using Invariance-based Pattern Learning Model - Neha Warikoo - Yung-Chun Chang - Wen-Lian Hsu + NehaWarikoo + Yung-ChunChang + Wen-LianHsu 57–64 W17-5809 In this work, we introduce a novel feature engineering approach named “algebraic invariance” to identify discriminative patterns for learning relation pair features for the chemical-disease relation (CDR) task of BioCreative V. Our method exploits the existing structural similarity of the key concepts of relation descriptions from the CDR corpus to generate robust linguistic patterns for SVM tree kernel-based learning. Preprocessing of the training data classifies the entity pairs as either related or unrelated to build instance types for both inter-sentential and intra-sentential scenarios. An invariant function is proposed to process and optimally cluster similar patterns for both positive and negative instances. The learning model for CDR pairs is based on the SVM tree kernel approach, which generates feature trees and vectors and is modeled on suitable invariance based patterns, bringing brevity, precision and context to the identifier features. Results demonstrate that our method outperformed other compared approaches, achieved a high recall rate of 85.08%, and averaged an F1-score of 54.34% without the use of any additional knowledge bases. @@ -10979,123 +10979,123 @@ with emotion annotation. 
We (a) analyse annotation reliability and annotation me
<fixed-case>NTUCLE</fixed-case>: Developing a Corpus of Learner <fixed-case>E</fixed-case>nglish to Provide Writing Support for Engineering Students
- Roger Vivek Placidus Winder
- Joseph MacKinnon
- Shu Yun Li
- Benedict Christopher Tzer Liang Lin
- Carmel Lee Hah Heah
- Luís Morgado da Costa
- Takayuki Kuribayashi
- Francis Bond
+ Roger Vivek PlacidusWinder
+ JosephMacKinnon
+ Shu YunLi
+ Benedict Christopher Tzer LiangLin
+ Carmel Lee HahHeah
+ LuísMorgado da Costa
+ TakayukiKuribayashi
+ FrancisBond
1–11
W17-5901
This paper describes the creation of a new annotated learner corpus. The aim is to use this corpus to develop an automated system for corrective feedback on students’ writing. With this system, students will be able to receive timely feedback on language errors before they submit their assignments for grading. A corpus of assignments submitted by first-year engineering students was compiled, and a new error tag set for the NTU Corpus of Learner English (NTUCLE) was developed based on that of the NUS Corpus of Learner English (NUCLE), as well as marking rubrics used at NTU. After a description of the corpus, error tag set and annotation process, the paper presents the results of the annotation exercise as well as follow-up actions. The final error tag set, which is significantly larger than that of NUCLE, is then presented before a brief conclusion summarising our experience and future plans.
Understanding Non-Native Writings: Can a Parser Help?
- Jirka Hana
- Barbora Hladká
+ JirkaHana
+ BarboraHladká
12–16
W17-5902
We present a pilot study on parsing non-native texts written by learners of Czech. We performed experiments that have shown that at least high-level syntactic functions, like subject, predicate, and object, can be assigned based on a parser trained on the standard native language.
Carrier Sentence Selection for Fill-in-the-blank Items
- Shu Jiang
- John Lee
+ ShuJiang
+ JohnLee
17–22
W17-5903
Fill-in-the-blank items are a common form of exercise in computer-assisted language learning systems. To automatically generate an effective item, the system must be able to select a high-quality carrier sentence that illustrates the usage of the target word. Previous approaches for carrier sentence selection have considered sentence length, vocabulary difficulty, the position of the target word and the presence of finite verbs. This paper investigates the utility of word co-occurrence statistics and lexical similarity as selection criteria. In an evaluation on generating fill-in-the-blank items for learning Chinese as a foreign language, we show that these two criteria can improve carrier sentence quality.
<fixed-case>H</fixed-case>indi Shabdamitra: A <fixed-case>W</fixed-case>ordnet based E-Learning Tool for Language Learning and Teaching
- Hanumant Redkar
- Sandhya Singh
- Meenakshi Somasundaram
- Dhara Gorasia
- Malhar Kulkarni
- Pushpak Bhattacharyya
+ HanumantRedkar
+ SandhyaSingh
+ MeenakshiSomasundaram
+ DharaGorasia
+ MalharKulkarni
+ PushpakBhattacharyya
23–28
W17-5904
In today’s technology-driven digital era, the education domain is undergoing a transformation from traditional approaches to more learner-controlled and flexible methods of learning. This transformation has opened new avenues for interdisciplinary research in the field of educational technology and natural language processing in developing quality digital aids for learning and teaching. The tool presented here, Hindi Shabdamitra, developed using Hindi Wordnet for Hindi language learning, is one such e-learning tool. It has been developed as a teaching and learning aid suitable for a formal school-based curriculum and an informal setup for self-learning users. Besides vocabulary, it also provides word-based grammar along with images and pronunciation for better learning and retention. This aid demonstrates how a rich lexical resource like a wordnet can be systematically remodeled for practical use in the educational domain.
<fixed-case>NLPTEA</fixed-case> 2017 Shared Task – <fixed-case>C</fixed-case>hinese Spelling Check
- Gabriel Fung
- Maxime Debosschere
- Dingmin Wang
- Bo Li
- Jia Zhu
- Kam-Fai Wong
+ GabrielFung
+ MaximeDebosschere
+ DingminWang
+ BoLi
+ JiaZhu
+ Kam-FaiWong
29–34
W17-5905
This paper provides an overview, along with our findings, of the Chinese Spelling Check shared task at NLPTEA 2017. The goal of this task is to develop a computer-assisted system to automatically diagnose typing errors in traditional Chinese sentences written by students. We defined six types of errors, which belong to two categories. Given a sentence, the system should detect where the errors are, and for each detected error determine its type and provide correction suggestions. We designed, constructed, and released a benchmark dataset for this task.
<fixed-case>C</fixed-case>hinese Spelling Check based on N-gram and String Matching Algorithm
- Jui-Feng Yeh
- Li-Ting Chang
- Chan-Yi Liu
- Tsung-Wei Hsu
+ Jui-FengYeh
+ Li-TingChang
+ Chan-YiLiu
+ Tsung-WeiHsu
35–38
W17-5906
This paper presents a Chinese spelling check approach based on language models combined with a string matching algorithm to treat problems resulting from the influence of the writers’ Cantonese mother tongue. N-grams are first used to estimate the probability of the sentence constructed by the writer; then a string matching algorithm, the Knuth-Morris-Pratt (KMP) algorithm, is used to detect and correct the errors. According to the experimental results, the proposed approach can detect errors and provide the corresponding corrections.
N-gram Model for <fixed-case>C</fixed-case>hinese Grammatical Error Diagnosis
- Jianbo Zhao
- Hao Liu
- Zuyi Bao
- Xiaopeng Bai
- Si Li
- Zhiqing Lin
+ JianboZhao
+ HaoLiu
+ ZuyiBao
+ XiaopengBai
+ SiLi
+ ZhiqingLin
39–44
W17-5907
Detection and correction of Chinese grammatical errors have been two of the major challenges for Chinese automatic grammatical error diagnosis. This paper presents an N-gram model for automatic detection and correction of Chinese grammatical errors in the NLPTEA 2017 task. The experimental results show that the proposed method performs well at correcting Chinese grammatical errors.
The Influence of Spelling Errors on Content Scoring Performance
- Andrea Horbach
- Yuning Ding
- Torsten Zesch
+ AndreaHorbach
+ YuningDing
+ TorstenZesch
45–53
W17-5908
Spelling errors occur frequently in educational settings, but their influence on automatic scoring is largely unknown. We therefore investigate the influence of spelling errors on content scoring performance using the example of the ASAP corpus. We conduct an annotation study on the nature of spelling errors in the ASAP dataset and utilize these findings in machine learning experiments that measure the influence of spelling errors on automatic content scoring. Our main finding is that scoring methods using both token and character n-gram features are robust against spelling errors up to the error frequency in ASAP.
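The robustness reported in the abstract above is easy to illustrate: character n-grams computed within word boundaries overlap heavily between a word and its misspelling, so such features degrade gracefully. The following is a minimal sketch of that feature design using scikit-learn, not the authors' system; the sample answers and score labels are invented.

    # Minimal sketch: token + character n-gram features for content scoring.
    # Toy data only; this is not the ASAP corpus or the system evaluated above.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import FeatureUnion, Pipeline

    answers = [
        "the cell membrane controls what enters the cell",
        "the cell membrain controlls what enters the cell",  # misspelled variant
        "plants use sunlight to make food",
        "plants use sunlite to make food",                   # misspelled variant
    ]
    scores = [1, 1, 0, 0]  # invented content-score labels

    features = FeatureUnion([
        ("tokens", TfidfVectorizer(analyzer="word", ngram_range=(1, 2))),
        # char_wb n-grams overlap heavily between "membrane" and "membrain",
        # one plausible source of the robustness reported in the abstract
        ("chars", TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4))),
    ])
    scorer = Pipeline([("features", features), ("clf", LogisticRegression())])
    scorer.fit(answers, scores)
    print(scorer.predict(["the cell membrane controlls what enters the cell"]))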
Analyzing the Impact of Spelling Errors on <fixed-case>POS</fixed-case>-Tagging and Chunking in Learner <fixed-case>E</fixed-case>nglish
- Tomoya Mizumoto
- Ryo Nagata
+ TomoyaMizumoto
+ RyoNagata
54–58
W17-5909
Part-of-speech (POS) tagging and chunking have been used in tasks targeting learner English; however, to the best of our knowledge, few studies have evaluated their performance and no studies have revealed the causes of POS-tagging/chunking errors in detail. Therefore, we investigate performance and analyze the causes of failure. We focus on spelling errors that occur frequently in learner English. We demonstrate that spelling errors reduced POS-tagging performance by 0.23%, and that a spell checker is not necessary for POS-tagging/chunking of learner English.
Complex Word Identification: Challenges in Data Annotation and System Performance
- Marcos Zampieri
- Shervin Malmasi
- Gustavo Paetzold
- Lucia Specia
+ MarcosZampieri
+ ShervinMalmasi
+ GustavoPaetzold
+ LuciaSpecia
59–63
W17-5910
This paper revisits the problem of complex word identification (CWI), following up on the SemEval CWI shared task. We use ensemble classifiers to investigate how well computational methods can discriminate between complex and non-complex words. Furthermore, we analyze the classification performance to understand what makes lexical complexity challenging. Our findings show that most systems performed poorly on the SemEval CWI dataset, and one of the reasons for that is the way in which human annotation was performed.
Suggesting Sentences for <fixed-case>ESL</fixed-case> using Kernel Embeddings
- Kent Shioda
- Mamoru Komachi
- Rue Ikeya
- Daichi Mochihashi
+ KentShioda
+ MamoruKomachi
+ RueIkeya
+ DaichiMochihashi
64–68
W17-5911
Sentence retrieval is an important NLP application for English as a Second Language (ESL) learners. ESL learners are familiar with web search engines, but generic web search results may not be adequate for composing documents in a specific domain. However, if we build our own search system specialized to a domain, it may be subject to the data sparseness problem. The recently proposed word2vec partially addresses the data sparseness problem, but fails to extract sentences relevant to queries owing to its lack of modeling of the latent intent of the query. Thus, we propose a method of retrieving example sentences using kernel embeddings and N-gram windows. This method implicitly models the latent intent of queries and sentences, and alleviates the problem of noisy alignment. Our results show that our method achieved higher precision in sentence retrieval for ESL in the domain of a university press release corpus, as compared to a previous unsupervised method used for a semantic textual similarity task.
Event Timeline Generation from History Textbooks
- Harsimran Bedi
- Sangameshwar Patil
- Swapnil Hingmire
- Girish Palshikar
+ HarsimranBedi
+ SangameshwarPatil
+ SwapnilHingmire
+ GirishPalshikar
69–77
W17-5912
An event timeline serves as the basic structure of history, and it is used as an arrangement of key phenomena in studying history as a subject in secondary school. In order to enable a student to understand a historical phenomenon as a series of connected events, we present a system for automatic event timeline generation from history textbooks. Additionally, we propose Message Sequence Chart (MSC) and time-map-based visualization techniques to visualize an event timeline.
We also identify key computational challenges in developing natural language processing-based applications for history textbooks.
@@ -11117,52 +11117,52 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me
Group Linguistic Bias Aware Neural Response Generation
- Jianan Wang
- Xin Wang
- Fang Li
- Zhen Xu
- Zhuoran Wang
- Baoxun Wang
+ JiananWang
+ XinWang
+ FangLi
+ ZhenXu
+ ZhuoranWang
+ BaoxunWang
1–10
W17-6001
For practical chatbots, one of the essential factors for improving user experience is the capability of customizing the talking style of the agents, that is, making chatbots provide responses that meet users’ preferences on language style, topics, etc. To address this issue, this paper proposes to incorporate linguistic biases, which are implicitly involved in the conversation corpora generated by human groups on Social Network Services (SNS), into the encoder-decoder based response generator. By attaching a specially designed neural component to dynamically control the impact of linguistic biases in response generation, a Group Linguistic Bias Aware Neural Response Generation (GLBA-NRG) model is eventually presented. The experimental results on a dataset from a Chinese SNS show that the proposed architecture outperforms current response generation models by producing both meaningful and vivid responses with customized styles.
Neural Regularized Domain Adaptation for <fixed-case>C</fixed-case>hinese Word Segmentation
- Zuyi Bao
- Si Li
- Weiran Xu
- Sheng Gao
+ ZuyiBao
+ SiLi
+ WeiranXu
+ ShengGao
11–20
W17-6002
For Chinese word segmentation, the large-scale annotated corpora mainly focus on newswire, and only a handful of annotated data is available in other domains such as patents and literature. Considering the limited amount of annotated target domain data, it is a challenge for segmenters to learn domain-specific information while avoiding overfitting. In this paper, we propose a neural regularized domain adaptation method for Chinese word segmentation. The teacher networks trained in the source domain are employed to regularize the training process of the student network by preserving the general knowledge. In the experiments, our neural regularized domain adaptation method achieves better performance compared to previous methods.
The Sentimental Value of <fixed-case>C</fixed-case>hinese Sub-Character Components
- Yassine Benajiba
- Or Biran
- Zhiliang Weng
- Yong Zhang
- Jin Sun
+ YassineBenajiba
+ OrBiran
+ ZhiliangWeng
+ YongZhang
+ JinSun
21–29
W17-6003
Sub-character components of Chinese characters carry important semantic information, and recent studies have shown that utilizing this information can improve performance on core semantic tasks. In this paper, we hypothesize that in addition to semantic information, sub-character components may also carry emotional information, and that utilizing it should improve performance on sentiment analysis tasks. We conduct a series of experiments on four Chinese sentiment data sets and show that we can significantly improve the performance in various tasks over that of a character-level embeddings baseline. We then focus on qualitatively assessing multiple examples and trying to explain how the sub-character components affect the results in each case.
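The author markup change that these hunks apply throughout, splitting a flat name string into separate first and last elements, can be sketched with the standard library. The split_author helper below is hypothetical, not the tooling behind this commit, and its last-token heuristic is knowingly naive: entries above such as Luís Morgado da Costa (surname "Morgado da Costa") show that the real split points were chosen per name rather than by rule.

    # Hypothetical sketch of the first/last markup these hunks introduce.
    # Not the script used for this commit; a naive last-token split would
    # get multi-word surnames wrong.
    import xml.etree.ElementTree as etree

    def split_author(author):
        """Turn <author>First Last</author> into
        <author><first>First</first><last>Last</last></author> in place."""
        first, _, last = author.text.rpartition(" ")
        author.text = None
        etree.SubElement(author, "first").text = first
        etree.SubElement(author, "last").text = last

    node = etree.fromstring("<author>Ryo Takeuchi</author>")
    split_author(node)
    print(etree.tostring(node, encoding="unicode"))
    # <author><first>Ryo</first><last>Takeuchi</last></author>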
<fixed-case>C</fixed-case>hinese Answer Extraction Based on <fixed-case>POS</fixed-case> Tree and Genetic Algorithm
- Shuihua Li
- Xiaoming Zhang
- Zhoujun Li
+ ShuihuaLi
+ XiaomingZhang
+ ZhoujunLi
30–36
W17-6004
Answer extraction is the most important part of a Chinese web-based question answering system. In order to enhance the robustness and adaptability of answer extraction to new domains and eliminate the influence of incomplete and noisy search snippets, we propose two new answer extraction methods. We utilize text patterns to generate Part-of-Speech (POS) patterns. In addition, a method is proposed to construct a POS tree by using these POS patterns. The POS tree is useful for candidate answer extraction in web-based question answering. To retrieve an efficient POS tree, the similarities between questions are used to select the question-answer pairs whose questions are similar to the unanswered question. Then, the POS tree is improved based on these question-answer pairs. In order to rank the candidate answers, the weights of the leaf nodes of the POS tree are calculated using a heuristic method. Moreover, the Genetic Algorithm (GA) is used to train the weights. The experimental results of 10-fold cross-validation show that the weighted POS tree trained by GA can improve the accuracy of answer extraction.
Learning from Parenthetical Sentences for Term Translation in Machine Translation
- Guoping Huang
- Jiajun Zhang
- Yu Zhou
- Chengqing Zong
+ GuopingHuang
+ JiajunZhang
+ YuZhou
+ ChengqingZong
37–45
W17-6005
Terms extensively exist in specific domains, and term translation plays a critical role in domain-specific machine translation (MT) tasks. However, translating terms correctly is challenging, given the huge number of pre-existing terms and the endless stream of new ones. To achieve better term translation quality, it is necessary to inject external term knowledge into the underlying MT system. Fortunately, there is plenty of term translation knowledge in parenthetical sentences on the Internet. In this paper, we propose a simple, straightforward and effective framework to improve term translation by learning from parenthetical sentences. This framework includes: (1) a focused web crawler; (2) a parenthetical sentence filter, acquiring parenthetical sentences including bilingual term pairs; (3) a term translation knowledge extractor, extracting bilingual term translation candidates; (4) a probability learner, generating the term translation table for MT decoders. The extensive experiments demonstrate that our proposed framework significantly improves the translation quality of terms and sentences.
@@ -11184,108 +11184,108 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me
A Feature Structure Algebra for <fixed-case>FTAG</fixed-case>
- Alexander Koller
+ AlexanderKoller
1–10
W17-6201
Parsing Minimalist Languages with Interpreted Regular Tree Grammars
- Meaghan Fowlie
- Alexander Koller
+ MeaghanFowlie
+ AlexanderKoller
11–20
W17-6202
Depictives in <fixed-case>E</fixed-case>nglish: An <fixed-case>LTAG</fixed-case> Approach
- Benjamin Burkhardt
- Timm Lichte
- Laura Kallmeyer
+ BenjaminBurkhardt
+ TimmLichte
+ LauraKallmeyer
21–30
W17-6203
Reflexives and Reciprocals in Synchronous Tree Adjoining Grammar
- Cristina Aggazzotti
- Stuart M.
Shieber + CristinaAggazzotti + Stuart M.Shieber 31–42 W17-6204 Coordination in <fixed-case>TAG</fixed-case> without the Conjoin Operation - Chung-hye Han - Anoop Sarkar + Chung-hyeHan + AnoopSarkar 43–52 W17-6205 Scope, Time, and Predicate Restriction in Blackfoot using <fixed-case>MC</fixed-case>-<fixed-case>STAG</fixed-case> - Dennis Ryan Storoshenko + Dennis RyanStoroshenko 53–60 W17-6206 Combining Predicate-Argument Structure and Operator Projection: Clause Structure in Role and Reference Grammar - Laura Kallmeyer - Rainer Osswald + LauraKallmeyer + RainerOsswald 61–70 W17-6207 Parsing with Dynamic Continuized <fixed-case>CCG</fixed-case> - Michael White - Simon Charlow - Jordan Needle - Dylan Bumford + MichaelWhite + SimonCharlow + JordanNeedle + DylanBumford 71–83 W17-6208 Multiword Expression-Aware <fixed-case>A</fixed-case>* <fixed-case>TAG</fixed-case> Parsing Revisited - Jakub Waszczuk - Agata Savary - Yannick Parmentier + JakubWaszczuk + AgataSavary + YannickParmentier 84–93 W17-6209 Single-Rooted <fixed-case>DAG</fixed-case>s in Regular <fixed-case>DAG</fixed-case> Languages: <fixed-case>P</fixed-case>arikh Image and Path Languages - Martin Berglund - Henrik Björklund - Frank Drewes + MartinBerglund + HenrikBjörklund + FrankDrewes 94–101 W17-6210 Contextual Hyperedge Replacement Grammars for Abstract Meaning Representations - Frank Drewes - Anna Jonsson + FrankDrewes + AnnaJonsson 102–111 W17-6211 Transforming Dependency Structures to <fixed-case>LTAG</fixed-case> Derivation Trees - Caio Corro - Joseph Le Roux + CaioCorro + JosephLe Roux 112–121 W17-6212 Linguistically Rich Vector Representations of Supertags for <fixed-case>TAG</fixed-case> Parsing - Dan Friedman - Jungo Kasai - R. Thomas McCoy - Robert Frank - Forrest Davis - Owen Rambow + DanFriedman + JungoKasai + R. ThomasMcCoy + RobertFrank + ForrestDavis + OwenRambow 122–131 W17-6213 <fixed-case>TAG</fixed-case> Parser Evaluation using Textual Entailments - Pauli Xu - Robert Frank - Jungo Kasai - Owen Rambow + PauliXu + RobertFrank + JungoKasai + OwenRambow 132–141 W17-6214 @@ -11306,121 +11306,121 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Automatically Acquired Lexical Knowledge Improves <fixed-case>J</fixed-case>apanese Joint Morphological and Dependency Analysis - Daisuke Kawahara - Yuta Hayashibe - Hajime Morita - Sadao Kurohashi + DaisukeKawahara + YutaHayashibe + HajimeMorita + SadaoKurohashi 1–10 W17-6301 This paper presents a joint model for morphological and dependency analysis based on automatically acquired lexical knowledge. This model takes advantage of rich lexical knowledge to simultaneously resolve word segmentation, POS, and dependency ambiguities. In our experiments on Japanese, we show the effectiveness of our joint model over conventional pipeline models. Dependency Language Models for Transition-based Dependency Parsing - Juntao Yu - Bernd Bohnet + JuntaoYu + BerndBohnet 11–17 W17-6302 In this paper, we present an approach to improve the accuracy of a strong transition-based dependency parser by exploiting dependency language models that are extracted from a large parsed corpus. We integrated a small number of features based on the dependency language models into the parser. To demonstrate the effectiveness of the proposed approach, we evaluate our parser on standard English and Chinese data where the base parser could achieve competitive accuracy scores. 
Our enhanced parser achieved state-of-the-art accuracy on Chinese data and competitive results on English data. We gained a large absolute improvement of one point (UAS) on Chinese and 0.5 points for English.
Lexicalized vs. Delexicalized Parsing in Low-Resource Scenarios
- Agnieszka Falenska
- Özlem Çetinoğlu
+ AgnieszkaFalenska
+ ÖzlemÇetinoğlu
18–24
W17-6303
We present a systematic analysis of lexicalized vs. delexicalized parsing in low-resource scenarios, and propose a methodology to choose one method over another under certain conditions. We create a set of simulation experiments on 41 languages and apply our findings to 9 low-resource languages. Experimental results show that our methodology chooses the best approach in 8 out of 9 cases.
Improving neural tagging with lexical information
- Benoît Sagot
- Héctor Martínez Alonso
+ BenoîtSagot
+ HéctorMartínez Alonso
25–31
W17-6304
Neural part-of-speech tagging has achieved competitive results with the incorporation of character-based and pre-trained word embeddings. In this paper, we show that a state-of-the-art bi-LSTM tagger can benefit from using information from morphosyntactic lexicons as additional input. The tagger, trained on several dozen languages, shows a consistent average improvement when using lexical information, even when also using character-based embeddings, thus showing the complementarity of the different sources of lexical information. The improvements are particularly important for the smaller datasets.
Prepositional Phrase Attachment over Word Embedding Products
- Pranava Swaroop Madhyastha
- Xavier Carreras
- Ariadna Quattoni
+ Pranava SwaroopMadhyastha
+ XavierCarreras
+ AriadnaQuattoni
32–43
W17-6305
We present a low-rank multi-linear model for the task of solving prepositional phrase attachment ambiguity (PP task). Our model exploits tensor products of word embeddings, capturing all possible conjunctions of latent embeddings. Our results on a wide range of datasets and task settings show that tensor products are the best compositional operation and that a relatively simple multi-linear model that uses only word embeddings of lexical features can outperform more complex non-linear architectures that exploit the same information. Our proposed model gives the current best reported performance on an out-of-domain evaluation and performs competitively on out-of-domain dependency parsing datasets.
<fixed-case>L</fixed-case>1-<fixed-case>L</fixed-case>2 Parallel Dependency Treebank as Learner Corpus
- John Lee
- Keying Li
- Herman Leung
+ JohnLee
+ KeyingLi
+ HermanLeung
44–49
W17-6306
This opinion paper proposes the use of a parallel treebank as a learner corpus. We show how an L1-L2 parallel treebank — i.e., parse trees of non-native sentences, aligned to the parse trees of their target hypotheses — can facilitate retrieval of sentences with specific learner errors. We argue for its benefits, in terms of corpus re-use and interoperability, over a conventional learner corpus annotated with error tags. As a proof of concept, we conduct a case study on word-order errors made by learners of Chinese as a foreign language. We report precision and recall in retrieving a range of word-order error categories from L1-L2 tree pairs annotated in the Universal Dependency framework.
Splitting Complex <fixed-case>E</fixed-case>nglish Sentences
- John Lee
- J. Buddhika K. Pathirage Don
+ JohnLee
+ J. Buddhika K.
PathirageDon
50–55
W17-6307
This paper applies parsing technology to the task of syntactic simplification of English sentences, focusing on the identification of text spans that can be removed from a complex sentence. We report the most comprehensive evaluation to date on this task, using a dataset of sentences that exhibit simplification based on coordination, subordination, punctuation/parataxis, adjectival clauses, participial phrases, and appositive phrases. We train a decision tree with features derived from text span length, POS tags and dependency relations, and show that it significantly outperforms a parser-only baseline.
Hierarchical Word Structure-based Parsing: A Feasibility Study on <fixed-case>UD</fixed-case>-style Dependency Parsing in <fixed-case>J</fixed-case>apanese
- Takaaki Tanaka
- Katsuhiko Hayashi
- Masaaki Nagata
+ TakaakiTanaka
+ KatsuhikoHayashi
+ MasaakiNagata
56–60
W17-6308
In applying word-based dependency parsing such as Universal Dependencies (UD) to Japanese, the uncertainty of word segmentation emerges for defining a word unit of the dependencies. We introduce the following hierarchical word structures to dependency parsing in Japanese: morphological units (a short unit word, SUW) and syntactic units (a long unit word, LUW). An SUW can be used to segment a sentence consistently, while it is too short to represent syntactic construction. An LUW is a unit including functional multiwords, and LUW-based analysis facilitates the capturing of syntactic structure and makes parsing results more precise than SUW-based analysis. This paper describes the results of a feasibility study on the ability and the effectiveness of parsing methods based on hierarchical word structure (LUW chunking+parsing) in comparison to single layer word structure (SUW parsing). We also show that joint analysis of LUW-chunking and dependency parsing improves the performance of identifying predicate-argument structures, while there is not much difference between their overall results.
Leveraging Newswire Treebanks for Parsing Conversational Data with Argument Scrambling
- Riyaz A. Bhat
- Irshad Bhat
- Dipti Sharma
+ Riyaz A.Bhat
+ IrshadBhat
+ DiptiSharma
61–66
W17-6309
We investigate the problem of parsing conversational data of morphologically-rich languages such as Hindi where argument scrambling occurs frequently. We evaluate a state-of-the-art non-linear transition-based parsing system on a new dataset containing 506 dependency trees for sentences from Bollywood (Hindi) movie scripts and Twitter posts of Hindi monolingual speakers. We show that a dependency parser trained on a newswire treebank is strongly biased towards the canonical structures and degrades when applied to conversational data. Inspired by Transformational Generative Grammar (Chomsky, 1965), we mitigate the sampling bias by generating all theoretically possible alternative word orders of a clause from the existing (kernel) structures in the treebank. Training our parser on canonical and transformed structures improves performance on conversational data by around 9% LAS over the baseline newswire parser.
Using hyperlinks to improve multilingual partial parsers
- Anders Søgaard
+ AndersSøgaard
67–71
W17-6310
Syntactic annotation is costly and not available for the vast majority of the world’s languages. We show that sometimes we can make do with less labeled data by exploiting more readily available forms of mark-up.
Specifically, we revisit an idea from Valentin Spitkovsky’s work (2010), namely that hyperlinks typically bracket syntactic constituents or chunks. We strengthen his results by showing that not only can hyperlinks help in low-resource scenarios, exemplified here by Quechua, but learning from hyperlinks can also improve state-of-the-art NLP models for English newswire. We also present an out-of-domain evaluation on English OntoNotes 4.0.
Correcting prepositional phrase attachments using multimodal corpora
- Sebastien Delecraz
- Alexis Nasr
- Frederic Bechet
- Benoit Favre
+ SebastienDelecraz
+ AlexisNasr
+ FredericBechet
+ BenoitFavre
72–77
W17-6311
PP-attachments are an important source of errors in parsing natural language. We propose in this article to use data coming from a multimodal corpus, combining textual, visual and conceptual information, together with a correction strategy, to suggest alternative attachments in the output of a parser.
Exploiting Structure in Parsing to 1-Endpoint-Crossing Graphs
- Robin Kurtz
- Marco Kuhlmann
+ RobinKurtz
+ MarcoKuhlmann
78–87
W17-6312
Deep dependency parsing can be cast as the search for maximum acyclic subgraphs in weighted digraphs. Because this search problem is intractable in the general case, we consider its restriction to the class of 1-endpoint-crossing (1ec) graphs, which has high coverage on standard data sets. Our main contribution is a characterization of 1ec graphs as a subclass of the graphs with pagenumber at most 3. Building on this, we show how to extend an existing parsing algorithm for 1-endpoint-crossing trees to the full class. While the runtime complexity of the extended algorithm is polynomial in the length of the input sentence, it features a large constant, which poses a challenge for practical implementations.
Effective Online Reordering with Arc-Eager Transitions
- Ryosuke Kohita
- Hiroshi Noji
- Yuji Matsumoto
+ RyosukeKohita
+ HiroshiNoji
+ YujiMatsumoto
88–98
W17-6313
We present a new transition system with word reordering for unrestricted non-projective dependency parsing. Our system is based on decomposed arc-eager rather than arc-standard, which allows more flexible ambiguity resolution between a local projective and a non-local crossing attachment. In our experiment on Universal Dependencies 2.0, we find our parser outperforms the ordinary swap-based parser, particularly on languages with a large amount of non-projectivity.
Arc-Hybrid Non-Projective Dependency Parsing with a Static-Dynamic Oracle
- Miryam de Lhoneux
- Sara Stymne
- Joakim Nivre
+ Miryamde Lhoneux
+ SaraStymne
+ JoakimNivre
99–104
W17-6314
In this paper, we extend the arc-hybrid system for transition-based parsing with a swap transition that enables reordering of the words and construction of non-projective trees. Although this extension breaks the arc-decomposability of the transition system, we show how the existing dynamic oracle for this system can be modified and combined with a static oracle only for the swap transition. Experiments on 5 languages show that the new system gives competitive accuracy and is significantly better than a system trained with a purely static oracle.
@@ -11428,33 +11428,33 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me
Encoder-Decoder Shift-Reduce Syntactic Parsing
- Jiangming Liu
- Yue Zhang
+ JiangmingLiu
+ YueZhang
105–114
W17-6315
Encoder-decoder neural networks have been used for many NLP tasks, such as neural machine translation.
They have also been applied to constituent parsing by using bracketed tree structures as a target language, translating input sentences into syntactic trees. A more commonly used method to linearize syntactic trees is the shift-reduce system, which uses a sequence of transition-actions to build trees. We empirically investigate the effectiveness of applying the encoder-decoder network to transition-based parsing. On standard benchmarks, our system gives comparable results to the stack LSTM parser for dependency parsing, and significantly better results compared to the aforementioned parser for constituent parsing, which uses bracketed tree formats. Arc-Standard Spinal Parsing with Stack-<fixed-case>LSTM</fixed-case>s - Miguel Ballesteros - Xavier Carreras + MiguelBallesteros + XavierCarreras 115–121 W17-6316 We present a neural transition-based parser for spinal trees, a dependency representation of constituent trees. The parser uses Stack-LSTMs that compose constituent nodes with dependency-based derivations. In experiments, we show that this model adapts to different styles of dependency relations, but this choice has little effect for predicting constituent structure, suggesting that LSTMs induce useful states by themselves. Coarse-To-Fine Parsing for Expressive Grammar Formalisms - Christoph Teichmann - Alexander Koller - Jonas Groschwitz + ChristophTeichmann + AlexanderKoller + JonasGroschwitz 122–127 W17-6317 We generalize coarse-to-fine parsing to grammar formalisms that are more expressive than PCFGs and/or describe languages of trees or graphs. We evaluate our algorithm on PCFG, PTAG, and graph parsing. While we achieve the expected performance gains on PCFGs, coarse-to-fine does not help for PTAG and can even slow down parsing for graphs. We discuss the implications of this finding. Evaluating <fixed-case>LSTM</fixed-case> models for grammatical function labelling - Bich-Ngoc Do - Ines Rehbein + Bich-NgocDo + InesRehbein 128–133 W17-6318 To improve grammatical function labelling for German, we augment the labelling component of a neural dependency parser with a decision history. We present different ways to encode the history, using different LSTM architectures, and show that our models yield significant improvements, resulting in a LAS for German that is close to the best result from the SPMRL 2014 shared task (without the reranker). @@ -11476,236 +11476,236 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Capturing Dependency Syntax with “Deep” Sequential Models - Yoav Goldberg + YoavGoldberg 1 W17-6501 Syntax-Semantics Interface: A Plea for a Deep Dependency Sentence Structure - Eva Hajičová + EvaHajičová 2–3 W17-6502 The Benefit of Syntactic vs. 
Linear N-grams for Linguistic Description
- Melanie Andresen
- Heike Zinsmeister
+ MelanieAndresen
+ HeikeZinsmeister
4–14
W17-6503
On the Predicate-Argument Structure: Internal and Absorbing Scope
- Igor Boguslavsky
+ IgorBoguslavsky
15–24
W17-6504
On the order of Words in <fixed-case>I</fixed-case>talian: a Study on Genre vs Complexity
- Dominique Brunato
- Felice Dell’Orletta
+ DominiqueBrunato
+ FeliceDell’Orletta
25–31
W17-6505
Revising the <fixed-case>METU</fixed-case>-Sabancı <fixed-case>T</fixed-case>urkish Treebank: An Exercise in Surface-Syntactic Annotation of Agglutinative Languages
- Alicia Burga
- Alp Öktem
- Leo Wanner
+ AliciaBurga
+ AlpÖktem
+ LeoWanner
32–41
W17-6506
Enhanced <fixed-case>UD</fixed-case> Dependencies with Neutralized Diathesis Alternation
- Marie Candito
- Bruno Guillaume
- Guy Perrier
- Djamé Seddah
+ MarieCandito
+ BrunoGuillaume
+ GuyPerrier
+ DjaméSeddah
42–53
W17-6507
Classifying Languages by Dependency Structure. Typologies of Delexicalized Universal Dependency Treebanks
- Xinying Chen
- Kim Gerdes
+ XinyingChen
+ KimGerdes
54–63
W17-6508
A Dependency Treebank for <fixed-case>K</fixed-case>urmanji <fixed-case>K</fixed-case>urdish
- Memduh Gökırmak
- Francis M. Tyers
+ MemduhGökırmak
+ Francis M.Tyers
64–72
W17-6509
What are the limitations on the flux of syntactic dependencies? Evidence from <fixed-case>UD</fixed-case> treebanks
- Sylvain Kahane
- Chunxiao Yan
- Marie-Amélie Botalla
+ SylvainKahane
+ ChunxiaoYan
+ Marie-AmélieBotalla
73–82
W17-6510
Fully Delexicalized Contexts for Syntax-Based Word Embeddings
- Jenna Kanerva
- Sampo Pyysalo
- Filip Ginter
+ JennaKanerva
+ SampoPyysalo
+ FilipGinter
83–91
W17-6511
Universal Dependencies for Dargwa Mehweb
- Alexandra Kozhukhar
+ AlexandraKozhukhar
92–99
W17-6512
Menzerath-Altmann Law in Syntactic Dependency Structure
- Ján Mačutek
- Radek Čech
- Jiří Milička
+ JánMačutek
+ RadekČech
+ JiříMilička
100–107
W17-6513
Assessing the Annotation Consistency of the Universal Dependencies Corpora
- Marie-Catherine de Marneffe
- Matias Grioni
- Jenna Kanerva
- Filip Ginter
+ Marie-Catherinede Marneffe
+ MatiasGrioni
+ JennaKanerva
+ FilipGinter
108–115
W17-6514
To What Extent is Immediate Constituency Analysis Dependency-Based? A Survey of Foundational Texts
- Nicolas Mazziotta
- Sylvain Kahane
+ NicolasMazziotta
+ SylvainKahane
116–126
W17-6515
Dependency Structure of Binary Conjunctions (of the <fixed-case>IF</fixed-case>…, <fixed-case>THEN</fixed-case>… Type)
- Igor Mel’čuk
+ IgorMel’čuk
127–134
W17-6516
Non-Projectivity in <fixed-case>S</fixed-case>erbian: Analysis of Formal and Linguistic Properties
- Aleksandra Miletic
- Assaf Urieli
+ AleksandraMiletic
+ AssafUrieli
135–144
W17-6517
Prices go Up, Surge, Jump, Spike, Skyrocket, Go through the Roof… Intensifier Collocations with Parametric Nouns of Type <fixed-case>PRICE</fixed-case>
- Jasmina Milićević
+ JasminaMilićević
145–153
W17-6518
<fixed-case>C</fixed-case>hinese Descriptive and Resultative V-de Constructions. A Dependency-based Analysis
- Ruochen Niu
+ RuochenNiu
154–164
W17-6519
The Component Unit. Introducing a Novel Unit of Syntactic Analysis
- Timothy Osborne
- Ruochen Niu
+ TimothyOsborne
+ RuochenNiu
165–175
W17-6520
Control vs. Raising in <fixed-case>E</fixed-case>nglish. A Dependency Grammar Account
- Timothy Osborne
- Matthew Reeve
+ TimothyOsborne
+ MatthewReeve
176–186
W17-6521
Segmentation Granularity in Dependency Representations for <fixed-case>K</fixed-case>orean
- Jungyeul Park
+ JungyeulPark
187–196
W17-6522
Universal Dependencies for <fixed-case>P</fixed-case>ortuguese
- Alexandre Rademaker
- Fabricio Chalub
- Livy Real
- Cláudia Freitas
- Eckhard Bick
- Valeria de Paiva
+ AlexandreRademaker
+ FabricioChalub
+ LivyReal
+ CláudiaFreitas
+ EckhardBick
+ Valeriade Paiva
197–206
W17-6523
<fixed-case>UDL</fixed-case>ex: Towards Cross-language Subcategorization Lexicons
- Giulia Rambelli
- Alessandro Lenci
- Thierry Poibeau
+ GiuliaRambelli
+ AlessandroLenci
+ ThierryPoibeau
207–217
W17-6524
Universal Dependencies are Hard to Parse – or are They?
- Ines Rehbein
- Julius Steen
- Bich-Ngoc Do
- Anette Frank
+ InesRehbein
+ JuliusSteen
+ Bich-NgocDo
+ AnetteFrank
218–228
W17-6525
Annotating <fixed-case>I</fixed-case>talian Social Media Texts in Universal Dependencies
- Manuela Sanguinetti
- Cristina Bosco
- Alessandro Mazzei
- Alberto Lavelli
- Fabio Tamburini
+ ManuelaSanguinetti
+ CristinaBosco
+ AlessandroMazzei
+ AlbertoLavelli
+ FabioTamburini
229–239
W17-6526
<fixed-case>H</fixed-case>ungarian Copula Constructions in Dependency Syntax and Parsing
- Katalin Ilona Simkó
- Veronika Vincze
+ Katalin IlonaSimkó
+ VeronikaVincze
240–247
W17-6527
Semgrex-Plus: a Tool for Automatic Dependency-Graph Rewriting
- Fabio Tamburini
+ FabioTamburini
248–254
W17-6528
Unity in Diversity: A Unified Parsing Strategy for Major <fixed-case>I</fixed-case>ndian Languages
- Juhi Tandon
- Dipti Misra Sharma
+ JuhiTandon
+ Dipti MisraSharma
255–265
W17-6529
Quantitative Comparative Syntax on the <fixed-case>C</fixed-case>antonese-<fixed-case>M</fixed-case>andarin Parallel Dependency Treebank
- Tak-sum Wong
- Kim Gerdes
- Herman Leung
- John Lee
+ Tak-sumWong
+ KimGerdes
+ HermanLeung
+ JohnLee
266–275
W17-6530
Understanding Constraints on Non-Projectivity Using Novel Measures
- Himanshu Yadav
- Ashwini Vaidya
- Samar Husain
+ HimanshuYadav
+ AshwiniVaidya
+ SamarHusain
276–286
W17-6531
Core Arguments in Universal Dependencies
- Daniel Zeman
+ DanielZeman
287–296
W17-6532
@@ -12417,11 +12417,11 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me
Proceedings of Language, Ontology, Terminology and Knowledge Structures Workshop (LOTKS 2017)
- Francesca Frontini
- Larisa Grčić Simeunović
- Špela Vintar
- Anas Fahad Khan
- Artemis Parvisi
+ FrancescaFrontini
+ LarisaGrčić Simeunović
+ ŠpelaVintar
+ Anas FahadKhan
+ ArtemisParvisi
Association for Computational Linguistics
Montpellier, France
September @@ -12432,77 +12432,77 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Exploratory Analysis for Ontology Learning from Social Events on Social Media Streaming in <fixed-case>S</fixed-case>panish - Enrique Valeriano - Arturo Oncevay-Marcos + EnriqueValeriano + ArturoOncevay-Marcos W17-7001 Creating a gold standard corpus for terminological annotation from online forum data - Anna Hätty - Simon Tannert - Ulrich Heid + AnnaHätty + SimonTannert + UlrichHeid W17-7002 A conceptual ontology in the water domain of knowledge to bridge the lexical semantics of stratified discursive strata - Jean-Louis Janin - Henri Portine + Jean-LouisJanin + HenriPortine W17-7003 <fixed-case>G</fixed-case>eo<fixed-case>D</fixed-case>ict: an integrated gazetteer - Jacques Fize - Gaurav Shrivastava - Pierre André Ménard + JacquesFize + GauravShrivastava + Pierre AndréMénard W17-7004 Fine-grained domain classification of text using <fixed-case>TERMIUM</fixed-case> Plus - Gabriel Bernier-Colborne - Caroline Barrière - Pierre André Ménard + GabrielBernier-Colborne + CarolineBarrière + Pierre AndréMénard W17-7005 <fixed-case>TBX</fixed-case> in <fixed-case>ODD</fixed-case>: Schema-agnostic specification and documentation for <fixed-case>T</fixed-case>erm<fixed-case>B</fixed-case>ase e<fixed-case>X</fixed-case>change - Stefan Pernes - Laurent Romary + StefanPernes + LaurentRomary W17-7006 Enrichment of <fixed-case>F</fixed-case>rench Biomedical Ontologies with <fixed-case>UMLS</fixed-case> Concepts and Semantic Types for Biomedical Named Entity Recognition Though Ontological Semantic Annotation - Andon Tchechmedjiev - Clément Jonquet + AndonTchechmedjiev + ClémentJonquet W17-7007 Experiments in taxonomy induction in <fixed-case>S</fixed-case>panish and <fixed-case>F</fixed-case>rench - Irene Renau - Rogelio Nazar - Rafael Marín + IreneRenau + RogelioNazar + RafaelMarín W17-7008 A statistical model for morphology inspired by the Amis language - Isabelle Bril - Achraf Lassoued - Michel de Rougemont + IsabelleBril + AchrafLassoued + Michelde Rougemont W17-7009 Developing <fixed-case>L</fixed-case>ex<fixed-case>O</fixed-case>: a Collaborative Editor of Multilingual Lexica and Termino-Ontological Resources in the Humanities - Andrea Bellandi - Emiliano Giovannetti - Silvia Piccini - Anja Weingart + AndreaBellandi + EmilianoGiovannetti + SilviaPiccini + AnjaWeingart W17-7010 Designing an Ontology for the Study of Ritual in Ancient <fixed-case>G</fixed-case>reek Tragedy - Gloria Mugelli - Andrea Bellandi - Federico Boschetti - Anas Fahad Khan + GloriaMugelli + AndreaBellandi + FedericoBoschetti + Anas FahadKhan W17-7011
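A note on the <fixed-case> wrappers that pepper the titles in these hunks: they mark letters whose capitalization must survive downstream case normalization. The sketch below shows one plausible way a consumer might flatten them, dropping the tags for plain-text display and bracing the protected spans for BibTeX; it is an illustration, not the Anthology's own export code, and the names in it are invented for the example.

    # Illustrative handling of <fixed-case> markup in titles.
    import re

    FIXED_CASE = re.compile(r"<fixed-case>(.*?)</fixed-case>")

    def to_display(title):
        # Plain text: drop the tags, keep the letters.
        return FIXED_CASE.sub(r"\1", title)

    def to_bibtex(title):
        # BibTeX: brace the span so bibliography styles cannot lowercase it.
        return FIXED_CASE.sub(r"{\1}", title)

    t = "<fixed-case>C</fixed-case>hinese Spelling Check based on N-gram and String Matching Algorithm"
    print(to_display(t))  # Chinese Spelling Check based on ...
    print(to_bibtex(t))   # {C}hinese Spelling Check based on ...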
@@ -12608,56 +12608,56 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me The Effect of Negative Sampling Strategy on Capturing Semantic Similarity in Document Embeddings - Marzieh Saeidi - Ritwik Kulkarni - Theodosia Togia - Michele Sama + MarziehSaeidi + RitwikKulkarni + TheodosiaTogia + MicheleSama 1–8 W17-7301 Building Graph Representations of Deep Vector Embeddings - Dario Garcia-Gasulla - Armand Vilalta - Ferran Parés - Jonathan Moreno - Eduard Ayguadé - Jesús Labarta - Ulises Cortés - Toyotaro Suzumura + DarioGarcia-Gasulla + ArmandVilalta + FerranParés + JonathanMoreno + EduardAyguadé + JesúsLabarta + UlisesCortés + ToyotaroSuzumura 9–15 W17-7302 Class Disjointness Constraints as Specific Objective Functions in Neural Network Classifiers - François Scharffe + FrançoisScharffe 16–23 W17-7303 Full-Network Embedding in a Multimodal Embedding Pipeline - Armand Vilalta - Dario Garcia-Gasulla - Ferran Parés - Jonathan Moreno - Eduard Ayguadé - Jesus Labarta - Ulises Cortés - Toyotaro Suzumura + ArmandVilalta + DarioGarcia-Gasulla + FerranParés + JonathanMoreno + EduardAyguadé + JesusLabarta + UlisesCortés + ToyotaroSuzumura 24–32 W17-7304 Extracting Tags from Large Raw Texts Using End-to-End Memory Networks - Feras Al Kassar - Frédéric Armetta + FerasAl Kassar + FrédéricArmetta 33–40 W17-7305 Dealing with Co-reference in Neural Semantic Parsing - Rik van Noord - Johan Bos + Rikvan Noord + JohanBos 41–49 W17-7306 @@ -13275,7 +13275,7 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Proceedings of the 16th International Workshop on Treebanks and Linguistic Theories W17-76 - Jan Hajič + JanHajič
Prague, Czech Republic
2017 @@ -13284,177 +13284,177 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Annotating and parsing to semantic frames: feedback from the <fixed-case>F</fixed-case>rench <fixed-case>F</fixed-case>rame<fixed-case>N</fixed-case>et project - Marie Candito + MarieCandito v W17-7601 Downstream use of syntactic analysis: does representation matter? - Lilja Øvrelid + LiljaØvrelid vi W17-7602 Distributional regularities of verbs and verbal adjectives: Treebank evidence and broader implications - Daniël de Kok - Patricia Fischer - Corina Dima - Erhard Hinrichs + Daniëlde Kok + PatriciaFischer + CorinaDima + ErhardHinrichs 1–9 W17-7603 <fixed-case>UD</fixed-case> Annotatrix: An annotation tool for Universal Dependencies - Francis M. Tyers - Mariya Sheyanova - Jonathan North Washington + Francis M.Tyers + MariyaSheyanova + Jonathan NorthWashington 10–17 W17-7604 The Treebanked Conspiracy. Actors and Actions in Bellum Catilinae - Marco Passarotti - Berta González Saavedra + MarcoPassarotti + BertaGonzález Saavedra 18–26 W17-7605 Universal Dependencies-based syntactic features in detecting human translation varieties - Maria Kunilovskaya - Andrey Kutuzov + MariaKunilovskaya + AndreyKutuzov 27–36 W17-7606 Graph Convolutional Networks for Named Entity Recognition - Alberto Cetoli - Stefano Bragaglia - Andrew O’Harney - Marc Sloan + AlbertoCetoli + StefanoBragaglia + AndrewO’Harney + MarcSloan 37–45 W17-7607 Extensions to the <fixed-case>G</fixed-case>r<fixed-case>ETEL</fixed-case> Treebank Query Application - Jan Odijk - Martijn van der Klis - Sheean Spoel + JanOdijk + Martijnvan der Klis + SheeanSpoel 46–55 W17-7608 The Relation of Form and Function in Linguistic Theory and in a Multilayer Treebank - Eduard Bejček - Eva Hajičová - Marie Mikulová - Jarmila Panevová + EduardBejček + EvaHajičová + MarieMikulová + JarmilaPanevová 56–63 W17-7609 Literal readings of multiword expressions: as scarce as hen’s teeth - Agata Savary - Silvio Ricardo Cordeiro + AgataSavary + Silvio RicardoCordeiro 64–72 W17-7610 Querying Multi-word Expressions Annotation with <fixed-case>CQL</fixed-case> - Natalia Klyueva - Anna Vernerová - Behrang Qasemizadeh + NataliaKlyueva + AnnaVernerová + BehrangQasemizadeh 73–79 W17-7611 <fixed-case>REALEC</fixed-case> learner treebank: annotation principles and evaluation of automatic parsing - Olga Lyashevskaya - Irina Panteleeva + OlgaLyashevskaya + IrinaPanteleeva 80–87 W17-7612 A semiautomatic lemmatisation procedure for treebanks. Old <fixed-case>E</fixed-case>nglish strong and weak verbs - Marta Tío Sáenz - Darío Metola Rodríguez + MartaTío Sáenz + DaríoMetola Rodríguez 88–94 W17-7613 Data point selection for genre-aware parsing - Ines Rehbein - Felix Bildhauer + InesRehbein + FelixBildhauer 95–105 W17-7614 Error Analysis of Cross-lingual Tagging and Parsing - Rudolf Rosa - Zdeněk Žabokrtský + RudolfRosa + ZdeněkŽabokrtský 106–118 W17-7615 A <fixed-case>T</fixed-case>elugu treebank based on a grammar book - Taraka Rama - Sowmya Vajjala + TarakaRama + SowmyaVajjala 119–128 W17-7616 Recent Developments within <fixed-case>B</fixed-case>ul<fixed-case>T</fixed-case>ree<fixed-case>B</fixed-case>ank - Petya Osenova - Kiril Simov + PetyaOsenova + KirilSimov 129–137 W17-7617 Towards a dependency-annotated treebank for <fixed-case>B</fixed-case>ambara - Ekaterina Aplonova - Francis M. 
Tyers + EkaterinaAplonova + Francis M.Tyers 138–145 W17-7618 Merging the Trees - Building a Morphological Treebank for <fixed-case>G</fixed-case>erman from Two Resources - Petra Steiner + PetraSteiner 146–160 W17-7619 What <fixed-case>I</fixed-case> think when <fixed-case>I</fixed-case> think about treebanks - Anders Søgaard + AndersSøgaard 161–166 W17-7620 Syntactic Semantic Correspondence in Dependency Grammar - Cătălina Mărănduc - Cătălin Mititelu - Victoria Bobicev + CătălinaMărănduc + CătălinMititelu + VictoriaBobicev 167–180 W17-7621 Multi-word annotation in syntactic treebanks - Propositions for Universal Dependencies - Sylvain Kahane - Marine Courtin - Kim Gerdes + SylvainKahane + MarineCourtin + KimGerdes 181–189 W17-7622 A Universal Dependencies Treebank for <fixed-case>M</fixed-case>arathi - Vinit Ravishankar + VinitRavishankar 190–200 W17-7623 Dangerous Relations in Dependency Treebanks - Chiara Alzetta - Felice Dell’Orletta - Simonetta Montemagni - Giulia Venturi + ChiaraAlzetta + FeliceDell’Orletta + SimonettaMontemagni + GiuliaVenturi 201–210 W17-7624 @@ -13472,8 +13472,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Proceedings of the 1st Workshop on Natural Language Processing and Information Retrieval associated with RANLP 2017 - Mireille Makary - Michael Oakes + MireilleMakary + MichaelOakes INCOMA Inc.
Varna, Bulgaria
September @@ -13484,9 +13484,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Deception Detection for the <fixed-case>R</fixed-case>ussian Language: Lexical and Syntactic Parameters - Dina Pisarevskaya - Tatiana Litvinova - Olga Litvinova + DinaPisarevskaya + TatianaLitvinova + OlgaLitvinova 1–10 10.26615/978-954-452-038-0_001 https://doi.org/10.26615/978-954-452-038-0_001 @@ -13494,10 +13494,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me o<fixed-case>IQ</fixed-case>a: An Opinion Influence Oriented Question Answering Framework with Applications to Marketing Domain - Dumitru-Clementin Cercel - Cristian Onose - Stefan Trausan-Matu - Florin Pop + Dumitru-ClementinCercel + CristianOnose + StefanTrausan-Matu + FlorinPop 11–18 10.26615/978-954-452-038-0_002 https://doi.org/10.26615/978-954-452-038-0_002 @@ -13505,9 +13505,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Automatic Summarization of Online Debates - Nattapong Sanchan - Ahmet Aker - Kalina Bontcheva + NattapongSanchan + AhmetAker + KalinaBontcheva 19–27 10.26615/978-954-452-038-0_003 https://doi.org/10.26615/978-954-452-038-0_003 @@ -13515,10 +13515,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me A Game with a Purpose for Automatic Detection of Children’s Speech Disabilities using Limited Speech Resources - Reem Salem - Mohamed Elmahdy - Slim Abdennadher - Injy Hamed + ReemSalem + MohamedElmahdy + SlimAbdennadher + InjyHamed 28–34 10.26615/978-954-452-038-0_004 https://doi.org/10.26615/978-954-452-038-0_004 @@ -13530,8 +13530,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Proceedings of the Workshop Knowledge Resources for the Socio-Economic Sciences and Humanities associated with RANLP 2017 KalliopiZervanou PetyaOsenova - Eveline Wandl-Vogt - Dan Cristea + EvelineWandl-Vogt + DanCristea INCOMA Inc.
Varna
September @@ -13542,8 +13542,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Connecting people digitally - a semantic web based approach to linking heterogeneous data sets - Katalin Lejtovicz - Amelie Dorn + KatalinLejtovicz + AmelieDorn 1–8 10.26615/978-954-452-040-3_001 https://doi.org/10.26615/978-954-452-040-3_001 @@ -13551,9 +13551,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me A Multiform Balanced Dependency Treebank for <fixed-case>R</fixed-case>omanian - Mihaela Colhon - Cătălina Mărănduc - Cătălin Mititelu + MihaelaColhon + CătălinaMărănduc + CătălinMititelu 9–18 10.26615/978-954-452-040-3_002 https://doi.org/10.26615/978-954-452-040-3_002 @@ -13561,11 +13561,11 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>GR</fixed-case>a<fixed-case>SP</fixed-case>: Grounded Representation and Source Perspective - Antske Fokkens - Piek Vossen - Marco Rospocher - Rinke Hoekstra - Willem Robert van Hage + AntskeFokkens + PiekVossen + MarcoRospocher + RinkeHoekstra + Willem Robertvan Hage 19–25 10.26615/978-954-452-040-3_003 https://doi.org/10.26615/978-954-452-040-3_003 @@ -13573,7 +13573,7 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Educational Content Generation for Business and Administration <fixed-case>FL</fixed-case> Courses with the <fixed-case>NBU</fixed-case> <fixed-case>PLT</fixed-case> Platform - Maria Stambolieva + MariaStambolieva 26–30 10.26615/978-954-452-040-3_004 https://doi.org/10.26615/978-954-452-040-3_004 @@ -13581,15 +13581,15 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Machine Learning Models of Universal Grammar Parameter Dependencies - Dimitar Kazakov - Guido Cordoni - Andrea Ceolin - Monica-Alexandrina Irimia - Shin-Sook Kim - Dimitris Michelioudakis - Nina Radkevich - Cristina Guardiano - Giuseppe Longobardi + DimitarKazakov + GuidoCordoni + AndreaCeolin + Monica-AlexandrinaIrimia + Shin-SookKim + DimitrisMichelioudakis + NinaRadkevich + CristinaGuardiano + GiuseppeLongobardi 31–37 10.26615/978-954-452-040-3_005 https://doi.org/10.26615/978-954-452-040-3_005 @@ -13613,11 +13613,11 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Enhancing Machine Translation of Academic Course Catalogues with Terminological Resources - Randy Scansani - Silvia Bernardini - Adriano Ferraresi - Federico Gaspari - Marcello Soffritti + RandyScansani + SilviaBernardini + AdrianoFerraresi + FedericoGaspari + MarcelloSoffritti 1–10 10.26615/978-954-452-042-7_001 https://doi.org/10.26615/978-954-452-042-7_001 @@ -13625,9 +13625,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Experiments in Non-Coherent Post-editing - Cristina Toledo Báez - Moritz Schaeffer - Michael Carl + CristinaToledo Báez + MoritzSchaeffer + MichaelCarl 11–20 10.26615/978-954-452-042-7_002 https://doi.org/10.26615/978-954-452-042-7_002 @@ -13635,7 +13635,7 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Comparing Machine Translation and Human Translation: A Case Study - Lars Ahrenberg + LarsAhrenberg 21–28 10.26615/978-954-452-042-7_003 https://doi.org/10.26615/978-954-452-042-7_003 @@ -13643,8 +13643,8 @@ with emotion annotation. 
We (a) analyse annotation reliability and annotation me <fixed-case>T</fixed-case>rans<fixed-case>B</fixed-case>ank: Metadata as the Missing Link between <fixed-case>NLP</fixed-case> and Traditional Translation Studies - Michael Ustaszewski - Andy Stauder + MichaelUstaszewski + AndyStauder 29–35 10.26615/978-954-452-042-7_004 https://doi.org/10.26615/978-954-452-042-7_004 @@ -13652,11 +13652,11 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Interpreting Strategies Annotation in the <fixed-case>WAW</fixed-case> Corpus - Irina Temnikova - Ahmed Abdelali - Samy Hedaya - Stephan Vogel - Aishah Al Daher + IrinaTemnikova + AhmedAbdelali + SamyHedaya + StephanVogel + AishahAl Daher 36–43 10.26615/978-954-452-042-7_005 https://doi.org/10.26615/978-954-452-042-7_005 @@ -13664,8 +13664,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Translation Memory Systems Have a Long Way to Go - Andrea Silvestre Baquero - Ruslan Mitkov + AndreaSilvestre Baquero + RuslanMitkov 44–51 10.26615/978-954-452-042-7_006 https://doi.org/10.26615/978-954-452-042-7_006 @@ -13673,8 +13673,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Building Dialectal <fixed-case>A</fixed-case>rabic Corpora - Hani Elgabou - Dimitar Kazakov + HaniElgabou + DimitarKazakov 52–57 10.26615/978-954-452-042-7_007 https://doi.org/10.26615/978-954-452-042-7_007 @@ -13682,8 +13682,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Towards Producing Human-Validated Translation Resources for the <fixed-case>F</fixed-case>ula language through <fixed-case>W</fixed-case>ord<fixed-case>N</fixed-case>et Linking - Khalil Mrini - Martin Benjamin + KhalilMrini + MartinBenjamin 58–64 10.26615/978-954-452-042-7_008 https://doi.org/10.26615/978-954-452-042-7_008 @@ -13707,7 +13707,7 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Document retrieval and question answering in medical documents. A large-scale corpus challenge. - Curea Eric + CureaEric 1–7 10.26615/978-954-452-044-1_001 https://doi.org/10.26615/978-954-452-044-1_001 @@ -13715,8 +13715,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Adapting the <fixed-case>TTL</fixed-case> <fixed-case>R</fixed-case>omanian <fixed-case>POS</fixed-case> Tagger to the Biomedical Domain - Maria Mitrofan - Radu Ion + MariaMitrofan + RaduIon 8–14 10.26615/978-954-452-044-1_002 https://doi.org/10.26615/978-954-452-044-1_002 @@ -13724,9 +13724,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Discourse-Wide Extraction of Assay Frames from the Biological Literature - Dayne Freitag - Paul Kalmar - Eric Yeh + DayneFreitag + PaulKalmar + EricYeh 15–23 10.26615/978-954-452-044-1_003 https://doi.org/10.26615/978-954-452-044-1_003 @@ -13734,7 +13734,7 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Classification based extraction of numeric values from clinical narratives - Maximilian Zubke + MaximilianZubke 24–31 10.26615/978-954-452-044-1_004 https://doi.org/10.26615/978-954-452-044-1_004 @@ -13742,8 +13742,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Understanding of unknown medical words - Natalia Grabar - Thierry Hamon + NataliaGrabar + ThierryHamon 32–41 10.26615/978-954-452-044-1_005 https://doi.org/10.26615/978-954-452-044-1_005 @@ -13751,11 +13751,11 @@ with emotion annotation. 
We (a) analyse annotation reliability and annotation me Entity-Centric Information Access with Human in the Loop for the Biomedical Domain - Seid Muhie Yimam - Steffen Remus - Alexander Panchenko - Andreas Holzinger - Chris Biemann + Seid MuhieYimam + SteffenRemus + AlexanderPanchenko + AndreasHolzinger + ChrisBiemann 42–48 10.26615/978-954-452-044-1_006 https://doi.org/10.26615/978-954-452-044-1_006 @@ -13763,8 +13763,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me One model per entity: using hundreds of machine learning models to recognize and normalize biomedical names in text - Victor Bellon - Raul Rodriguez-Esteban + VictorBellon + RaulRodriguez-Esteban 49–54 10.26615/978-954-452-044-1_007 https://doi.org/10.26615/978-954-452-044-1_007 @@ -13772,8 +13772,8 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Towards Confidence Estimation for Typed Protein-Protein Relation Extraction - Camilo Thorne - Roman Klinger + CamiloThorne + RomanKlinger 55–63 10.26615/978-954-452-044-1_008 https://doi.org/10.26615/978-954-452-044-1_008 @@ -13781,10 +13781,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Identification of Risk Factors in Clinical Texts through Association Rules - Svetla Boytcheva - Ivelina Nikolova - Galia Angelova - Zhivko Angelov + SvetlaBoytcheva + IvelinaNikolova + GaliaAngelova + ZhivkoAngelov 64–72 10.26615/978-954-452-044-1_009 https://doi.org/10.26615/978-954-452-044-1_009 @@ -13792,11 +13792,11 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me <fixed-case>POMELO</fixed-case>: <fixed-case>M</fixed-case>edline corpus with manually annotated food-drug interactions - Thierry Hamon - Vincent Tabanou - Fleur Mougin - Natalia Grabar - Frantz Thiessard + ThierryHamon + VincentTabanou + FleurMougin + NataliaGrabar + FrantzThiessard 73–80 10.26615/978-954-452-044-1_010 https://doi.org/10.26615/978-954-452-044-1_010 @@ -13804,10 +13804,10 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Annotation of Clinical Narratives in <fixed-case>B</fixed-case>ulgarian language - Ivajlo Radev - Kiril Simov - Galia Angelova - Svetla Boytcheva + IvajloRadev + KirilSimov + GaliaAngelova + SvetlaBoytcheva 81–87 10.26615/978-954-452-044-1_011 https://doi.org/10.26615/978-954-452-044-1_011 @@ -13830,9 +13830,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me A Diachronic Corpus for <fixed-case>R</fixed-case>omanian (<fixed-case>R</fixed-case>o<fixed-case>D</fixed-case>ia) - Ludmila Malahov - Cătălina Mărănduc - Alexandru Colesnicov + LudmilaMalahov + CătălinaMărănduc + AlexandruColesnicov 1–9 10.26615/978-954-452-046-5_001 http://doi.org/10.26615/978-954-452-046-5_001 @@ -13840,9 +13840,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Tools for Building a Corpus to Study the Historical and Geographical Variation of the <fixed-case>R</fixed-case>omanian Language - Victoria Bobicev - Cătălina Mărănduc - Cenel Augusto Perez + VictoriaBobicev + CătălinaMărănduc + Cenel AugustoPerez 10–19 10.26615/978-954-452-046-5_002 http://doi.org/10.26615/978-954-452-046-5_002 @@ -13850,12 +13850,12 @@ with emotion annotation. 
We (a) analyse annotation reliability and annotation me Multilingual Ontologies for the Representation and Processing of Folktales - Thierry Declerck - Anastasija Aman - Martin Banzer - Dominik Macháček - Lisa Schäfer - Natalia Skachkova + ThierryDeclerck + AnastasijaAman + MartinBanzer + DominikMacháček + LisaSchäfer + NataliaSkachkova 20–23 10.26615/978-954-452-046-5_003 http://doi.org/10.26615/978-954-452-046-5_003 @@ -13863,9 +13863,9 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me On the annotation of vague expressions: a case study on <fixed-case>R</fixed-case>omanian historical texts - Anca Dinu - Walther von Hahn - Cristina Vertan + AncaDinu + Walthervon Hahn + CristinaVertan 24–31 10.26615/978-954-452-046-5_004 http://doi.org/10.26615/978-954-452-046-5_004 @@ -13873,11 +13873,11 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Language Technologies in Teaching Bulgarian at Primary and Secondary School Level: the <fixed-case>NBU</fixed-case> Platform of Language Teaching (<fixed-case>PLT</fixed-case>) - Maria Stambolieva - Valentina Ivanova - Mariana Raykova - Milka Hadjikoteva - Mariya Neykova + MariaStambolieva + ValentinaIvanova + MarianaRaykova + MilkaHadjikoteva + MariyaNeykova 32–38 10.26615/978-954-452-046-5_005 http://doi.org/10.26615/978-954-452-046-5_005 @@ -13885,7 +13885,7 @@ with emotion annotation. We (a) analyse annotation reliability and annotation me Natural Language Processing in Political Campaigns - Cristina Moise + CristinaMoise 39–43 10.26615/978-954-452-046-5_006 http://doi.org/10.26615/978-954-452-046-5_006 diff --git a/data/xml/W19.xml index 02495714ef..85c327331c 100644 --- a/data/xml/W19.xml +++ b/data/xml/W19.xml @@ -16525,4 +16525,369 @@ In this tutorial on MT and post-editing we would like to continue sharing the la 10.18653/v1/W19-8673
+ + + Proceedings of the Human-Informed Translation and Interpreting Technology Workshop (HiT-IT 2019) + W19-87 + Incoma Ltd., Shoumen, Bulgaria +
Varna, Bulgaria
+ September + 2019 + + + W19-8700 + + + Comparison between Automatic and Human Subtitling: A Case Study with Game of Thrones + SabrinaBaldo de Brébisson + 1–10 + In this submission, I would like to share my experiences with the software DeepL and the comparison analysis I have made with human subtitling offered by the DVD version of the corpus I have chosen as the topic of my study – the eight Seasons of Game of Thrones. The idea is to study if the version proposed by an automatic translation program could be used as a first draft for the professional subtitler. It is expected that the latter would work on the form of the subtitles, that is to say mainly on their length, in a second step. + W19-8701 + 10.26615/issn.2683-0078.2019_001 + + + Parallel Corpus of <fixed-case>C</fixed-case>roatian-<fixed-case>I</fixed-case>talian Administrative Texts + MarijaBrkic Bakaric + IvanaLalli Pacelat + 11–18 + Parallel corpora constitute a unique resource for providing assistance to human translators. The selection and preparation of the parallel corpora also conditions the quality of the resulting MT engine. Since Croatian is a national language and Italian is officially recognized as a minority language in seven cities and twelve municipalities of Istria County, a large amount of parallel texts is produced on a daily basis. However, there have been no attempts in using these texts for compiling a parallel corpus. A domain-specific sentence-aligned parallel Croatian-Italian corpus of administrative texts would be of high value in creating different language tools and resources. The aim of this paper is, therefore, to explore the value of parallel documents which are publicly available mostly in pdf format and to investigate the use of automatically-built dictionaries in corpus compilation. The effects that a document format and, consequently sentence splitting, and the dictionary input have on the sentence alignment process are manually evaluated. + W19-8702 + 10.26615/issn.2683-0078.2019_002 + + + What Influences the Features of Post-editese? A Preliminary Study + SheilaCastilho + NatáliaResende + RuslanMitkov + 19–27 + While a number of studies have shown evidence of translationese phenomena, that is, statistical differences between original texts and translated texts (Gellerstam, 1986), results of studies searching for translationese features in postedited texts (what has been called “posteditese” (Daems et al., 2017)) have presented mixed results. This paper reports a preliminary study aimed at identifying the presence of post-editese features in machine-translated post-edited texts and at understanding how they differ from translationese features. We test the influence of factors such as post-editing (PE) levels (full vs. light), translation proficiency (professionals vs. students) and text domain (news vs. literary). Results show evidence of post-editese features, especially in light PE texts and in certain domains. + W19-8703 + 10.26615/issn.2683-0078.2019_003 + + + Designing a Frame-Semantic Machine Translation Evaluation Metric + OliverCzulo + TiagoTorrent + ElyMatos + Alexandre Diniz daCosta + DebanjanaKar + 28–35 + We propose a metric for machine translation evaluation based on frame semantics which does not require the use of reference translations or human corrections, but is aimed at comparing original and translated output directly. 
The metric is described on the basis of an existing manual frame-semantic annotation of a parallel corpus with an English original and a Brazilian Portuguese and a German translation. We discuss implications of our metric’s design, including the potential of scaling it for multiple languages. + W19-8704 + 10.26615/issn.2683-0078.2019_004 + + + Human Evaluation of Neural Machine Translation: The Case of Deep Learning + MarieEscribe + 36–46 + Recent advances in artificial neural networks now have a great impact on translation technology. A considerable achievement was reached in this field with the publication of L’Apprentissage Profond. This book, originally written in English (Deep Learning), was entirely machine-translated into French and post-edited by several experts. In this context, it appears essential to have a clear vision of the performance of MT tools. Providing an evaluation of NMT is precisely the aim of the present research paper. To accomplish this objective, a framework for error categorisation was built and a comparative analysis of the raw translation output and the post-edited version was performed with the purpose of identifying recurring patterns of errors. The findings showed that even though some grammatical errors were spotted, the output was generally correct from a linguistic point of view. The most recurring errors are linked to the specialised terminology employed in this book. Further errors include parts of text that were not translated as well as edits based on stylistic preferences. The major part of the output was not acceptable as such and required several edits per segment, but some sentences were of publishable quality and were therefore left untouched in the final version. + W19-8705 + 10.26615/issn.2683-0078.2019_005 + + + Translationese Features as Indicators of Quality in <fixed-case>E</fixed-case>nglish-<fixed-case>R</fixed-case>ussian Human Translation + MariaKunilovskaya + EkaterinaLapshinova-Koltunski + 47–56 + We use a range of morpho-syntactic features inspired by research in register studies (e.g. Biber, 1995; Neumann, 2013) and translation studies (e.g. Ilisei et al., 2010; Zanettin, 2013; Kunilovskaya and Kutuzov, 2018) to reveal the association between translationese and human translation quality. Translationese is understood as any statistical deviations of translations from non-translations (Baker, 1993) and is assumed to affect the fluency of translations, rendering them foreign-sounding and clumsy of wording and structure. This connection is often posited or implied in the studies of translationese or translational varieties (De Sutter et al., 2017), but is rarely directly tested. Our 45 features include frequencies of selected morphological forms and categories, some types of syntactic structures and relations, as well as several overall text measures extracted from Universal Dependencies annotation. The research corpora include English-to-Russian professional and student translations of informational or argumentative newspaper texts and a comparable corpus of non-translated Russian. Our results indicate a lack of direct association between translationese and quality in our data: while our features distinguish translations and non-translations with near-perfect accuracy, the performance of the same algorithm on the quality classes barely exceeds the chance level. 
+ W19-8706 + 10.26615/issn.2683-0078.2019_006 + + + The Punster’s Amanuensis: The Proper Place of Humans and Machines in the Translation of Wordplay + TristanMiller + 57–65 + The translation of wordplay is one of the most extensively researched problems in translation studies, but it has attracted little attention in the fields of natural language processing and machine translation. This is because today’s language technologies treat anomalies and ambiguities in the input as things that must be resolved in favour of a single “correct” interpretation, rather than preserved and interpreted in their own right. But if computers cannot yet process such creative language on their own, can they at least provide specialized support to translation professionals? In this paper, I survey the state of the art relevant to computational processing of humorous wordplay and put forth a vision of how existing theories, resources, and technologies could be adapted and extended to support interactive, computer-assisted translation. + W19-8707 + 10.26615/issn.2683-0078.2019_007 + + + Comparing a Hand-crafted to an Automatically Generated Feature Set for Deep Learning: Pairwise Translation Evaluation + DespoinaMouratidis + Katia LidaKermanidis + 66–74 + The automatic evaluation of machine translation (MT) has proven to be a very significant research topic. Most automatic evaluation methods focus on the evaluation of the output of MT as they compute similarity scores that represent translation quality. This work targets the performance of MT evaluation. We present a general scheme for learning to classify parallel translations, using linguistic information, of two MT model outputs and one human (reference) translation. We present three experiments with this scheme using neural networks (NN). One using string based hand-crafted features (Exp1), the second using automatically trained embeddings from the reference and the two MT outputs (one from a statistical machine translation (SMT) model and the other from a neural machine translation (NMT) model), which are learned using NN (Exp2), and the third experiment (Exp3) that combines information from the other two experiments. The languages involved are English (EN), Greek (GR) and Italian (IT); the segments are educational in domain. The proposed language-independent learning scheme which combines information from the two experiments (experiment 3) achieves higher classification accuracy compared with models using BLEU score information as well as other classification approaches, such as Random Forest (RF) and Support Vector Machine (SVM). + W19-8708 + 10.26615/issn.2683-0078.2019_008 + + + Differences between <fixed-case>SMT</fixed-case> and <fixed-case>NMT</fixed-case> Output - a Translators’ Point of View + JonathanMutal + LiseVolkart + PierretteBouillon + SabrinaGirletti + PaulaEstrella + 75–81 + In this study, we compare the output quality of two MT systems, a statistical (SMT) and a neural (NMT) engine, customised for Swiss Post’s Language Service using the same training data. We focus on the point of view of professional translators and investigate how they perceive the differences between the MT output and a human reference (namely deletions, substitutions, insertions and word order). Our findings show that translators more frequently consider these differences to be errors in SMT than NMT, and that deletions are the most serious errors in both architectures. 
We also observe lower agreement on differences to be corrected in NMT than in SMT, suggesting that errors are easier to identify in SMT. These findings confirm the ability of NMT to produce correct paraphrases, which could also explain why BLEU is often considered an inadequate metric to evaluate the performance of NMT systems. + W19-8709 + 10.26615/issn.2683-0078.2019_009 + + + The <fixed-case>C</fixed-case>hinese/<fixed-case>E</fixed-case>nglish Political Interpreting Corpus (<fixed-case>CEPIC</fixed-case>): A New Electronic Resource for Translators and Interpreters + JunPan + 82–88 + The Chinese/English Political Interpreting Corpus (CEPIC) is a new electronic and open access resource developed for translators and interpreters, especially those working with political text types. Over 6 million word tokens in size, the online corpus consists of transcripts of Chinese (Cantonese & Putonghua) / English political speeches and their translated and interpreted texts. It includes rich meta-data and is POS-tagged and annotated with prosodic and paralinguistic features that are of concern to spoken language and interpreting. The online platform of the CEPIC features main functions including Keyword Search, Word Collocation and Expanded Keyword in Context, which are illustrated in the paper. The CEPIC can shed light on online translation and interpreting corpora development in the future. + W19-8710 + 10.26615/issn.2683-0078.2019_010 + + + Translation Quality Assessment Tools and Processes in Relation to <fixed-case>CAT</fixed-case> Tools + ViktoriyaPetrova + 89–97 + Modern translation QA tools are the latest attempt to overcome the inevitable subjective component of human revisers. This paper analyzes the current situation in the translation industry with respect to those tools and their relationship with CAT tools. The adoption of international standards has set the basic frame that defines “quality”. Because of the clear impossibility to develop a universal QA tool, all of the existing ones have in common a wide variety of settings for the user to choose from. A brief comparison is made between the most popular standalone QA tools. In order to verify their results in practice, QA outputs from two of those tools have been compared. Polls that cover a period of 12 years have been collected. Their participants explained what practices they adopted in order to guarantee quality. + W19-8711 + 10.26615/issn.2683-0078.2019_011 + + + Corpus Linguistics, Translation and Error Analysis + MariaStambolieva + 98–104 + The paper presents a study of the French Imparfait and its functional equivalents in Bulgarian and English in view of applications in machine translation and error analysis. The aims of the study are: 1/ based on the analysis of a corpus of text, to validate/revise earlier research on the values of the French Imparfait, 2/ to define the contextual factors pointing to the realisation of one or another value of the forms, 3/ based on the analysis of aligned translations, to identify the translation equivalents of these values, 4/ to formulate translation rules, 5/ based on the analysis of the translation rules, to refine the annotation modules of the environment used – the NBU e-Platform for language teaching and research. 
+ W19-8712 + 10.26615/issn.2683-0078.2019_012 + + + Human-Informed Speakers and Interpreters Analysis in the <fixed-case>WAW</fixed-case> Corpus and an Automatic Method for Calculating Interpreters’ Décalage + IrinaTemnikova + AhmedAbdelali + SouhilaDjabri + SamyHedaya + 105–115 + This article presents a multi-faceted analysis of a subset of interpreted conference speeches from the WAW corpus for the English-Arabic language pair. We analyze several speakers’ and interpreters’ variables via manual annotation and automatic methods. We propose a new automatic method for calculating interpreters’ décalage based on Automatic Speech Recognition (ASR) and automatic alignment of named entities and content words between speaker and interpreter. The method is evaluated by two human annotators who have expertise in interpreting and Interpreting Studies and shows highly satisfactory results, accompanied by high inter-annotator agreement. We provide insights about the relations of speakers’ variables, interpreters’ variables and décalage and discuss them from an Interpreting Studies and interpreting practice point of view. We had interesting findings about interpreters’ behavior which need to be extended to a larger number of conference sessions in our future research. + W19-8713 + 10.26615/issn.2683-0078.2019_013 + + + Towards a Proactive <fixed-case>MWE</fixed-case> Terminological Platform for Cross-Lingual Mediation in the Age of Big Data + Benjamin K.Tsou + KapoChow + JUNRUNie + YuanYuan + 116–121 + The emergence of China as a global economic power in the 21st Century has brought about surging needs for cross-lingual and cross-cultural mediation, typically performed by translators. Advances in Artificial Intelligence and Language Engineering have been bolstered by Machine learning and suitable Big Data cultivation. They have helped to meet some of the translator’s needs, though the technical specialists have not kept pace with the practical and expanding requirements in language mediation. One major technical and linguistic hurdle involves words outside the vocabulary of the translator or the lexical database he/she consults, especially Multi-Word Expressions (Compound Words) in technical subjects. A further problem is in the multiplicity of renditions of a term in the target language. This paper discusses a proactive approach following the successful extraction and application of sizable bilingual Multi-Word Expressions (Compound Words) for language mediation in technical subjects, which do not fall within the expertise of typical translators, who have inadequate appreciation of the range of new technical tools available to help them. Our approach draws on the personal reflections of translators and teachers of translation and is based on the prior R&D efforts relating to 300,000 comparable Chinese-English patents. The subsequent protocol we have developed aims to be proactive in meeting four identified practical challenges in technical translation (e.g. patents). It has broader economic implications in the Age of Big Data (Tsou et al., 2015) and Trade War, as the workload, if not the challenges, increasingly cannot be met by currently available front-line translators. We shall demonstrate how new tools can be harnessed to spearhead the application of language technology not only in language mediation but also in the “teaching” and “learning” of translation. 
It shows how a better appreciation of their needs may enhance the contributions of the technical specialists, and thus increase the resultant synergetic benefits. + W19-8714 + 10.26615/issn.2683-0078.2019_014 + + + Exploring Adequacy Errors in Neural Machine Translation with the Help of Cross-Language Aligned Word Embeddings + MichaelUstaszewski + 122–128 + Neural machine translation (NMT) was shown to produce more fluent output than phrase-based statistical (PBMT) and rule-based machine translation (RBMT). However, improved fluency makes it more difficult for post editors to identify and correct adequacy errors, because unlike RBMT and SMT, in NMT adequacy errors are frequently not anticipated by fluency errors. Omissions and additions of content in otherwise flawlessly fluent NMT output are the most prominent types of such adequacy errors, which can only be detected with reference to source texts. This contribution explores the degree of semantic similarity between source texts, NMT output and post edited output. In this way, computational semantic similarity scores (cosine similarity) are related to human quality judgments. The analyses are based on publicly available NMT post editing data annotated for errors in three language pairs (EN-DE, EN-LV, EN-HR) with the Multidimensional Quality Metrics (MQM). Methodologically, this contribution tests whether cross-language aligned word embeddings as the sole source of semantic information mirror human error annotation. + W19-8715 + 10.26615/issn.2683-0078.2019_015 + + + The Success Story of Mitra Translations + MinaIlieva + MariyaKancheva + 129–133 + Technologies and their constant updates and innovative nature drastically and irreversibly transformed this small business into a leading brand on the translation market, along with just a few other LSPs integrating translation software solutions. Now, we are constantly following the new developments in software updates and online platforms and we are successfully keeping up with any new trend in the field of translation, localization, transcreation, revision, post-editing, etc. Ultimately, we are positive that proper implementation of technology (with focus on quality, cost and time) and hard work are the stepping stones on the way to becoming a trusted translation services provider. + W19-8716 + 10.26615/issn.2683-0078.2019_016 + + + The Four Stages of Machine Translation Acceptance in a Freelancer’s Life + MariaSgourou + 134–135 + Technology is a big challenge and raises many questions and issues when it comes to its application in the translation process, but translation’s biggest problem is not technology; it is rather how technology is perceived by translators. MT developers and researchers should take into account this perception and move towards a more democratized approach to include the base of the translation industry and perhaps its more valuable asset, the translators. + W19-8717 + 10.26615/issn.2683-0078.2019_017 + + + Optimising the Machine Translation Post-editing Workflow + AnnaZaretskaya + 136–139 + In this article, we describe how machine translation is used for post-editing at TransPerfect and the ways in which we optimise the workflow. This includes MT evaluation, MT engine customisation, leveraging MT suggestions compared to TM matches, and the lessons learnt from implementing MT at a large scale. + W19-8718 + 10.26615/issn.2683-0078.2019_018 + +
+ + + Proceedings of the Workshop MultiLing 2019: Summarization Across Languages, Genres and Sources + W19-89 + GeorgeGiannakopoulos + INCOMA Ltd. +
Varna, Bulgaria
+ September + 2019 + + + W19-8900 + + + <fixed-case>RANLP</fixed-case> 2019 Multilingual Headline Generation Task Overview + MarinaLitvak + John M.Conroy + Peter A.Rankel + 1–5 + The objective of the 2019 RANLP Multilingual Headline Generation (HG) Task is to explore some of the challenges highlighted by current state of the art approaches to creating informative headlines for news articles: non-descriptive headlines, out-of-domain training data, generating headlines from long documents which are not well represented by the head heuristic, and dealing with a multilingual domain. This task makes available a large set of training data for headline generation and provides evaluation methods for the task. Our data sets are drawn from Wikinews as well as Wikipedia. Participants were required to generate headlines for at least 3 languages, which were evaluated via automatic methods. A key aspect of the task is multilinguality. The task measures the performance of multilingual headline generation systems using the Wikipedia and Wikinews articles in multiple languages. The objective is to assess the performance of automatic headline generation techniques on text documents covering a diverse range of languages and topics outside the news domain. + W19-8901 + 10.26615/978-954-452-058-8_001 + + + <fixed-case>M</fixed-case>ulti<fixed-case>L</fixed-case>ing 2019: Financial Narrative Summarisation + MahmoudEl-Haj + 6–10 + The Financial Narrative Summarisation task at MultiLing 2019 aims to demonstrate the value and challenges of applying automatic text summarisation to financial text written in English, usually referred to as financial narrative disclosures. The task dataset has been extracted from UK annual reports published in PDF file format. The participants were asked to provide structured summaries, based on real-world, publicly available financial annual reports of UK firms by extracting information from different key sections. Participants were asked to generate summaries that reflect the analysis and assessment of the financial trend of the business over the past year, as provided by annual reports. The evaluation of the summaries was performed using AutoSummENG and Rouge automatic metrics. This paper focuses mainly on the data creation process. + W19-8902 + 10.26615/978-954-452-058-8_002 + + + The Summary Evaluation Task in the <fixed-case>M</fixed-case>ulti<fixed-case>L</fixed-case>ing - <fixed-case>RANLP</fixed-case> 2019 Workshop + GeorgeGiannakopoulos + NikiforosPittaras + 11–16 + This report covers the summarization evaluation task, proposed to the summarization community via the MultiLing 2019 Workshop of the RANLP 2019 conference. The task aims to encourage the development of automatic summarization evaluation methods closely aligned with manual, human-authored summary grades and judgements. A multilingual setting is adopted, building upon a corpus of Wikinews articles across 6 languages (English, Arabic, Romanian, Greek, Spanish and Czech). The evaluation utilizes human (golden) and machine-generated (peer) summaries, which have been assigned human evaluation scores from previous MultiLing tasks. Using these resources, the original corpus is augmented with synthetic data, combining summary texts under three different strategies (reorder, merge and replace), each engineered to introduce noise in the summary in a controlled and quantifiable way. 
We estimate that the utilization of such data can help extract and highlight useful attributes of summary quality estimation, aiding the creation of data-driven automatic methods with an increased correlation to human summary evaluations across domains and languages. This paper provides a brief description of the summary evaluation task, the data generation protocol and the resources made available by the MultiLing community, towards improving automatic summarization evaluation. + W19-8903 + 10.26615/978-954-452-058-8_003 + + + Multi-lingual <fixed-case>W</fixed-case>ikipedia Summarization and Title Generation On Low Resource Corpus + WeiLiu + LeiLi + ZuyingHuang + YinanLiu + 17–25 + MultiLing 2019 Headline Generation Task on Wikipedia Corpus raised a critical and practical problem: a multilingual task on a low resource corpus. In this paper we propose a QDAS extractive summarization model enhanced by sentence2vec and try to apply transfer learning based on a large multilingual pre-trained language model for the Wikipedia Headline Generation task. We treat it as a sequence labeling task and develop two schemes to handle it. Experimental results have shown that a large pre-trained model can effectively utilize learned knowledge to extract certain phrases using low resource supervised data. + W19-8904 + 10.26615/978-954-452-058-8_004 + + + A topic-based sentence representation for extractive text summarization + NikolaosGialitsis + NikiforosPittaras + PanagiotisStamatopoulos + 26–34 + In this study, we examine the effect of probabilistic topic model-based word representations on sentence-based extractive summarization. We formulate the task of summary extraction as a binary classification problem, and we test a variety of machine learning algorithms, exploring a range of different settings. A wide experimental evaluation on the MultiLing 2015 MSS dataset illustrates that topic-based representations can prove beneficial to the extractive summarization process in terms of F1, ROUGE-L and ROUGE-W scores, compared to a TF-IDF baseline, with QDA-based analysis providing the best results. + W19-8905 + 10.26615/978-954-452-058-8_005 + + + A Study on Game Review Summarization + GeorgePanagiotopoulos + GeorgeGiannakopoulos + AntoniosLiapis + 35–43 + Game reviews have constituted a unique means of interaction between players and companies for many years. The dynamics appearing through online publishing have significantly grown the number of comments per game, giving rise to very interesting communities. The growth has, in turn, led to a difficulty in dealing with the volume and varying quality of the comments as a source of information. This work studies whether and how game reviews can be summarized, based on the notions pre-existing in aspect-based summarization and sentiment analysis. The work provides a suggested pipeline of analysis, also offering preliminary findings on whether aspects detected in a set of comments can be consistently evaluated by human users. + W19-8906 + 10.26615/978-954-452-058-8_006 + + + Social Web Observatory: An entity-driven, holistic information summarization platform across sources + LeonidasTsekouras + GeorgiosPetasis + ArisKosmopoulos + 44–52 + The Social Web Observatory is an entity-driven, sentiment-aware, event summarization web platform, combining various methods and tools to overview trends across social media and news sources in Greek. 
SWO crawls, clusters and summarizes information following an entity-centric view of text streams, allowing users to monitor the public sentiment towards a specific person, organization or other entity. In this paper, we overview the platform, outline the analysis pipeline and describe a user study aimed at quantifying the usefulness of the system and especially the meaningfulness and coherence of discovered events. + W19-8907 + 10.26615/978-954-452-058-8_007 + + + <fixed-case>EASY</fixed-case>-M: Evaluation System for Multilingual Summarizers + 53–62 + Automatic text summarization aims at producing a shorter version of a document (or a document set). Evaluation of summarization quality is a challenging task. Because human evaluations are expensive and evaluators often disagree between themselves, many researchers prefer to evaluate their systems automatically, with the help of software tools. Such a tool usually requires a point of reference in the form of one or more human-written summaries for each text in the corpus. Then, a system-generated summary is compared to one or more human-written summaries, according to selected metrics. However, a single metric cannot reflect all quality-related aspects of a summary. In this paper we present the EvAluation SYstem for Multilingual Summarization (EASY-M), which enables the evaluation of system-generated summaries in 17 different languages with several quality measures, based on comparison with their human-generated counterparts. The system also provides comparative results with two built-in baselines. The source code and both online and offline versions of EASY-M are freely available for the NLP community. + W19-8908 + 10.26615/978-954-452-058-8_008 + + + A study of semantic augmentation of word embeddings for extractive summarization + NikiforosPittaras + VangelisKarkaletsis + 63–72 + In this study we examine the effect of semantic augmentation approaches on extractive text summarization. Wordnet hypernym relations are used to extract term-frequency concept information, subsequently concatenated to sentence-level representations produced by aggregated deep neural word embeddings. Multiple dimensionality reduction techniques and combination strategies are examined via feature transformation and clustering methods. An experimental evaluation on the MultiLing 2015 MSS dataset illustrates that semantic information can introduce benefits to the extractive summarization process in terms of F1, ROUGE-1 and ROUGE-2 scores, with LSA-based post-processing introducing the largest improvements. + W19-8909 + 10.26615/978-954-452-058-8_009 + + + <fixed-case>HE</fixed-case>v<fixed-case>AS</fixed-case>: Headline Evaluation and Analysis System + MarinaLitvak + NataliaVanetik + ItzhakEretz Kdosha + 73–80 + Automatic headline generation is a subtask of one-line summarization with many reported applications. Evaluation of systems generating headlines is a very challenging and undeveloped area. We introduce the Headline Evaluation and Analysis System (HEvAS) that performs automatic evaluation of systems in terms of the quality of the generated headlines. HEvAS provides two types of metrics – one which measures the informativeness of a headline, and another that measures its readability. The results of evaluation can be compared to the results of baseline methods which are implemented in HEvAS. The system also performs the statistical analysis of the evaluation results and provides different visualization charts. 
This paper describes all evaluation metrics, baselines, analysis, and architecture utilized by our system. + W19-8910 + 10.26615/978-954-452-058-8_010 + +
+ + + Proceedings of the Workshop on Language Technology for Digital Historical Archives + W19-90 + University of HamburgCristina Vertan + Bulgarian Academy of SciencesPetya Osenova + St. Kliment Ohridski University of Sofia + St. Kliment Ohridski University of SofiaDimitar Iliev + INCOMA Ltd.
Varna, Bulgaria
+ September + 2019 + + + W19-9000 + + + Graphemic ambiguous queries on <fixed-case>A</fixed-case>rabic-scripted historical corpora + AliciaGonzález Martínez + 1–2 + + W19-9001 + 10.26615/978-954-452-059-5_001 + + + Word Clustering for Historical Newspapers Analysis + LidiaPivovarova + ElaineZosa + JaniMarjanen + 3–10 + This paper is a part of a collaboration between computer scientists and historians aimed at the development of novel tools and methods to improve analysis of historical newspapers. We present a case study of ideological terms ending with the -ism suffix in nineteenth century Finnish newspapers. We propose a two-step procedure to trace differences in word usages over time: training of diachronic embeddings on several time slices and then clustering embeddings of selected words together with their neighbours to obtain historical context. The obtained clusters turn out to be useful for historical studies. The paper also discusses specific difficulties related to developing historian-oriented tools. + W19-9002 + 10.26615/978-954-452-059-5_002 + + + Geotagging a Diachronic Corpus of Alpine Texts: Comparing Distinct Approaches to Toponym Recognition + TannonKew + AnastassiaShaitarova + IsabelMeraner + JanisGoldzycher + SimonClematide + MartinVolk + 11–18 + Geotagging historic and cultural texts provides valuable access to heritage data, enabling location-based searching and new geographically related discoveries. In this paper, we describe two distinct approaches to geotagging a variety of fine-grained toponyms in a diachronic corpus of alpine texts. By applying a traditional gazetteer-based approach, aided by a few simple heuristics, we attain strong high-precision annotations. Using the output of this earlier system, we adopt a state-of-the-art neural approach in order to facilitate the detection of new toponyms on the basis of context. Additionally, we present the results of preliminary experiments on integrating a small amount of crowdsourced annotations to improve overall performance of toponym recognition in our heritage corpus. + W19-9003 + 10.26615/978-954-452-059-5_003 + + + Controlled Semi-automatic Annotation of Classical Ethiopic + CristinaVertan + 19–23 + Preservation of the cultural heritage by means of digital methods has become extremely popular in recent years. After intensive digitization campaigns, the focus moves slowly from genuine preservation (i.e. digital archiving together with standard search mechanisms) to research-oriented usage of materials available electronically. This usage is intended to go far beyond simple reading of digitized materials; researchers should be able to gain new insights into materials and discover new facts by means of tools relying on innovative algorithms. In this article we will describe the workflow necessary for the annotation of a diachronic corpus of classical Ethiopic, a language of essential importance for the study of Early Christianity. + W19-9004 + 10.26615/978-954-452-059-5_004 + + + Implementing an archival, multilingual and Semantic Web-compliant taxonomy by means of <fixed-case>SKOS</fixed-case> (Simple Knowledge Organization System) + FrancescoGelati + 24–27 + The paper shows how a multilingual hierarchical thesaurus, or taxonomy, can be created and implemented in compliance with Semantic Web requirements by means of the data model SKOS (Simple Knowledge Organization System). It takes the EHRI (European Holocaust Research Infrastructure) portal as an example, and shows how open-source software like SKOS Play! 
can facilitate the task. + W19-9005 + 10.26615/978-954-452-059-5_005 + + + <fixed-case>EU</fixed-case> 4 U: An educational platform for the cultural heritage of the <fixed-case>EU</fixed-case> + MariaStambolieva + 28–33 + The paper presents an ongoing project of the NBU Laboratory for Language Technology aiming to create a multilingual, CEFR-graded electronic didactic resource for online learning, centered on the history and cultural heritage of the EU (e-EULearn). The resource is developed within the e-Platform of the NBU Laboratory for Language Technology and re-uses the rich corpus of educational material created at the Laboratory for the needs of NBU program modules, distance and blended learning language courses and other projects. Focus being not just on foreign language tuition, but above all on people, places and events in the history and culture of the EU member states, the annotation modules of the e-Platform have been accordingly extended. Current and upcoming activities are directed at: 1/ enriching the English corpus of didactic materials on EU history and culture, 2/ translating the texts into (the) other official EU languages and aligning the translations with the English texts; 3/ developing new test modules. In the process of developing this resource, a database on important people, places, objects and events in the cultural history of the EU will be created. + W19-9006 + 10.26615/978-954-452-059-5_006 + + + Modelling linguistic vagueness and uncertainty in historical texts + CristinaVertan + 34–38 + Many applications in Digital Humanities (DH) rely on annotations of the raw material. These annotations (inferred automatically or done manually) assume that labelled facts are either true or false, thus all inferences started on such annotations use boolean logic. This contradicts hermeneutic principles used by the humanities, in which most of the knowledge has a degree of truth which varies depending on the experience and the world knowledge of the interpreter. In this paper we will show how uncertainty and vagueness, two main features of any historical text, can be encoded in annotations and thus be considered by DH applications. + W19-9007 + 10.26615/978-954-452-059-5_007 + +
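Bare DOIs like the 10.26615/… values above have to stay in sync with their doi.org URLs, and a single dropped digit in the prefix (a bare 0.26615/… sitting next to a 10.26615/… URL) is easy to miss by eye. The following consistency check is a sketch only: it assumes the paper, doi, and url element names visible in the entries above and is not part of this change.

```python
#!/usr/bin/env python3
"""Sketch: flag <doi> values that disagree with the DOI embedded in <url>.

Hypothetical QA helper, not part of this diff. It assumes the layout seen
in the volumes above: each <paper> may carry a bare <doi> plus a doi.org
<url>; the element names are taken from those entries, not from the
repository's own tooling.
"""
import glob
import xml.etree.ElementTree as ET

for path in sorted(glob.glob("data/xml/*.xml")):
    for paper in ET.parse(path).iter("paper"):
        doi = paper.findtext("doi")
        url = paper.findtext("url") or ""
        if doi and "doi.org/" in url:
            # The path component after "doi.org/" should equal the bare DOI.
            suffix = url.split("doi.org/", 1)[1]
            if suffix != doi:
                print(f"{path}: doi {doi!r} does not match url suffix {suffix!r}")
```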
diff --git a/data/yaml/venues_joint_map.yaml b/data/yaml/venues_joint_map.yaml index 372d457120..bcea414e19 100644 --- a/data/yaml/venues_joint_map.yaml +++ b/data/yaml/venues_joint_map.yaml @@ -727,3 +727,6 @@ D19-65: [WS,DiscoMT] D19-66: WS D19-67: WS D19-68: WS +W19-87: RANLP +W19-89: RANLP +W19-90: RANLP
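The new venues_joint_map.yaml entries follow the file's existing convention: each key is a volume prefix, and the value names the venue or venues the volume is cross-listed under, either as a single string (W19-87: RANLP) or as a flow list (D19-65: [WS,DiscoMT]). A minimal consumer, sketched below on the assumption that PyYAML is available (illustrative code, not taken from this repository), would normalize both shapes:

```python
import yaml  # PyYAML

# Load the joint-venue map; values are either a single venue string
# (e.g. "W19-87: RANLP") or a list (e.g. "D19-65: [WS,DiscoMT]").
with open("data/yaml/venues_joint_map.yaml") as f:
    joint_map = yaml.safe_load(f)

def joint_venues(volume_id: str) -> list:
    """Return the venues a volume is cross-listed under (empty if unmapped)."""
    value = joint_map.get(volume_id, [])
    return value if isinstance(value, list) else [value]

# The three entries added in this change all resolve to RANLP:
assert joint_venues("W19-87") == ["RANLP"]
assert joint_venues("D19-65") == ["WS", "DiscoMT"]
```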