Nested volumes and explicit <url> tags #324

Merged (50 commits, Jun 21, 2019)

Commits
- 1dda89a: added script, changed schema (mjpost, May 9, 2019)
- ec103c5: fixed ACL 2018 (mjpost, May 9, 2019)
- 6f3e58c: extended volume schema to add volume url (mjpost, May 9, 2019)
- 9247c59: Merge branch 'master' into consolidate_urls (mjpost, May 10, 2019)
- b555d74: fixed schema (mjpost, May 10, 2019)
- e159174: use <url> tag (mjpost, May 10, 2019)
- df1cdd8: modularized to export functions (mjpost, May 10, 2019)
- 6496178: started volume work, postponing till #317 (mjpost, May 10, 2019)
- 241d311: explicit <url> tag on all XML files (mjpost, May 10, 2019)
- 05513e7: updated schemas, added conversion script (mjpost, May 11, 2019)
- d9849c3: Merge branch 'master' into hierarchical (mjpost, Jun 1, 2019)
- 2b666ce: Merge branch 'NAACL19' into hierarchical (mjpost, Jun 1, 2019)
- 0dfa73a: removed stray . (mjpost, Jun 1, 2019)
- bf32ab2: root -> collection (mjpost, Jun 1, 2019)
- 2275cfc: fixed logic in URL rewriting (mjpost, Jun 1, 2019)
- f0fbdfb: canonicalized URLs (mjpost, Jun 1, 2019)
- 330a756: fixed conflict in L06 (mjpost, Jun 1, 2019)
- 150a88f: root -> collection (mjpost, Jun 2, 2019)
- 5448bbb: Merge branch 'master' into hierarchical (mjpost, Jun 2, 2019)
- cc564a4: updated schema, added script, converted NAACL (mjpost, Jun 3, 2019)
- a1ca0ad: reverted N19 conversion (mjpost, Jun 6, 2019)
- d991fa4: Merge branch 'master' into consolidate_urls (mjpost, Jun 6, 2019)
- f028e85: *** empty log message *** (mjpost, Jun 7, 2019)
- af3c21f: booktitle, moved indent() (mjpost, Jun 8, 2019)
- 79e946d: restructured code to read nested format (mjpost, Jun 8, 2019)
- 8940383: expanding schema to accommodate long tail (mjpost, Jun 10, 2019)
- d73a8e5: finessing canonical indent (mjpost, Jun 10, 2019)
- ffa6f5c: code changes for yaml gen (mjpost, Jun 10, 2019)
- bfe55ae: Merge branch 'master' into consolidate_urls (mjpost, Jun 15, 2019)
- 2056f80: fixed indenting (mjpost, Jun 16, 2019)
- 9afe93b: force booktitle (mjpost, Jun 16, 2019)
- b65fe7d: special handling for C69 conversion (mjpost, Jun 16, 2019)
- a2d05a1: bugfix in stripping during indentation (mjpost, Jun 16, 2019)
- 1bd5aae: typos, logic fixes with inheritance (mjpost, Jun 16, 2019)
- 3fc7953: safety check on paper_id len (mjpost, Jun 16, 2019)
- 2a1ff56: bugfix (weird clash with builtin 'formatter' class) (mjpost, Jun 16, 2019)
- f008a5d: fix YAML volume title issue; only print volume URLs if present in XML (mjpost, Jun 16, 2019)
- 192fd33: Merge branch 'master' into consolidate_urls (mjpost, Jun 16, 2019)
- 3860b4d: converted to nested format (mjpost, Jun 16, 2019)
- 2db7f09: Revert "converted to nested format" (mjpost, Jun 16, 2019)
- 10a955a: fixed search for frontmatter URL (mjpost, Jun 16, 2019)
- b73c03a: move full-volume DOIs from frontmatter to metadata (mjpost, Jun 16, 2019)
- 401b6aa: fixed bug in full paper id (mjpost, Jun 16, 2019)
- 6b28a51: remove booktitle from journals and frontmatter (mjpost, Jun 17, 2019)
- fe33e02: disappear PDF button for frontmatter when no <url> tag (mjpost, Jun 17, 2019)
- 1ec399d: fixed DOI, frontmatter <url> tags (mjpost, Jun 17, 2019)
- 67d31f3: updated map to list volume ID (mjpost, Jun 17, 2019)
- 9962619: fixed hugo syntax (mjpost, Jun 19, 2019)
- ace9e95: semanticized presence of frontmatter (mjpost, Jun 19, 2019)
- 555e4a0: missed a 'with' in the template (mjpost, Jun 20, 2019)
bin/anthology/anthology.py: 55 changes (26 additions, 29 deletions)
@@ -54,38 +54,35 @@ def import_directory(self, importdir):
for xmlfile in glob(importdir + "/xml/*.xml"):
self.import_file(xmlfile)
self.pindex.verify()

def import_file(self, filename):
tree = etree.parse(filename)
volume = tree.getroot()
top_level_id = volume.get("id")
if top_level_id in self.volumes:
log.critical(
"Attempted to import top-level ID '{}' twice".format(top_level_id)
)
log.critical("Triggered by file: {}".format(filename))
current_volume = None
for paper in volume:
parsed_paper = Paper.from_xml(paper, top_level_id, self.formatter)
self.pindex.register(parsed_paper)
full_id = parsed_paper.full_id
if full_id in self.papers:
collection = tree.getroot()
collection_id = collection.get("id")
for volume_xml in collection:
volume = Volume.from_xml(volume_xml, collection_id, self.venues, self.sigs, self.formatter)

if volume.full_id in self.volumes:
log.critical(
"Attempted to import paper '{}' twice -- skipping".format(full_id)
"Attempted to import volume ID '{}' twice".format(volume.full_id)
)
continue
if parsed_paper.is_volume:
if current_volume is not None:
self.volumes[current_volume.full_id] = current_volume
current_volume = Volume(parsed_paper, self.venues, self.sigs)
else:
if current_volume is None:
log.critical("Triggered by file: {}".format(filename))

# front matter
if len(volume.content):
front_matter = volume.content[0]
self.pindex.register(front_matter)
self.papers[front_matter.full_id] = front_matter

self.volumes[volume.full_id] = volume
for paper in volume_xml.findall('paper'):
parsed_paper = Paper.from_xml(paper, volume, self.formatter)
self.pindex.register(parsed_paper)
full_id = parsed_paper.full_id
if full_id in self.papers:
log.critical(
"First paper of XML should be volume entry, but '{}' is not interpreted as one".format(
full_id
)
"Attempted to import paper '{}' twice -- skipping".format(full_id)
)
current_volume.append(parsed_paper)
self.papers[full_id] = parsed_paper
if current_volume is not None:
self.volumes[current_volume.full_id] = current_volume
continue
volume.append(parsed_paper)
self.papers[full_id] = parsed_paper
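The restructured import_file above walks one collection per XML file: the root <collection> carries the collection ID, each child <volume> is built with Volume.from_xml, front matter (if any) is registered first, and the remaining <paper> elements are attached to that volume. A minimal standalone sketch of that traversal, using a made-up sample file in the new nested format (the tag contents here are illustrative, not the real Anthology data):

```python
from lxml import etree

# Made-up sample in the new nested format: a <collection> root, nested
# <volume> elements, and explicit <url> tags (illustrative only).
SAMPLE = b"""<collection id="P19">
  <volume id="1">
    <meta>
      <booktitle>Proceedings of an Example Conference</booktitle>
      <url>P19-1</url>
    </meta>
    <paper id="1">
      <title>An Example Paper</title>
      <url>P19-1001</url>
    </paper>
  </volume>
</collection>"""

collection = etree.fromstring(SAMPLE)
collection_id = collection.get("id")          # "P19"
for volume_xml in collection:
    volume_id = volume_xml.get("id")          # "1"
    print("volume", "{}-{}".format(collection_id, volume_id))
    for paper in volume_xml.findall("paper"):
        # zero-padding mirrors the paper_id property added in papers.py below
        print("  paper", "{}-{}{:03d}".format(collection_id, volume_id, int(paper.get("id"))))
```

On the real data the same loop is driven by Volume.from_xml and Paper.from_xml, which additionally copy volume-level metadata into each paper.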
bin/anthology/data.py: 4 changes (4 additions, 0 deletions)
@@ -24,8 +24,12 @@
ANTHOLOGY_URL = "https://www.aclweb.org/anthology/{}"
ATTACHMENT_URL = "https://www.aclweb.org/anthology/attachments/{}"

# Names of XML elements that may appear multiple times
LIST_ELEMENTS = ("attachment", "author", "editor", "video", "revision", "erratum")

def get_journal_title(top_level_id, volume_title):
# TODO: consider moving this from code to data (perhaps
# under <booktitle> in the volume metadata
if top_level_id[0] == "J":
year = int(top_level_id[1:3])
if year >= 65 and year <= 83:
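The new LIST_ELEMENTS constant names the XML tags that may legitimately repeat within a paper entry; whatever consumes it (presumably the shared parse_element helper imported in papers.py, which is not shown in this diff) can then accumulate those tags into lists while keeping everything else single-valued. A rough sketch of that pattern, with a hypothetical store() helper:

```python
# Names of XML elements that may appear multiple times (copied from data.py above)
LIST_ELEMENTS = ("attachment", "author", "editor", "video", "revision", "erratum")

def store(attrib, tag, value):
    """Hypothetical helper: repeatable tags accumulate, other tags stay scalar."""
    if tag in LIST_ELEMENTS:
        attrib.setdefault(tag, []).append(value)
    else:
        attrib[tag] = value

attrib = {}
store(attrib, "author", "Jane Doe")
store(attrib, "author", "John Doe")
store(attrib, "title", "An Example Paper")
print(attrib)  # {'author': ['Jane Doe', 'John Doe'], 'title': 'An Example Paper'}
```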
bin/anthology/papers.py: 203 changes (79 additions, 124 deletions)
@@ -15,8 +15,8 @@
# limitations under the License.

import logging as log
from .people import PersonName
from .utils import (
parse_element,
infer_attachment_url,
remove_extra_whitespace,
is_journal,
@@ -28,25 +28,73 @@
# For BibTeX export
from .formatter import bibtex_encode, bibtex_make_entry

# Names of XML elements that may appear multiple times
_LIST_ELEMENTS = ("attachment", "author", "editor", "video", "revision", "erratum")


class Paper:
def __init__(self, paper_id, top_level_id, formatter):
def __init__(self, paper_id, volume, formatter):
self.parent_volume = volume
self.formatter = formatter
self.parent_volume = None
self.paper_id = paper_id
self.top_level_id = top_level_id
self.attrib = {}
self._id = paper_id
self._bibkey = False
self.is_volume = paper_id == '0'

# initialize metadata with keys inherited from volume
self.attrib = {}
for key, value in volume.attrib.items():
# Only inherit 'editor' for frontmatter
if (key == 'editor' and not self.is_volume) or key in ('collection_id', 'booktitle', 'id', 'meta_data', 'meta_journal_title', 'meta_volume', 'meta_issue', 'sigs', 'venues', 'meta_date', 'url'):
continue

self.attrib[key] = value

def from_xml(xml_element, *args):
paper = Paper(xml_element.get("id"), *args)
paper._parse_element(xml_element)
# Default to paper ID "0" (for front matter)
paper = Paper(xml_element.get("id", '0'), *args)

# Set values from parsing the XML element (overwriting
# and changing some initialized from the volume metadata)
for key, value in parse_element(xml_element).items():
if key == 'author' and 'editor' in paper.attrib:
del paper.attrib['editor']
paper.attrib[key] = value

# Frontmatter title is the volume 'booktitle'
if paper.is_volume:
paper.attrib['xml_title'] = paper.attrib['xml_booktitle']
paper.attrib['xml_title'].tag = 'title'

# Remove booktitle for frontmatter and journals
if paper.is_volume or is_journal(paper.full_id):
del paper.attrib['xml_booktitle']

# Expand URLs with paper ID
for tag in ('revision', 'erratum'):
if tag in paper.attrib:
for item in paper.attrib[tag]:
if not item['url'].startswith(paper.full_id):
log.error(
"{} must begin with paper ID '{}', but is '{}'".format(
tag, paper.full_id, item['url']
)
)
item['url'] = data.ANTHOLOGY_URL.format(item['url'])

if 'attachment' in paper.attrib:
for item in paper.attrib['attachment']:
item['url'] = infer_attachment_url(item['url'], paper.full_id)

# Explicitly construct URL of original version of the paper
# -- this is a bit hacky, but it's not given in the XML
# explicitly
if 'revision' in paper.attrib:
paper.attrib['revision'].insert(0, {
"value": "{}v1".format(paper.full_id),
"id": "1",
"url": data.ANTHOLOGY_URL.format( "{}v1".format(paper.full_id)) } )


paper.attrib["title"] = paper.get_title("plain")
if "booktitle" in paper.attrib:
paper.attrib["booktitle"] = paper.get_booktitle("plain")

if "editor" in paper.attrib:
if paper.is_volume:
if "author" in paper.attrib:
@@ -64,116 +64,13 @@ def from_xml(xml_element, *args):
paper.full_id
)
)
if "year" not in paper.attrib:
paper._infer_year()
if "pages" in paper.attrib:
if paper.attrib["pages"] is not None:
paper._interpret_pages()
else:
del paper.attrib["pages"]
return paper

def _parse_element(self, paper_element):
# read & store values
if "href" in paper_element.attrib:
self.attrib["attrib_href"] = paper_element.get("href")
self.attrib["url"] = paper_element.get("href")
elif not (self.is_volume and is_journal(self.full_id)):
# Generate a URL, except for top-level journal entries
self.attrib["url"] = data.ANTHOLOGY_URL.format(self.full_id)
for element in paper_element:
# parse value
tag = element.tag.lower()
if tag in ("abstract", "title", "booktitle"):
tag = "xml_{}".format(tag)
value = element
elif tag == "attachment":
value = {
"filename": element.text,
"type": element.get("type", "attachment"),
"url": infer_attachment_url(element.text, self.full_id),
}
elif tag in ("author", "editor"):
id_ = element.attrib.get("id", None)
value = (PersonName.from_element(element), id_)
elif tag in ("erratum", "revision"):
if tag == "revision" and "revision" not in self.attrib:
# Explicitly construct URL of original version of the paper
# -- this is a bit hacky, but it's not given in the XML
# explicitly
self.attrib["revision"] = [
{
"value": "{}v1".format(self.full_id),
"id": "1",
"url": data.ANTHOLOGY_URL.format(
"{}v1".format(self.full_id)
),
}
]
if not element.text.startswith(self.full_id):
log.error(
"{} must begin with paper ID '{}', but is '{}'".format(
tag, self.full_id, element.text
)
)
value = {
"value": element.text,
"id": element.get("id"),
"url": data.ANTHOLOGY_URL.format(element.text),
}
elif tag == "mrf":
value = {"filename": element.text, "src": element.get("src")}
elif tag == "video":
# Treat videos the same way as other attachments
tag = "attachment"
value = {
"filename": element.get("href"),
"type": element.get("tag", "video"),
"url": infer_attachment_url(element.get("href"), self.full_id),
}
elif tag in ("dataset", "software"):
value = {
"filename": element.text,
"type": tag,
"url": infer_attachment_url(element.text, self.full_id),
}
tag = "attachment"
else:
value = element.text
# store value
if tag == "url":
continue # We basically have to ignore this for now
if tag in _LIST_ELEMENTS:
try:
self.attrib[tag].append(value)
except KeyError:
self.attrib[tag] = [value]
else:
if tag in self.attrib:
log.warning(
"{}: Unexpected multiple occurrence of '{}' element".format(
self.full_id, tag
)
)
self.attrib[tag] = value

def _infer_year(self):
"""Infer the year from the volume ID.

Many paper entries do not explicitly contain their year. This function assumes
that the paper's volume identifier follows the format 'xyy', where x is
some letter and yy are the last two digits of the year of publication.
"""
assert (
len(self.top_level_id) == 3
), "Couldn't infer year: unknown volume ID format"
digits = self.top_level_id[1:]
if int(digits) >= 60:
year = "19{}".format(digits)
else:
year = "20{}".format(digits)
self.attrib["year"] = year

def _interpret_pages(self):
"""Splits up 'pages' field into first and last page, if possible.

@@ -187,18 +132,28 @@ def _interpret_pages(self):
return

@property
def is_volume(self):
"""Determines if this paper is a regular paper or a proceedings volume.
def collection_id(self):
return self.parent_volume.collection_id

By default, each paper ID of format 'x000' will be treated as (the front
matter of) a proceedings volume, unless the XML is of type workshop,
where each paper ID of format 'xx00' is treated as one volume.
"""
return is_volume_id(self.full_id)
@property
def volume_id(self):
return self.parent_volume.volume_id

@property
def paper_id(self):
if self.collection_id[0] == "W" or self.collection_id == "C69":
# If volume is a workshop, use the last two digits of ID
_id = "{}{:02d}".format(self.volume_id, int(self._id))
else:
# If not, only the last three
_id = "{}{:03d}".format(self.volume_id, int(self._id))
# Just to be sure
assert len(_id) == 4
return _id

@property
def full_id(self):
return "{}-{}".format(self.top_level_id, self.paper_id)
return "{}-{}".format(self.collection_id, self.paper_id)

@property
def bibkey(self):
@@ -290,7 +245,7 @@ def as_bibtex(self):
("booktitle", self.parent_volume.get_title(form="latex"))
)
for entry in ("month", "year", "address", "publisher", "note"):
if entry in self.attrib:
if self.get(entry) is not None:
entries.append((entry, bibtex_encode(self.get(entry))))
for entry in ("url", "doi"):
if entry in self.attrib:
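The new paper_id and full_id properties shown above derive the public Anthology ID from the parent volume rather than from a stored top_level_id: workshop-style collections (prefix "W", plus the special-cased C69) use two-digit paper numbers on a two-character volume ID, everything else uses three-digit numbers on a one-character volume ID, so the local part is always four characters. A standalone sketch of that formatting rule (the helper name is hypothetical):

```python
def full_paper_id(collection_id: str, volume_id: str, paper_num: int) -> str:
    if collection_id.startswith("W") or collection_id == "C69":
        local = "{}{:02d}".format(volume_id, paper_num)   # two-digit volume, two-digit paper
    else:
        local = "{}{:03d}".format(volume_id, paper_num)   # one-digit volume, three-digit paper
    assert len(local) == 4, "volume + paper number should span four characters"
    return "{}-{}".format(collection_id, local)

print(full_paper_id("P19", "1", 23))   # P19-1023
print(full_paper_id("W19", "04", 7))   # W19-0407
```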
bin/anthology/people.py: 6 changes (3 additions, 3 deletions)
@@ -15,7 +15,7 @@
# limitations under the License.

import logging as log
from .formatter import bibtex_encode
import anthology.formatter as my_formatter


class PersonName:
@@ -60,8 +60,8 @@ def id_(self):

def as_bibtex(self):
if not self.first:
return "{{{}}}".format(bibtex_encode(self.last))
return bibtex_encode("{}, {}".format(self.last, self.first))
return "{{{}}}".format(my_formatter.bibtex_encode(self.last))
return my_formatter.bibtex_encode("{}, {}".format(self.last, self.first))

def as_dict(self):
return {"first": self.first, "last": self.last, "full": self.full}
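The people.py change switches from a direct bibtex_encode import to a module-qualified reference (per the commit note about a clash with a 'formatter' name); the formatting behavior itself is unchanged. A sketch of what as_bibtex produces, with bibtex_encode stubbed out since the real routine lives in anthology/formatter.py:

```python
def bibtex_encode(text: str) -> str:
    return text  # placeholder for the real LaTeX-escaping routine

def person_as_bibtex(first: str, last: str) -> str:
    # Single-token names are wrapped in braces so BibTeX treats them as
    # an indivisible last name; otherwise emit "Last, First".
    if not first:
        return "{{{}}}".format(bibtex_encode(last))
    return bibtex_encode("{}, {}".format(last, first))

print(person_as_bibtex("", "Statistics Canada"))  # {Statistics Canada}
print(person_as_bibtex("Matt", "Post"))           # Post, Matt
```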