Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

automated file uploads #1849

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
Open
25 changes: 24 additions & 1 deletion bin/anthology/anthology.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
from .venues import VenueIndex
from .volumes import Volume
from .sigs import SIGIndex
from .utils import is_newstyle_id, infer_year
from .data import ResourceType
from .utils import is_newstyle_id, infer_year, get_proceedings_id_from_filename
from .events import EventIndex


Expand Down Expand Up @@ -137,3 +138,25 @@ def import_file(self, filename):
continue
volume.append(parsed_paper)
self.papers[full_id] = parsed_paper

def get_hash_for_resource(self, resource_type: ResourceType, filename: str) -> str:
    """Return the checksum recorded in the XML for an uploadable resource.

    :param resource_type: Whether ``filename`` names a PDF or an attachment.
    :param filename: File name of the resource (e.g. "P19-1001.pdf").
    :return: The hash stored for this resource.
    :raises Exception: If no matching paper/volume exists, or no hash has
        been recorded for the resource yet.
    """
    proceedings_id = get_proceedings_id_from_filename(resource_type, filename)
    if proceedings_id not in self.papers and proceedings_id not in self.volumes:
        raise Exception(f"Paper/Volume for PDF {proceedings_id!r} does not exist.")

    resource_hash = None
    if resource_type == ResourceType.PDF:
        # A PDF may belong to an individual paper or to a whole volume;
        # prefer the paper entry when both exist.
        item = self.papers.get(proceedings_id, self.volumes.get(proceedings_id))
        resource_hash = item.pdf_hash
    elif resource_type == ResourceType.ATTACHMENT:
        # Attachments are recorded on papers only. Guard the lookup so a
        # volume-only match raises a clear error instead of a bare KeyError.
        paper = self.papers.get(proceedings_id)
        if paper is None:
            raise Exception(
                f"Paper for attachment {proceedings_id!r} does not exist."
            )
        filename_to_hash = {a['filename']: a['hash'] for a in paper.attachments}
        resource_hash = filename_to_hash.get(filename)

    if resource_hash is None:
        raise Exception(
            "Hash for resource is None. Please update with value before running this script."
        )

    return resource_hash
25 changes: 25 additions & 0 deletions bin/anthology/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
################################################################################

import os
from enum import Enum

# this is the canonical URL. In contrast to all other
# URL templates, it always links to the official anthology.
Expand Down Expand Up @@ -53,6 +54,25 @@
"ANTHOLOGY_FILES", os.path.join(os.environ["HOME"], "anthology-files")
)

# Anthology pdf location
# Defaults to {ANTHOLOGY_FILE_DIR}/pdf
ANTHOLOGY_PDF_DIR = os.environ.get(
"ANTHOLOGY_PDFS", os.path.join(ANTHOLOGY_FILE_DIR, "pdf")
)

# Anthology attachments location
# Defaults to {ANTHOLOGY_FILE_DIR}/attachments
ANTHOLOGY_ATTACHMENTS_DIR = os.environ.get(
"ANTHOLOGY_ATTACHMENTS", os.path.join(ANTHOLOGY_FILE_DIR, "attachments")
)

# Anthology data location
# Defaults to {git_repo_root}/data
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
ANTHOLOGY_DATA_DIR = os.environ.get(
"ANTHOLOGY_DATA", os.path.abspath(os.path.join(_SCRIPT_DIR, "..", "..", "data"))
)

# Names of XML elements that may appear multiple times
LIST_ELEMENTS = (
"attachment",
Expand Down Expand Up @@ -100,3 +120,8 @@ def get_journal_title(top_level_id, volume_title):
return "Transactions of the Association for Computational Linguistics"
else:
return volume_title


class ResourceType(Enum):
    """Kinds of uploadable Anthology resources.

    The enum values double as the server-side directory names under the
    upload queue (see the f-strings in ``utils.upload_file_to_queue``).
    """

    PDF = "pdf"
    ATTACHMENT = "attachments"
4 changes: 4 additions & 0 deletions bin/anthology/papers.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ def videos(self):
]
return []

@cached_property
def pdf_hash(self):
    """Checksum recorded for this paper's PDF, or None if none was set."""
    return self.attrib.get("pdf_hash")

def _parse_revision_or_errata(self, tag):
for item in self.attrib.get(tag, []):
# Expand URLs with paper ID
Expand Down
57 changes: 57 additions & 0 deletions bin/anthology/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import re
import requests
import shutil
import subprocess

from lxml import etree
from urllib.parse import urlparse
Expand Down Expand Up @@ -444,11 +445,13 @@ def parse_element(xml_element):
elif tag == "url":
tag = "xml_url"
value = element.text
attrib['pdf_hash'] = element.get("hash")
elif tag == "attachment":
value = {
"filename": element.text,
"type": element.get("type", "attachment"),
"url": element.text,
"hash": element.get("hash"),
}
elif tag in ("author", "editor"):
id_ = element.attrib.get("id", None)
Expand Down Expand Up @@ -520,6 +523,60 @@ def compute_hash_from_file(path: str) -> str:
return compute_hash(f.read())


# For auto upload files to server
# The root directory for files
ANTHOLOGY_FILE_ROOT = "anthology-files"

# The ssh shortcut (in ~/.ssh/config) or full hostname
ANTHOLOGY_HOST = "anth"


def upload_file_to_queue(
    local_path: str,
    resource_type: data.ResourceType,
    venue_name: str,
    filename: str,
    file_hash: str,
    commit: bool = False,
):
    """Upload a local file into the server-side ingestion queue.

    Verifies the file's checksum, creates the per-venue queue directory on
    the remote host, then rsyncs the file there under "<filename>.<hash>".
    With commit=False (the default) the remote commands are only logged,
    not executed (dry run).

    :param local_path: Path to the file on the local machine.
    :param resource_type: PDF or attachment; selects the queue subdirectory.
    :param venue_name: Venue subdirectory to place the file under.
    :param filename: Target file name on the server (hash gets appended).
    :param file_hash: Expected checksum of the file's contents.
    :param commit: Actually run the remote commands instead of logging them.
    :raises Exception: If the file's actual hash does not match file_hash.
    """
    actual_hash = compute_hash_from_file(local_path)
    if file_hash != actual_hash:
        raise Exception(
            f"Got unexpected hash, file contains incorrect data. (actual hash: {actual_hash}, expected: {file_hash})"
        )

    # NOTE(review): venue_name/resource_type are interpolated into a remote
    # shell command line; callers must only pass trusted values.
    mkdir_cmd = [
        'ssh',
        ANTHOLOGY_HOST,
        f'mkdir -p {ANTHOLOGY_FILE_ROOT}/queue/{resource_type.value}/{venue_name}',
    ]
    if commit:
        subprocess.check_call(mkdir_cmd)
    else:
        logging.info(f"Would run: {mkdir_cmd}")

    # Suffix the hash so re-uploads with changed content never collide.
    upload_cmd = [
        "rsync",
        "-lptgoDve",
        "ssh",
        local_path,
        f"{ANTHOLOGY_HOST}:{ANTHOLOGY_FILE_ROOT}/queue/{resource_type.value}/{venue_name}/{filename}.{file_hash}",
    ]
    if commit:
        subprocess.check_call(upload_cmd)
    else:
        logging.info(f"Would run: {upload_cmd}")


def get_proceedings_id_from_filename(
    resource_type: data.ResourceType, filename: str
) -> str:
    """Strip the suffix(es) from a resource file name to recover its ID.

    PDFs carry one dot-separated suffix ("<id>.pdf"); attachments carry
    two ("<id>.<type>.<ext>").
    """
    suffix_counts = {
        data.ResourceType.PDF: 1,
        data.ResourceType.ATTACHMENT: 2,
    }
    parts_to_drop = suffix_counts[resource_type]
    return filename.rsplit('.', parts_to_drop)[0]


def read_leaves(data) -> List[str]:
"""Reads the leaves of a possibly superfluously-hierarchical data structure.
For example:
Expand Down
4 changes: 4 additions & 0 deletions bin/anthology/volumes.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,10 @@ def pdf(self):
return infer_url(url, template=data.PDF_LOCATION_TEMPLATE)
return None

@cached_property
def pdf_hash(self):
    """Checksum recorded for this volume's PDF, or None if none was set."""
    return self.attrib.get("pdf_hash")

def _set_meta_info(self, meta_data):
"""Derive journal title, volume, and issue no. used in metadata.

Expand Down
Loading