Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

automated file uploads #1849

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
Open
25 changes: 24 additions & 1 deletion bin/anthology/anthology.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
from .venues import VenueIndex
from .volumes import Volume
from .sigs import SIGIndex
from .utils import is_newstyle_id, infer_year
from .data import ResourceType
from .utils import is_newstyle_id, infer_year, get_proceedings_id_from_filename
from .events import EventIndex


Expand Down Expand Up @@ -137,3 +138,25 @@ def import_file(self, filename):
continue
volume.append(parsed_paper)
self.papers[full_id] = parsed_paper

def get_hash_for_resource(self, resource_type: ResourceType, filename: str) -> str:
    """Return the checksum recorded in the XML for an uploadable resource.

    :param resource_type: Whether ``filename`` names a PDF or an attachment.
    :param filename: File name of the resource (e.g. "P19-1001.pdf").
    :return: The hash stored for this resource.
    :raises Exception: If no matching paper/volume exists, or no hash has
        been recorded for the resource yet.
    """
    proceedings_id = get_proceedings_id_from_filename(resource_type, filename)
    if proceedings_id not in self.papers and proceedings_id not in self.volumes:
        raise Exception(f"Paper/Volume for PDF {proceedings_id!r} does not exist.")

    resource_hash = None
    if resource_type == ResourceType.PDF:
        # A PDF may belong to an individual paper or to a whole volume;
        # prefer the paper entry when both exist.
        item = self.papers.get(proceedings_id, self.volumes.get(proceedings_id))
        resource_hash = item.pdf_hash
    elif resource_type == ResourceType.ATTACHMENT:
        # Attachments are recorded on papers only. Guard the lookup so a
        # volume-only match raises a clear error instead of a bare KeyError.
        paper = self.papers.get(proceedings_id)
        if paper is None:
            raise Exception(
                f"Paper for attachment {proceedings_id!r} does not exist."
            )
        filename_to_hash = {a['filename']: a['hash'] for a in paper.attachments}
        resource_hash = filename_to_hash.get(filename)

    if resource_hash is None:
        raise Exception(
            "Hash for resource is None. Please update with value before running this script."
        )

    return resource_hash
25 changes: 25 additions & 0 deletions bin/anthology/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
################################################################################

import os
from enum import Enum

# this is the canonical URL. In contrast to all other
# URL templates, it always links to the official anthology.
Expand Down Expand Up @@ -53,6 +54,25 @@
"ANTHOLOGY_FILES", os.path.join(os.environ["HOME"], "anthology-files")
)

# Anthology pdf location
# Defaults to {ANTHOLOGY_FILE_DIR}/pdf
ANTHOLOGY_PDF_DIR = os.environ.get(
"ANTHOLOGY_PDFS", os.path.join(ANTHOLOGY_FILE_DIR, "pdf")
)

# Anthology attachments location
# Defaults to {ANTHOLOGY_FILE_DIR}/attachments
ANTHOLOGY_ATTACHMENTS_DIR = os.environ.get(
"ANTHOLOGY_ATTACHMENTS", os.path.join(ANTHOLOGY_FILE_DIR, "attachments")
)

# Anthology data location
# Defaults to {git_repo_root}/data
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
ANTHOLOGY_DATA_DIR = os.environ.get(
"ANTHOLOGY_DATA", os.path.abspath(os.path.join(_SCRIPT_DIR, "..", "..", "data"))
)

# Names of XML elements that may appear multiple times
LIST_ELEMENTS = (
"attachment",
Expand Down Expand Up @@ -100,3 +120,8 @@ def get_journal_title(top_level_id, volume_title):
return "Transactions of the Association for Computational Linguistics"
else:
return volume_title


class ResourceType(Enum):
    """Kinds of uploadable Anthology resources.

    The enum values double as the server-side directory names under the
    upload queue (see the f-strings in ``utils.upload_file_to_queue``).
    """

    PDF = "pdf"
    ATTACHMENT = "attachments"
4 changes: 4 additions & 0 deletions bin/anthology/papers.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ def videos(self):
]
return []

@cached_property
def pdf_hash(self):
    """Checksum recorded for this paper's PDF, or None if none was set."""
    return self.attrib.get("pdf_hash")

def _parse_revision_or_errata(self, tag):
for item in self.attrib.get(tag, []):
# Expand URLs with paper ID
Expand Down
57 changes: 57 additions & 0 deletions bin/anthology/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import re
import requests
import shutil
import subprocess

from lxml import etree
from urllib.parse import urlparse
Expand Down Expand Up @@ -444,11 +445,13 @@ def parse_element(xml_element):
elif tag == "url":
tag = "xml_url"
value = element.text
attrib['pdf_hash'] = element.get("hash")
elif tag == "attachment":
value = {
"filename": element.text,
"type": element.get("type", "attachment"),
"url": element.text,
"hash": element.get("hash"),
}
elif tag in ("author", "editor"):
id_ = element.attrib.get("id", None)
Expand Down Expand Up @@ -520,6 +523,60 @@ def compute_hash_from_file(path: str) -> str:
return compute_hash(f.read())


# For auto upload files to server
# The root directory for files
ANTHOLOGY_FILE_ROOT = "anthology-files"

# The ssh shortcut (in ~/.ssh/config) or full hostname
ANTHOLOGY_HOST = "anth"


def upload_file_to_queue(
    local_path: str,
    resource_type: data.ResourceType,
    venue_name: str,
    filename: str,
    file_hash: str,
    commit: bool = False,
):
    """Upload a local file into the server-side ingestion queue.

    Verifies the file's checksum, creates the per-venue queue directory on
    the remote host, then rsyncs the file there under "<filename>.<hash>".
    With commit=False (the default) the remote commands are only logged,
    not executed (dry run).

    :param local_path: Path to the file on the local machine.
    :param resource_type: PDF or attachment; selects the queue subdirectory.
    :param venue_name: Venue subdirectory to place the file under.
    :param filename: Target file name on the server (hash gets appended).
    :param file_hash: Expected checksum of the file's contents.
    :param commit: Actually run the remote commands instead of logging them.
    :raises Exception: If the file's actual hash does not match file_hash.
    """
    actual_hash = compute_hash_from_file(local_path)
    if file_hash != actual_hash:
        raise Exception(
            f"Got unexpected hash, file contains incorrect data. (actual hash: {actual_hash}, expected: {file_hash})"
        )

    # NOTE(review): venue_name/resource_type are interpolated into a remote
    # shell command line; callers must only pass trusted values.
    mkdir_cmd = [
        'ssh',
        ANTHOLOGY_HOST,
        f'mkdir -p {ANTHOLOGY_FILE_ROOT}/queue/{resource_type.value}/{venue_name}',
    ]
    if commit:
        subprocess.check_call(mkdir_cmd)
    else:
        logging.info(f"Would run: {mkdir_cmd}")

    # Suffix the hash so re-uploads with changed content never collide.
    upload_cmd = [
        "rsync",
        "-lptgoDve",
        "ssh",
        local_path,
        f"{ANTHOLOGY_HOST}:{ANTHOLOGY_FILE_ROOT}/queue/{resource_type.value}/{venue_name}/{filename}.{file_hash}",
    ]
    if commit:
        subprocess.check_call(upload_cmd)
    else:
        logging.info(f"Would run: {upload_cmd}")


def get_proceedings_id_from_filename(
    resource_type: data.ResourceType, filename: str
) -> str:
    """Strip the suffix(es) from a resource file name to recover its ID.

    PDFs carry one dot-separated suffix ("<id>.pdf"); attachments carry
    two ("<id>.<type>.<ext>").
    """
    suffix_counts = {
        data.ResourceType.PDF: 1,
        data.ResourceType.ATTACHMENT: 2,
    }
    parts_to_drop = suffix_counts[resource_type]
    return filename.rsplit('.', parts_to_drop)[0]


def read_leaves(data) -> List[str]:
"""Reads the leaves of a possibly superfluously-hierarchical data structure.
For example:
Expand Down
4 changes: 4 additions & 0 deletions bin/anthology/volumes.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,10 @@ def pdf(self):
return infer_url(url, template=data.PDF_LOCATION_TEMPLATE)
return None

@cached_property
def pdf_hash(self):
    """Checksum recorded for this volume's PDF, or None if none was set."""
    return self.attrib.get("pdf_hash")

def _set_meta_info(self, meta_data):
"""Derive journal title, volume, and issue no. used in metadata.

Expand Down
Loading