def get_hash_for_resource(self, resource_type: ResourceType, filename: str) -> str:
    """Return the checksum recorded in the XML for a PDF or attachment.

    ``filename`` is the on-disk resource file name; the Anthology ID is
    derived from it by stripping the trailing extension(s).

    Raises Exception when no matching paper/volume exists, or when the XML
    does not (yet) carry a hash for the resource.
    """
    proceedings_id = get_proceedings_id_from_filename(resource_type, filename)

    resource_hash = None
    if resource_type == ResourceType.PDF:
        # A PDF may belong either to a single paper or to a whole volume.
        item = self.papers.get(proceedings_id) or self.volumes.get(proceedings_id)
        if item is None:
            raise Exception(f"Paper/Volume for PDF {proceedings_id!r} does not exist.")
        resource_hash = item.pdf_hash
    elif resource_type == ResourceType.ATTACHMENT:
        # BUG FIX: attachments always belong to papers; the original checked
        # papers *and* volumes up front and then indexed self.papers directly,
        # so an ID matching only a volume raised a bare KeyError instead of a
        # descriptive error.
        paper = self.papers.get(proceedings_id)
        if paper is None:
            raise Exception(f"Paper for attachment {proceedings_id!r} does not exist.")
        filename_to_hash = {a['filename']: a['hash'] for a in paper.attachments}
        resource_hash = filename_to_hash.get(filename)

    if resource_hash is None:
        raise Exception(
            "Hash for resource is None. Please update with value before running this script."
        )

    return resource_hash
class ResourceType(Enum):
    """Kinds of binary resources stored alongside the Anthology XML.

    The enum value doubles as the sub-directory name used on the file
    server (``pdf/`` and ``attachments/``), both in the live tree and in
    the upload queue.
    """

    PDF = "pdf"
    ATTACHMENT = "attachments"
# For auto upload files to server
# The root directory for files (relative to the ssh user's home on the host)
ANTHOLOGY_FILE_ROOT = "anthology-files"

# The ssh shortcut (in ~/.ssh/config) or full hostname
ANTHOLOGY_HOST = "anth"


def upload_file_to_queue(
    local_path: str,
    resource_type: data.ResourceType,
    venue_name: str,
    filename: str,
    file_hash: str,
    commit: bool = False,
):
    """Upload a local resource file into the server-side queue directory.

    The file is stored as ``queue/<type>/<venue>/<filename>.<file_hash>`` so
    the commit step can later verify the queued content against the XML.

    Raises Exception when the file's actual checksum differs from
    ``file_hash``.  Without ``commit`` the commands are only logged (dry run).
    """
    actual_hash = compute_hash_from_file(local_path)
    if file_hash != actual_hash:
        raise Exception(
            f"Got unexpected hash, file contains incorrect data. (actual hash: {actual_hash}, expected: {file_hash})"
        )

    # BUG FIX: renamed mdkir_cmd -> mkdir_cmd (typo).
    mkdir_cmd = [
        'ssh',
        ANTHOLOGY_HOST,
        f'mkdir -p {ANTHOLOGY_FILE_ROOT}/queue/{resource_type.value}/{venue_name}',
    ]
    if commit:
        subprocess.check_call(mkdir_cmd)
    else:
        logging.info(f"Would run: {mkdir_cmd}")

    upload_cmd = [
        "rsync",
        "-lptgoDve",
        "ssh",
        local_path,
        # BUG FIX: the destination lost the filename component (the
        # ``filename`` parameter was unused); the queue-commit step expects
        # entries named <filename>.<hash> and rsplits on the last dot.
        f"{ANTHOLOGY_HOST}:{ANTHOLOGY_FILE_ROOT}/queue/{resource_type.value}/{venue_name}/{filename}.{file_hash}",
    ]
    if commit:
        subprocess.check_call(upload_cmd)
    else:
        logging.info(f"Would run: {upload_cmd}")


def get_proceedings_id_from_filename(
    resource_type: data.ResourceType, filename: str
) -> str:
    """Derive the Anthology ID from a resource file name.

    PDFs carry one trailing extension; attachments carry two (kind plus
    file extension), so strip one or two dot-suffixes accordingly.
    """
    trailing_dots = {data.ResourceType.PDF: 1, data.ResourceType.ATTACHMENT: 2}[
        resource_type
    ]
    return filename.rsplit('.', trailing_dots)[0]
@cached_property
def pdf_hash(self):
    """Checksum of this volume's full-proceedings PDF from the XML, or None."""
    return self.attrib.get("pdf_hash")
# Queue-committing strategies considered:
#  1. Iterate through the queue; any file whose checksum matches the XML is
#     moved to the "real" location and dropped from the queue.
#     pros: fastest, since the queue should be kept clean.
#     cons: will not detect any issues in the "real" location.
#  2. Iterate through all papers and check whether a checksummed file is queued.
#     pros: detects missing resources.
#     cons: must walk every paper each time; misses stray extra files.
#  3. Checksum every file in the "real" location; on mismatch with the XML,
#     refresh from the queue.
#     pros: detects unreferenced and out-of-date files.
#     cons: must checksum the whole tree.
#  4. Combine #2 and #3 for a complete audit (missing, extra, outdated).
# Option #1 is implemented below as the default; #4 is the planned
# --complete-check mode.

from typing import List, Optional
import os
import click
import logging as log
from functools import partial
import subprocess

from anthology import Anthology
from anthology.data import ANTHOLOGY_DATA_DIR, ResourceType
from anthology.utils import SeverityTracker, compute_hash_from_file

# Enable show default by default
click.option = partial(click.option, show_default=True)

# The root directory for files, currently containing pdf/ and attachments/
ANTHOLOGY_FILE_ROOT = "/home/anthologizer/anthology-files"

# The ssh shortcut (in ~/.ssh/config) or full hostname
ANTHOLOGY_HOST = "anth"

# The remote url of the acl anthology git repo
REMOTE_URL = "https://github.com/acl-org/acl-anthology.git"

# The main branch of the acl anthology git repo
REMOTE_MAIN_BRANCH_NAME = "master"


def is_clean_checkout_of_remote_branch(
    repo_dir: str, remote_url: str, remote_main_branch_name: str
) -> bool:
    """Return True iff ``repo_dir`` is a clean checkout whose HEAD tracks
    ``remote_main_branch_name`` on a remote pointing at ``remote_url``.

    Raises Exception if HEAD has no remote-tracking ref of the form
    ``<remote>/<branch>``.
    """

    # BUG FIX: the original ran every git command in the process's current
    # directory, ignoring repo_dir entirely.
    def git_output(args: List[str]) -> str:
        return subprocess.check_output(args, cwd=repo_dir).decode('utf-8').strip()

    # Check if repo is clean
    status = git_output(["git", "status", "-uall", "--short"])
    if status:
        log.debug(
            f"Repo @ {repo_dir!r} is not clean. It has the following changes:\n{status}"
        )
        return False

    # Check tracking url and branch
    current_ref = git_output(["git", "symbolic-ref", "-q", "HEAD"])
    # BUG FIX: the output was used as raw bytes (`"/" not in bytes` raises
    # TypeError), and --format='...' carried literal single quotes into the
    # output because no shell is involved with a list argv.
    remote_tracking_branch_ref = git_output(
        ["git", "for-each-ref", "--format=%(upstream:short)", current_ref]
    )

    if "/" not in remote_tracking_branch_ref:
        msg = f"Invalid remote tracking branch ref {remote_tracking_branch_ref}"
        log.error(msg)
        raise Exception(msg)

    tracking_remote_name, remote_tracking_branch = remote_tracking_branch_ref.split(
        '/', 1
    )

    if remote_tracking_branch != remote_main_branch_name:
        log.debug(
            f"Remote tracking branch {remote_tracking_branch!r} is not main remote branch {remote_main_branch_name!r}"
        )
        return False

    tracking_remote_url = git_output(["git", "remote", "get-url", tracking_remote_name])

    if tracking_remote_url != remote_url:
        log.debug(
            f"Remote tracking url {tracking_remote_url!r} is not the remote url {remote_url!r}"
        )
        return False
    return True


def run_remote_command(cmd):
    """Run ``cmd`` on the anthology host over ssh; return stripped stdout."""
    # BUG FIX: the host was hard-coded as 'anth' instead of ANTHOLOGY_HOST.
    return subprocess.check_output(['ssh', ANTHOLOGY_HOST, cmd]).decode('utf-8').strip()


class FileSystemOps:
    """File-system operations against the anthology file tree.

    Operates directly on the local file system when running on the server,
    otherwise executes the equivalent commands on ``host`` via ssh.  All
    mutating operations are dry-run (logged only) unless ``commit`` is True.
    """

    def __init__(self, is_on_server: bool, host: Optional[str], commit: bool):
        self.is_on_server = is_on_server
        self.host = host
        self.commit = commit
        if not is_on_server and not host:
            raise Exception(
                f"If is_on_server is false, host is required but got host: {host!r}"
            )

        # BUG FIX: the original selected ANTHOLOGY_DATA_DIR (the XML data
        # directory) when on the server, but queue/, pdf/ and attachments/
        # live under the file root in both modes.
        self.root_dir = ANTHOLOGY_FILE_ROOT

    def listdir(self, relative_path: str) -> List[str]:
        """List the entries of a directory under the file root."""
        abs_dir = f'{self.root_dir}/{relative_path}'
        if self.is_on_server:
            return os.listdir(abs_dir)
        return (
            subprocess.check_output(['ssh', self.host, f'ls {abs_dir}'])
            .decode('utf-8')
            .strip()
            .split('\n')
        )

    def movefile(self, relative_src_path: str, relative_dest_path: str):
        """Move a file under the file root, creating the destination dir."""
        abs_src = f'{self.root_dir}/{relative_src_path}'
        abs_dest = f'{self.root_dir}/{relative_dest_path}'
        abs_dest_dir = os.path.dirname(abs_dest)

        if self.is_on_server:
            if self.commit:
                os.makedirs(abs_dest_dir, exist_ok=True)
            else:
                log.info(f"Would super-mkdir {abs_dest_dir!r}")
            if self.commit:
                os.rename(abs_src, abs_dest)
            else:
                log.info(f"Would move file {abs_src!r} to {abs_dest!r}")
            return

        # BUG FIX: renamed mdkir_cmd -> mkdir_cmd, and use self.host instead
        # of the module-level ANTHOLOGY_HOST, matching the mv command below.
        mkdir_cmd = ['ssh', self.host, f'mkdir -p {abs_dest_dir}']
        if self.commit:
            subprocess.check_call(mkdir_cmd)
        else:
            log.info(f"Would run: {mkdir_cmd}")

        cmd = ['ssh', self.host, f'mv {abs_src} {abs_dest}']
        if self.commit:
            subprocess.check_call(cmd)
        else:
            log.info(f"Would run: {cmd}")

    def hashfile(self, relative_path: str) -> str:
        """Checksum a file under the file root."""
        abs_dir = f'{self.root_dir}/{relative_path}'
        if self.is_on_server:
            return compute_hash_from_file(abs_dir)
        # NOTE(review): the remote path shells out to `crc32`; confirm that
        # compute_hash_from_file uses the same algorithm and output format.
        return (
            subprocess.check_output(['ssh', self.host, f'crc32 {abs_dir}'])
            .decode('utf-8')
            .strip()
        )

    def exists(self, relative_path: str) -> bool:
        """Return whether a path exists under the file root."""
        abs_dir = f'{self.root_dir}/{relative_path}'
        if self.is_on_server:
            return os.path.exists(abs_dir)
        try:
            subprocess.check_output(['ssh', self.host, f'stat {abs_dir}'])
            return True
        except subprocess.CalledProcessError:
            return False

    def remove(self, relative_path: str):
        """Delete a file under the file root."""
        abs_dir = f'{self.root_dir}/{relative_path}'
        if self.is_on_server:
            if self.commit:
                os.remove(abs_dir)
            else:
                log.info(f"Would remove file {abs_dir!r}")
            return
        cmd = ['ssh', self.host, f'rm {abs_dir}']
        if self.commit:
            subprocess.check_call(cmd)
        else:
            log.info(f"Would run: {cmd}")


def process_queue(anth: Anthology, resource_type: ResourceType, fs: FileSystemOps):
    """Move queued files whose hash matches the Anthology XML into place.

    Queue entries are named ``<original filename>.<hash>``; an entry is
    published only when that hash equals the hash recorded in the XML.
    """
    log.debug(f'Processing queue for {resource_type}')
    queue_base_path = f'queue/{resource_type.value}'
    if not fs.exists(queue_base_path):
        log.error(f'Missing queue directory: {queue_base_path}.')
        return
    for venue_name in fs.listdir(queue_base_path):
        for filename in fs.listdir(os.path.join(queue_base_path, venue_name)):
            log.debug(f'\tProcessing file {filename!r}')
            # The trailing suffix is the hash appended at enqueue time.
            base_filename, file_hash = filename.rsplit('.', 1)

            # Get main branch resource hash
            try:
                current_version_hash = anth.get_hash_for_resource(
                    resource_type, base_filename
                )
            except Exception as e:
                log.error(f"{e} (filename: {filename!r})", exc_info=True)
                continue

            if file_hash == current_version_hash:
                log.info(
                    f"Found queued file matching hash: {os.path.join(queue_base_path, venue_name, filename)}"
                )
                fs.movefile(
                    os.path.join(queue_base_path, venue_name, filename),
                    os.path.join(resource_type.value, venue_name, base_filename),
                )


def do_complete_check(anth: Anthology, resource_type: ResourceType, fs: FileSystemOps):
    """Placeholder for option #4 (full audit of server resources)."""
    log.error("Complete check isn't implemented yet")


@click.command()
@click.option(
    '-i',
    '--importdir',
    type=click.Path(exists=True),
    default=ANTHOLOGY_DATA_DIR,
    help="Directory to import the Anthology XML files data files from.",
)
@click.option(
    '--is-on-server',
    is_flag=True,
    help="If this flag is set file system changes will be applied to the local file system, else changes will be made by sshing into the anth server.",
)
@click.option(
    '-c',
    '--commit',
    is_flag=True,
    help="Commit (=write) the changes to the anthology server; will only do a dry run otherwise.",
)
@click.option(
    '--complete-check', is_flag=True, help="Do a complete check of resources on server."
)
@click.option('--debug', is_flag=True, help="Output debug-level log messages.")
def main(
    importdir: str,
    is_on_server: bool,
    commit: bool,  # BUG FIX: was annotated `str`; click passes a bool flag
    complete_check: bool,
    debug: bool,
):
    """Publish queued files whose hashes match the checked-out Anthology XML."""
    log_level = log.DEBUG if debug else log.INFO
    log.basicConfig(format="%(levelname)-8s %(message)s", level=log_level)
    tracker = SeverityTracker()
    log.getLogger().addHandler(tracker)

    log.info(
        'Running as if on server.'
        if is_on_server
        else 'Will ssh to server for file system operations.'
    )

    # NOTE(review): a dirty/untracked checkout only logs an error (making the
    # script exit 1 at the very end); it does not abort before moving files.
    # Confirm that continuing here is intentional.
    if not is_clean_checkout_of_remote_branch(
        importdir, REMOTE_URL, REMOTE_MAIN_BRANCH_NAME
    ):
        log.error(
            f"Repo @ {importdir} isn't clean or isn't tracking the master remote branch."
        )

    log.info("Instantiating the Anthology...")
    anth = Anthology(importdir=importdir)

    fs = FileSystemOps(is_on_server=is_on_server, host=ANTHOLOGY_HOST, commit=commit)

    if complete_check:
        do_complete_check(anth, resource_type=ResourceType.PDF, fs=fs)
        do_complete_check(anth, resource_type=ResourceType.ATTACHMENT, fs=fs)
    else:
        process_queue(anth, resource_type=ResourceType.PDF, fs=fs)
        process_queue(anth, resource_type=ResourceType.ATTACHMENT, fs=fs)

    if tracker.highest >= log.ERROR:
        exit(1)


if __name__ == "__main__":
    main()
import os
import click
import logging as log
from enum import Enum
from functools import partial

from anthology import Anthology
from anthology.utils import upload_file_to_queue
from anthology.utils import SeverityTracker
from anthology.data import (
    ANTHOLOGY_ATTACHMENTS_DIR,
    ANTHOLOGY_DATA_DIR,
    ANTHOLOGY_PDF_DIR,
    ResourceType,
)


# Enable show default by default
click.option = partial(click.option, show_default=True)


def get_proceedings_id_from_filename(resource_type: ResourceType, filename: str) -> str:
    """Derive the Anthology ID from a resource file name.

    PDFs carry one trailing extension; attachments carry two.

    NOTE(review): duplicates anthology.utils.get_proceedings_id_from_filename;
    consider importing that helper instead.
    """
    trailing_dots = {ResourceType.PDF: 1, ResourceType.ATTACHMENT: 2}[resource_type]
    return filename.rsplit('.', trailing_dots)[0]


# Iterate over files in resource directory, find the hash in the Anthology and upload the file (if commit)
def enqueue_dir(
    anth: Anthology,
    resource_directory: str,
    resource_type: ResourceType,
    commit: bool = False,
):
    """Upload every file under ``resource_directory`` to the server queue.

    Expected layout is ``<resource_directory>/<venue>/<filename>``.  Files
    whose hash cannot be resolved from the XML are logged and skipped.
    """
    for venue_name in os.listdir(resource_directory):
        for filename in os.listdir(os.path.join(resource_directory, venue_name)):
            local_path = os.path.join(resource_directory, venue_name, filename)

            # Get resource hash
            try:
                # BUG FIX: get_hash_for_resource is a bound method; the
                # original passed `anth` again as an extra first positional
                # argument, raising TypeError for every file.
                resource_hash = anth.get_hash_for_resource(resource_type, filename)
            except Exception as e:
                log.error(f"{e} (filename: {local_path!r})", exc_info=True)
                continue

            upload_file_to_queue(
                local_path,
                resource_type=resource_type,
                venue_name=venue_name,
                filename=filename,
                file_hash=resource_hash,
                commit=commit,
            )
@click.command()
@click.option(
    '-i',
    '--importdir',
    type=click.Path(exists=True),
    default=ANTHOLOGY_DATA_DIR,
    help="Directory to import the Anthology XML files data files from.",
)
@click.option(
    '-p',
    '--pdfs-dir',
    type=click.Path(exists=True),
    default=ANTHOLOGY_PDF_DIR,
    help="Root path for placement of PDF files",
)
@click.option(
    '-a',
    '--attachments-dir',
    type=click.Path(exists=True),
    default=ANTHOLOGY_ATTACHMENTS_DIR,
    # BUG FIX: help text previously said "PDF files" (copy-paste error).
    help="Root path for placement of attachment files",
)
@click.option(
    '-c',
    '--commit',
    is_flag=True,
    help="Commit (=write) the changes to the anthology server; will only do a dry run otherwise.",
)
@click.option('--debug', is_flag=True, help="Output debug-level log messages.")
def main(importdir, pdfs_dir, attachments_dir, commit, debug):
    """Enqueue local PDFs and attachments for upload to the anthology server."""
    log_level = log.DEBUG if debug else log.INFO
    log.basicConfig(format="%(levelname)-8s %(message)s", level=log_level)
    tracker = SeverityTracker()
    log.getLogger().addHandler(tracker)

    log.info("Instantiating the Anthology...")
    anth = Anthology(importdir=importdir)

    log.info("Enqueuing PDFs...")
    enqueue_dir(anth, pdfs_dir, ResourceType.PDF, commit)

    log.info("Enqueuing Attachments...")
    enqueue_dir(anth, attachments_dir, ResourceType.ATTACHMENT, commit)

    if not commit:
        if tracker.highest >= log.ERROR:
            log.warning(
                "There were errors! Please check them carefully before re-running this script with -c/--commit."
            )
        else:
            log.warning(
                "Re-run this script with -c/--commit to upload these files to the server."
            )

    if tracker.highest >= log.ERROR:
        exit(1)


if __name__ == "__main__":
    main()