diff --git a/scripts/Crawlers/BaseCrawlerTest.py b/scripts/Crawlers/BaseCrawlerTest.py
new file mode 100644
index 000000000..f05ab9b92
--- /dev/null
+++ b/scripts/Crawlers/BaseCrawlerTest.py
@@ -0,0 +1,627 @@
+import abc
+import json
+import os
+import re
+import shutil
+
+import git
+import requests
+from datalad import api
+
+from scripts.Crawlers.constants import DATS_FIELDS
+from scripts.Crawlers.constants import LICENSE_CODES
+from scripts.Crawlers.constants import MODALITIES
+from scripts.Crawlers.constants import NO_ANNEX_FILE_PATTERNS
+from scripts.Crawlers.constants import REQUIRED_DATS_FIELDS
+
+
+class BaseCrawler:
+    """
+    Interface to extend conp-dataset crawlers.
+
+    ==================
+    Overview
+    ==================
+
+    Any crawler created from this interface crawls datasets from a specific
+    remote platform. This base class implements the functions common to all
+    crawler backends, in particular:
+    (1) verify that the correct fork of conp-dataset is used,
+    (2) create and switch to a new branch for each new dataset,
+    (3) keep README and DATS files out of the annex,
+    (4) create new datalad datasets,
+    (5) publish to the GitHub repository,
+    (6) create pull requests.
+
+    Method run(), implemented in the base class, is the entry point to any crawler.
+    It does the following things:
+    (1) It calls abstract method get_all_dataset_description(), which must be implemented
+        by the crawler in the child class. get_all_dataset_description() retrieves from
+        the remote platform all the information needed for each dataset that should be
+        added to or updated in conp-dataset.
+    (2) It iterates through each dataset description and switches to a dedicated git
+        branch for each dataset.
+    (2.a) If the dataset is new, the base class will create a new branch,
+          an empty datalad repository, unannex DATS.json and README.md, and create an
+          empty GitHub repository. It will then call abstract method add_new_dataset(),
+          which adds/downloads all dataset files under the given directory.
+          The crawler will then add a custom DATS.json and README.md if those weren't added.
+          Creating the README.md requires get_readme_content() to be implemented, which
+          returns the content of the README.md in markdown format. The crawler will then save
+          and publish all changes to the newly created repository. It will also handle adding
+          a new submodule to .gitmodules and creating a pull request to CONP-PCNO/conp-dataset.
+    (2.b) If the dataset already exists, verified by the existence of its corresponding branch,
+          the base class will call abstract method update_if_necessary(), which verifies
+          whether the dataset requires updating and updates it if so. If the dataset was updated,
+          this method returns True, which triggers saving and publishing the new content to the
+          dataset's repository, creating a new DATS.json if it doesn't exist, and creating a pull
+          request to CONP-PCNO/conp-dataset.
+
+    ==================
+    How to implement a new crawler
+    ==================
+
+    (1) Create a class deriving from BaseCrawler.
+    (2) Implement the four abstract methods:
+        * get_all_dataset_description,
+        * add_new_dataset,
+        * update_if_necessary,
+        * get_readme_content.
+        See docstrings of each method for specifications.
+    (3) In crawl.py, locate the comment where it says to instantiate new crawlers,
+        instantiate this new Crawler and call run() on it.
+    (A minimal skeleton illustrating these steps is sketched right after this class docstring.)
+    """
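As an illustration of the implementation steps above, here is a minimal, hypothetical crawler skeleton. The class name ExamplePlatformCrawler, the query URL, and the response fields (record["name"], record["archive_url"], etc.) are invented for this sketch; only the BaseCrawler API it relies on (the four abstract methods, self.datalad, Dataset.download_url) comes from the class in this patch.

```python
# Hypothetical sketch: "ExamplePlatformCrawler" and its REST API are invented to
# illustrate the four abstract methods; adapt the query and field mapping to the
# real platform being crawled.
import requests

from scripts.Crawlers.BaseCrawlerTest import BaseCrawler


class ExamplePlatformCrawler(BaseCrawler):
    def get_all_dataset_description(self):
        # Query the (hypothetical) platform and map each record to the keys
        # needed to build a valid DATS file.
        records = requests.get("https://example.org/api/datasets?tag=conp").json()
        return [
            {
                "title": record["name"],
                "identifier": record["id"],
                "creators": [{"name": author} for author in record["authors"]],
                "description": record["summary"],
                "version": record["updated"],
                "licenses": [{"name": record.get("license", "None")}],
                "keywords": [{"value": tag} for tag in record.get("tags", [])],
                "types": [{"value": "dataset"}],
                # Extra, non-DATS keys are allowed; later methods can reuse them.
                "download_url": record["archive_url"],
            }
            for record in records
        ]

    def add_new_dataset(self, dataset_description, dataset_dir):
        # Download and extract the platform archive into the datalad dataset
        # (a fuller sketch follows the add_new_dataset() docstring below).
        d = self.datalad.Dataset(dataset_dir)
        d.download_url(dataset_description["download_url"], archive=True)

    def update_if_necessary(self, dataset_description, dataset_dir):
        # A real crawler would compare a tracker file against the remote version.
        return False

    def get_readme_content(self, dataset_description):
        return f"# {dataset_description['title']}\n\nCrawled from Example Platform."
```

Step (3) then amounts to instantiating this class in crawl.py next to the existing crawlers, with the same constructor arguments as the other crawlers (github_token, config_path, verbose, force, no_pr, basedir), and calling run() on it.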
+
+    def __init__(self, github_token, config_path, verbose, force, no_pr, basedir):
+        self.basedir = basedir
+        self.repo = git.Repo(self.basedir)
+        self.username = self._check_requirements()
+        self.github_token = github_token
+        self.config_path = config_path
+        self.verbose = verbose
+        self.force = force
+        self.git = git
+        self.datalad = api
+        self.no_pr = no_pr
+        if self.verbose:
+            print(f"Using base directory {self.basedir}")
+
+    @abc.abstractmethod
+    def get_all_dataset_description(self):
+        """
+        Get the descriptions of relevant datasets from the platform.
+
+        Retrieves the descriptions of datasets that need to be in CONP-datasets
+        from the platform specific to each crawler, like Zenodo, OSF, etc. It is up
+        to the crawler to identify which datasets on the platform should be crawled.
+        The Zenodo crawler uses keywords for this purpose, but other mechanisms
+        could work too.
+
+        Each description is required to have the necessary information in order
+        to build a valid DATS file from it. The following keys are necessary in
+        each description:
+        description["title"]: The name of the dataset, usually one sentence or short description of the dataset
+        description["identifier"]: The identifier of the dataset
+        description["creators"]: The person(s) or organization(s) which contributed to the creation of the dataset
+        description["description"]: A textual narrative comprised of one or more statements describing the dataset
+        description["version"]: A release point for the dataset when applicable
+        description["licenses"]: The terms of use of the dataset
+        description["keywords"]: Tags associated with the dataset, which will help in its discovery
+        description["types"]: A term, ideally from a controlled terminology, identifying the dataset type or nature
+                              of the data, placing it in a typology
+        More fields can be added as long as they comply with the DATS schema available at
+        https://github.com/CONP-PCNO/schema/blob/master/dataset_schema.json
+
+        Any fields/keys not in the schema will be ignored when creating the dataset's DATS.
+        It is fine to add extra information that other methods will use.
+
+        Here are some examples of valid DATS.json files:
+        https://github.com/conp-bot/conp-dataset-Learning_Naturalistic_Structure__Processed_fMRI_dataset/blob/476a1ee3c4df59aca471499b2e492a65bd389a88/DATS.json
+        https://github.com/conp-bot/conp-dataset-MRI_and_unbiased_averages_of_wild_muskrats__Ondatra_zibethicus__and_red_squirrels__Tami/blob/c9e9683fbfec71f44a5fc3576515011f6cd024fe/DATS.json
+        https://github.com/conp-bot/conp-dataset-PERFORM_Dataset__one_control_subject/blob/0b1e271fb4dcc03f9d15f694cc3dfae5c7c2d358/DATS.json
+
+        Returns:
+            List of descriptions of relevant datasets. Each description is a
+            dictionary. For example:
+
+            [{
+                "title": "PERFORM Dataset Example",
+                "description": "PERFORM dataset description",
+                "version": "0.0.1",
+                ...
+            },
+            {
+                "title": "SIMON dataset example",
+                "description": "SIMON dataset description",
+                "version": "1.4.2",
+                ...
+            },
+            ...
+            ]
+        """
+        return []
+
+    @abc.abstractmethod
+    def add_new_dataset(self, dataset_description, dataset_dir):
+        """
+        Configure and add a newly created dataset to the local CONP git repository.
+
+        The BaseCrawler will take care of a few overhead tasks before
+        add_new_dataset() is called, namely:
+        (1) Creating and checking out a dedicated branch for this dataset
+        (2) Initialising a GitHub repository for this dataset
+        (3) Creating an empty datalad dataset for files to be added
+        (4) Keeping README.md and DATS.json out of the annex
+
+        After add_new_dataset() is called, BaseCrawler will:
+        (1) Create a custom DATS.json if it wasn't added in add_new_dataset()
+        (2) Create a custom README.md with get_readme_content() if that file doesn't exist
+        (3) Save and publish all changes
+        (4) Add this dataset as a submodule
+        (5) Create a pull request to CONP-PCNO/conp-dataset
+        (6) Switch back to the master branch
+
+        This means that add_new_dataset() only has to implement a few tasks in the
+        previously initialized datalad dataset directory:
+        (1) Adding any one-time configuration, such as excluding files from the annex
+            or adding a dataset version tracker
+        (2) Downloading and unarchiving relevant archives using datalad.download_url(link, archive=True)
+        (3) Adding file links as symlinks using annex("addurl", link, "--fast", "--file", filename)
+        (A sketch of steps (2) and (3) follows right after this method.)
+
+        There is no need to save/push/publish/create a pull request,
+        as those are done after this function has finished.
+
+        Parameters:
+            dataset_description (dict): Dictionary containing information on the
+                                        dataset retrieved from the platform. Element of
+                                        the list returned by get_all_dataset_description.
+            dataset_dir (str): Local directory path where the newly
+                               created datalad dataset is located.
+        """
+        return
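To make steps (2) and (3) of the task list above concrete, here is a hedged sketch of an add_new_dataset() body, expanding the hypothetical ExamplePlatformCrawler from the previous sketch; the tracker file name, the archive link, and the extra file URL are invented placeholders.

```python
# Hypothetical sketch expanding ExamplePlatformCrawler.add_new_dataset();
# ".example-crawler.json" and the example.org URLs are placeholders.
from git import Repo

from scripts.Crawlers.BaseCrawlerTest import BaseCrawler


class ExamplePlatformCrawler(BaseCrawler):
    # (the other three abstract methods are as in the previous sketch)

    def add_new_dataset(self, dataset_description, dataset_dir):
        d = self.datalad.Dataset(dataset_dir)
        annex = Repo(dataset_dir).git.annex

        # (1) One-time configuration: keep a crawler tracker file out of the annex.
        d.no_annex(".example-crawler.json")

        # (2) Download and unarchive an archive; archive=True extracts it and keeps
        #     the archive registered as a source for the extracted files.
        d.download_url(dataset_description["download_url"], archive=True)

        # (3) Register a remote file by URL only: --fast creates an annexed symlink
        #     without downloading the content, which `datalad get` fetches later.
        annex(
            "addurl",
            "https://example.org/files/participants.csv",
            "--fast",
            "--file",
            "participants.csv",
        )
```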
+    @abc.abstractmethod
+    def update_if_necessary(self, dataset_description, dataset_dir):
+        """
+        Update the dataset if it has been modified on the remote platform.
+
+        Determines whether the local dataset identified by 'identifier'
+        needs to be updated and, if so, updates it.
+
+        Similarly to add_new_dataset(), update_if_necessary() will need to
+        take care of updating the dataset if required:
+        (1) Downloading and unarchiving relevant archives using datalad.download_url(link, archive=True)
+        (2) Adding new file links as symlinks using annex("addurl", link, "--fast", "--file", filename)
+        (3) Updating any tracker files used to determine whether the dataset needs to be updated
+
+        There is no need to save/push/publish/create a pull request,
+        as those are done after this function has finished.
+
+        Parameters:
+            dataset_description (dict): Dictionary containing information on the
+                                        dataset retrieved from the platform.
+                                        Element of the list returned by
+                                        get_all_dataset_description.
+            dataset_dir (str): Directory path of the
+                               previously created datalad dataset
+
+        Returns:
+            bool: True if the dataset was modified, False otherwise
+        """
+        return False
+
+    @abc.abstractmethod
+    def get_readme_content(self, dataset_description):
+        """
+        Returns the content of the README.md in markdown.
+
+        Given the dataset description provided by
+        get_all_dataset_description(), return the content of the
+        README.md in markdown.
+
+        Parameters:
+            dataset_description (dict): Dictionary containing information on the
+                                        dataset retrieved from the platform.
+                                        Element of the list returned by
+                                        get_all_dataset_description.
+
+        Returns:
+            string: Content of the README.md
+        """
+        return ""
+
+    def run(self):
+        """
+        DO NOT OVERRIDE THIS METHOD
+        This method is the entry point for all Crawler classes.
+        It loops through the dataset descriptions collected from get_all_dataset_description(),
+        verifies whether each of those datasets is present locally, and creates a new dataset with add_new_dataset()
+        if not.
If dataset is already existing locally, verify if dataset needs updating + with update_if_necessary() and update if so + """ + dataset_description_list = self.get_all_dataset_description() + for dataset_description in dataset_description_list: + clean_title = self._clean_dataset_title(dataset_description["title"]) + branch_name = "conp-bot/" + clean_title + dataset_dir = os.path.join(self.basedir, "projects", clean_title) + d = self.datalad.Dataset(dataset_dir) + if branch_name not in self.repo.remotes.origin.refs: # New dataset + self.repo.git.checkout("-b", branch_name) + repo_title = ("conp-dataset-" + dataset_description["title"])[0:100] + try: + d.create() + r = d.create_sibling_github( + repo_title, + name="origin", + github_login=self.github_token, + github_passwd=self.github_token, + ) + except Exception as error: + # handle the exception + print("An exception occurred:", error) + + # Add github token to dataset origin remote url + try: + origin = self.repo.remote("origin") + origin_url = next(origin.urls) + if "@" not in origin_url: + origin.set_url( + origin_url.replace( + "https://", + "https://" + self.github_token + "@", + ), + ) + except git.exc.NoSuchPathError: + pass + + self._add_github_repo_description(repo_title, dataset_description) + for pattern in NO_ANNEX_FILE_PATTERNS: + d.no_annex(pattern) + self.add_new_dataset(dataset_description, dataset_dir) + + # Create DATS.json if it exists in directory and 1 level deep subdir + dats_path: str = os.path.join(dataset_dir, "DATS.json") + if existing_dats_path := self._check_file_present( + dataset_dir, "dats.json" + ): + if self.verbose: + print(f"Found existing DATS.json at {existing_dats_path}") + if existing_dats_path != dats_path: + shutil.copy(existing_dats_path, dats_path) + self._add_source_data_submodule_if_derived_from_conp_dataset( + dats_path, dataset_dir + ) + else: + self._create_new_dats( + dataset_dir, + dats_path, + dataset_description, + d, + ) + # Move the logo into the root directory if found in 1 level deep subdir + logo_path = os.path.join(dataset_dir, "logo.png") + if existing_logo_path := self._check_file_present( + dataset_dir, "logo.png" + ): + if self.verbose: + print(f"Found logo at {existing_logo_path}") + if existing_logo_path != logo_path: + os.rename(existing_logo_path, logo_path) + + # Create README.md if it doesn't exist + if not os.path.isfile(os.path.join(dataset_dir, "README.md")): + readme = self.get_readme_content(dataset_description) + self._create_readme(readme, os.path.join(dataset_dir, "README.md")) + d.save() + try: + d.publish(to="origin") + self.repo.git.submodule( + "add", + r[0][1].replace(self.github_token + "@", ""), + dataset_dir, + ) + except Exception as e: + print(f"Skipping publication due to an error: {e}") + modified = True + commit_msg = "Created " + dataset_description["title"] + else: # Dataset already existing locally + self.repo.git.checkout("-f", branch_name) + try: + self.repo.git.merge("-n", "--no-verify", "master") + except Exception as e: + print(f"Error while merging master into {branch_name}: {e}") + print("Skipping this dataset") + self.repo.git.merge("--abort") + self.repo.git.checkout("-f", "master") + continue + + modified = self.update_if_necessary(dataset_description, dataset_dir) + if modified: + # Create DATS.json if it exists in directory and 1 level deep subdir + dats_path: str = os.path.join(dataset_dir, "DATS.json") + if existing_dats_path := self._check_file_present( + dataset_dir, "dats.json" + ): + if self.verbose: + print(f"Found existing 
DATS.json at {existing_dats_path}") + if existing_dats_path != dats_path: + os.rename(existing_dats_path, dats_path) + self._add_source_data_submodule_if_derived_from_conp_dataset( + dats_path, dataset_dir + ) + else: + self._create_new_dats( + dataset_dir, + dats_path, + dataset_description, + d, + ) + # Move the logo into the root directory if found in 1 level deep subdir + logo_path = os.path.join(dataset_dir, "logo.png") + if existing_logo_path := self._check_file_present( + dataset_dir, "logo.png" + ): + if self.verbose: + print(f"Found logo at {existing_logo_path}") + if existing_logo_path != logo_path: + os.rename(existing_logo_path, logo_path) + # Create README.md if it doesn't exist + if not os.path.isfile(os.path.join(dataset_dir, "README.md")): + readme = self.get_readme_content(dataset_description) + self._create_readme( + readme, + os.path.join(dataset_dir, "README.md"), + ) + d.save() + try: + d.publish(to="origin") + except Exception as e: + print(f"Skipping publication due to an error: {e}") + commit_msg = "Updated " + dataset_description["title"] + + # If modification detected in dataset, push to branch and create PR + if modified: + self._push_and_pull_request( + commit_msg, + dataset_dir, + dataset_description["title"], + ) + + # Go back to master + self.repo.git.checkout("master") + + def _add_github_repo_description(self, repo_title, dataset_description): + url = "https://api.github.com/repos/{}/{}".format( + self.username, + repo_title, + ) + head = {"Authorization": "token {}".format(self.github_token)} + description = "Please don't submit any PR to this repository. " + if "creators" in dataset_description.keys(): + description += ( + "If you want to request modifications, please contact " + f"{dataset_description['creators'][0]['name']}" + ) + payload = {"description": description} + r = requests.patch(url, data=json.dumps(payload), headers=head) + if not r.ok: + print( + "Problem adding description to repository {}:".format(repo_title), + ) + print(r.content) + + def _check_requirements(self): + # GitHub user must have a fork of https://github.com/CONP-PCNO/conp-dataset + # Script must be run in the directory of a local clone of this fork + # Git remote 'origin' of local Git clone must point to that fork + # Local Git clone must be set to branch 'master' + if "origin" not in self.repo.remotes: + raise Exception("Remote 'origin' does not exist in current reposition") + origin_url = next(self.repo.remote("origin").urls) + full_name = re.search("github.com[/,:](.*).git", origin_url).group(1) + r = requests.get("http://api.github.com/repos/" + full_name).json() + if not r["fork"] or r["parent"]["full_name"] != "CONP-PCNO/conp-dataset": + raise Exception("Current repository not a fork of CONP-PCNO/conp-dataset") + branch = self.repo.active_branch.name + if branch != "master": + raise Exception("Local git clone active branch not set to 'master'") + + # Return username + return full_name.split("/")[0] + + def _push_and_pull_request(self, msg, dataset_dir, title): + self.repo.git.add(dataset_dir) + self.repo.git.add(".gitmodules") + self.repo.git.commit("-m", "[conp-bot] " + msg) + clean_title = self._clean_dataset_title(title) + origin = self.repo.remote("origin") + origin_url = next(origin.urls) + if "@" not in origin_url: + origin.set_url( + origin_url.replace("https://", "https://" + self.github_token + "@"), + ) + self.repo.git.push("--set-upstream", "origin", "conp-bot/" + clean_title) + + # Create PR + print("Creating PR for " + title) + if not self.no_pr: + r = 
requests.post( + "https://api.github.com/repos/CONP-PCNO/conp-dataset/pulls", + json={ + "title": "Crawler result ({})".format(title), + "body": """## Description +{} + +## Checklist + +Mandatory files and elements: +- [x] A `README.md` file, at the root of the dataset +- [x] A `DATS.json` file, at the root of the dataset +- [ ] If configuration is required (for instance to enable a special remote), + a `config.sh` script at the root of the dataset +- [x] A DOI (see instructions in [contribution guide] +(https://github.com/CONP-PCNO/conp-dataset/blob/master/.github/CONTRIBUTING.md), and corresponding badge in `README.md` + +Functional checks: +- [x] Dataset can be installed using DataLad, recursively if it has sub-datasets +- [x] Every data file has a URL +- [x] Every data file can be retrieved or requires authentication +- [ ] `DATS.json` is a valid DATs model +- [ ] If dataset is derived data, raw data is a sub-dataset +""".format( + msg + "\n", + ), + "head": self.username + ":conp-bot/" + clean_title, + "base": "master", + }, + headers={"Authorization": "token {}".format(self.github_token)}, + ) + if r.status_code != 201: + raise Exception("Error while creating pull request: " + r.text) + + def _clean_dataset_title(self, title): + return re.sub(r"\W|^(?=\d)", "_", title) + + def _create_new_dats(self, dataset_dir, dats_path, dataset, d): + # Helper recursive function + def retrieve_license_path_in_dir(dir, paths): + for f_name in os.listdir(dir): + f_path = os.path.join(dir, f_name) + if os.path.isdir(f_path): + retrieve_license_path_in_dir(f_path, paths) + continue + elif "license" not in f_name.lower(): + continue + elif os.path.islink(f_path): + d.get(f_path) + paths.append(f_path) + + # Check required properties + for field in REQUIRED_DATS_FIELDS: + if field not in dataset.keys(): + print( + "Warning: required property {} not found in dataset description".format( + field, + ), + ) + + # Add all dats properties from dataset description + data = {key: value for key, value in dataset.items() if key in DATS_FIELDS} + + # Check for license code in dataset if a license was not specified from the platform + if "licenses" not in data or ( + len(data["licenses"]) == 1 and data["licenses"][0]["name"].lower() == "none" + ): + # Collect all license file paths + license_f_paths = [] + retrieve_license_path_in_dir(dataset_dir, license_f_paths) + + # If found some license files, for each, check for first valid license code and add to DATS + if license_f_paths: + licenses = set() + for f_path in license_f_paths: + with open(f_path) as f: + text = f.read().lower() + for code in LICENSE_CODES: + if code.lower() in text: + licenses.add(code) + break + data["licenses"] = [{"name": code} for code in licenses] + + # Add file count + num = 0 + for file in os.listdir(dataset_dir): + file_path = os.path.join(dataset_dir, file) + if ( + file[0] == "." 
+ or file == "DATS.json" + or file == "README.md" + or file == "logo.png" + ): + continue + elif os.path.isdir(file_path): + num += sum([len(files) for r, d, files in os.walk(file_path)]) + else: + num += 1 + if "extraProperties" not in data.keys(): + data["extraProperties"] = [ + {"category": "files", "values": [{"value": str(num)}]}, + ] + else: + data["extraProperties"].append( + {"category": "files", "values": [{"value": str(num)}]}, + ) + + # Retrieve modalities from files + file_paths = map( + lambda x: x.split(" ")[-1], + filter( + lambda x: " " in x, + git.Repo(dataset_dir).git.annex("list").split("\n"), + ), + ) # Get file paths + file_names = list( + map(lambda x: x.split("/")[-1] if "/" in x else x, file_paths), + ) # Get file names from path + modalities = {self._guess_modality(file_name) for file_name in file_names} + if len(modalities) == 0: + modalities.add("unknown") + elif len(modalities) > 1 and "unknown" in modalities: + modalities.remove("unknown") + if "types" not in data.keys(): + data["types"] = [{"value": modality} for modality in modalities] + else: + for modality in modalities: + data["types"].append({"value": modality}) + + # Create file + with open(dats_path, "w") as f: + json.dump(data, f, indent=4) + + def _guess_modality(self, file_name): + # Associate file types to substrings found in the file name + for m in MODALITIES: + for s in MODALITIES[m]: + if s in file_name: + return m + return "unknown" + + def _create_readme(self, content, path): + with open(path, "w") as f: + f.write(content) + + def _check_file_present(self, directory, filename): + for file_name in os.listdir(directory): + file_path: str = os.path.join(directory, file_name) + if os.path.isdir(file_path): + for subfile_name in os.listdir(file_path): + if subfile_name.lower() == filename.lower(): + return os.path.join(file_path, subfile_name) + elif file_name.lower() == filename.lower(): + return file_path + + def _add_source_data_submodule_if_derived_from_conp_dataset( + self, dats_json, dataset_dir + ): + with open(dats_json) as f: + metadata = json.loads(f.read()) + + source_dataset_link = None + source_dataset_id = None + if "extraProperties" not in metadata.keys(): + return + for property in metadata["extraProperties"]: + if property["category"] == "derivedFrom": + try: + source_dataset_link = property["values"][0]["value"] + except (KeyError, IndexError): + continue + if property["category"] == "parent_dataset_id": + try: + source_dataset_id = property["values"][0]["value"] + except (KeyError, IndexError): + continue + + if source_dataset_link is not None and "github.com" in source_dataset_link: + d = self.datalad.Dataset(os.path.join(dataset_dir, source_dataset_id)) + d.create() diff --git a/scripts/Crawlers/OSFCrawlerTest.py b/scripts/Crawlers/OSFCrawlerTest.py new file mode 100644 index 000000000..5bb16d935 --- /dev/null +++ b/scripts/Crawlers/OSFCrawlerTest.py @@ -0,0 +1,661 @@ +import datetime +import json +import os +import time +from typing import Any +from typing import Callable +from typing import Dict +from typing import List +from typing import Optional + +import humanize +import requests +from datalad.distribution.dataset import Dataset +from datalad.support.exceptions import IncompleteResultsError +from git import Repo +from requests.exceptions import HTTPError + +from scripts.Crawlers.BaseCrawlerTest import BaseCrawler + + +def _create_osf_tracker(path, dataset): + with open(path, "w") as f: + data = { + "version": dataset["version"], + "title": dataset["title"], + } + 
json.dump(data, f, indent=4)
+
+
+class OSFCrawler(BaseCrawler):
+    def __init__(self, github_token, config_path, verbose, force, no_pr, basedir):
+        super().__init__(github_token, config_path, verbose, force, no_pr, basedir)
+        self.osf_token = self._get_token()
+
+    def _get_token(self):
+        if os.path.isfile(self.config_path):
+            with open(self.config_path) as f:
+                data = json.load(f)
+                if "osf_token" in data.keys():
+                    return data["osf_token"]
+
+    def _get_request_with_bearer_token(self, link, redirect=True, retries=5):
+        header = {"Authorization": f"Bearer {self.osf_token}"}
+        attempt = 0
+        while attempt < retries:
+            try:
+                r = requests.get(link, headers=header, allow_redirects=redirect)
+                r.raise_for_status()  # Raises an exception for 4xx and 5xx responses
+                return r  # Return the response if everything went well
+            except HTTPError as http_err:
+                print(f"HTTP error occurred: {http_err} - Response: {r.text}")
+                if r.status_code == 503:  # Handle 503 errors specifically
+                    print(
+                        f"Request to {r.url} failed with 503 Service Unavailable, retrying..."
+                    )
+                    attempt += 1
+                    time.sleep(2**attempt)  # Exponential backoff
+                    continue
+                if r.status_code == 502:  # Handle 502 errors specifically
+                    print(
+                        f"Request to {r.url} failed with 502 Bad Gateway, skipping download."
+                    )
+                    return None  # Return None so the calling code can continue
+                else:
+                    raise Exception(
+                        f"HTTP error occurred: {http_err} - {r.status_code}"
+                    )  # Re-raise for any other HTTP error
+            except Exception as err:
+                raise Exception(f"An error occurred: {err}")
+
+    def _query_osf(self):
+        query = "https://api.osf.io/v2/nodes/?filter[tags]=canadian-open-neuroscience-platform"
+        r_json = self._get_request_with_bearer_token(query).json()
+        results = r_json["data"]
+
+        # Retrieve results from other pages
+        if r_json["links"]["meta"]["total"] > r_json["links"]["meta"]["per_page"]:
+            next_page = r_json["links"]["next"]
+            while next_page is not None:
+                next_page_json = self._get_request_with_bearer_token(next_page).json()
+                results.extend(next_page_json["data"])
+                next_page = next_page_json["links"]["next"]
+
+        if self.verbose:
+            print("OSF query: {}".format(query))
+        return results
+
+    def _download_files(
+        self,
+        link,
+        current_dir,
+        inner_path,
+        d,
+        annex,
+        sizes,
+        is_private=False,
+    ):
+        response = self._get_request_with_bearer_token(link)
+        if response is None:
+            print(f"Skipping download for {link} due to a failed request.")
+            return
+        print("first download", response)
+        r_json = response.json()
+        files = r_json["data"]
+
+        # Retrieve the files in the other pages if there is more than one page
+        if (
+            "links" in r_json.keys()
+            and r_json["links"]["meta"]["total"] > r_json["links"]["meta"]["per_page"]
+        ):
+            print("Retrieving next pages of files")
+            next_page = r_json["links"]["next"]
+            while next_page is not None:
+                response = self._get_request_with_bearer_token(next_page)
+                if response is None:
+                    print(f"Skipping page {next_page} due to a failed request.")
+                    break
+
+                next_page_json = response.json()
+                files.extend(next_page_json["data"])
+                next_page = next_page_json["links"]["next"]
+
+        for file in files:
+            # Handle folders
+            if file["attributes"]["kind"] == "folder":
+                folder_path = os.path.join(current_dir, file["attributes"]["name"])
+                # Conditions added by Alex
+                if not os.path.exists(folder_path):
+                    os.mkdir(folder_path)
+                    self._download_files(
+                        file["relationships"]["files"]["links"]["related"]["href"],
+                        folder_path,
+                        os.path.join(inner_path, file["attributes"]["name"]),
+                        d,
+                        annex,
+                        sizes,
+                        is_private,
+                    )
+                else:
+                    print(f"The folder {folder_path} already exists.")
+
+            # Handle single files
+            elif file["attributes"]["kind"] == "file":
+                try:
+                    # Private dataset/files
+                    if is_private:
+                        correct_download_link = self._get_request_with_bearer_token(
+                            file["links"]["download"],
+                            redirect=False,
+                        )
+                        if correct_download_link is not None:
+                            correct_download_link = correct_download_link.headers[
+                                "location"
+                            ]
+                            if (
+                                "https://accounts.osf.io/login"
+                                not in correct_download_link
+                            ):
+                                zip_file = (
+                                    True
+                                    if file["attributes"]["name"].split(".")[-1]
+                                    == "zip"
+                                    else False
+                                )
+                                d.download_url(
+                                    correct_download_link,
+                                    path=os.path.join(inner_path, ""),
+                                    archive=zip_file,
+                                )
+                            else:  # Token did not work for downloading file, return
+                                file = file["links"]["download"]
+                                print(
+                                    f"Unable to download file {file} with current token, skipping file",
+                                )
+                                return
+
+                    # Public file
+                    else:
+                        # Handle zip files
+                        if file["attributes"]["name"].split(".")[-1] == "zip":
+                            d.download_url(
+                                file["links"]["download"],
+                                path=os.path.join(inner_path, ""),
+                                archive=True,
+                            )
+                        else:
+                            d.download_url(
+                                file["links"]["download"],
+                                path=os.path.join(inner_path, ""),
+                            )
+
+                except IncompleteResultsError as e:
+                    print(
+                        f"Skipping file {file['links']['download']} due to error: {e}"
+                    )
+                    continue  # Skip this file and move on to the next one
+
+                # append the size of the downloaded file to the sizes array
+                file_size = file["attributes"]["size"]
+                if not file_size:
+                    # if the file size cannot be found in the OSF API response, then get it from git annex info
+                    inner_file_path = os.path.join(
+                        inner_path,
+                        file["attributes"]["name"],
+                    )
+                    annex_info_dict = json.loads(
+                        annex("info", "--bytes", "--json", inner_file_path),
+                    )
+                    file_size = int(annex_info_dict.get("size", 0))
+                sizes.append(file_size)
+
+    def _download_components(
+        self,
+        components_list,
+        current_dir,
+        inner_path,
+        d,
+        annex,
+        dataset_size,
+        is_private,
+    ):
+        # Loop through each available component and download its files
+        for component in components_list:
+            component_title = self._clean_dataset_title(
+                component["attributes"]["title"],
+            )
+            component_inner_path = os.path.join(
+                inner_path,
+                "components",
+                component_title,
+            )
+            os.makedirs(os.path.join(current_dir, component_inner_path))
+            self._download_files(
+                component["relationships"]["files"]["links"]["related"]["href"],
+                os.path.join(current_dir, component_inner_path),
+                component_inner_path,
+                d,
+                annex,
+                dataset_size,
+                is_private,
+            )
+
+            # Check if the component contains (sub)components; if so, download the (sub)components' data
+            subcomponents_list = self._get_components(
+                component["relationships"]["children"]["links"]["related"]["href"],
+            )
+            if subcomponents_list:
+                self._download_components(
+                    subcomponents_list,
+                    current_dir,
+                    os.path.join(component_inner_path),
+                    d,
+                    annex,
+                    dataset_size,
+                    is_private,
+                )
+
+        # Once all component files have been downloaded, remove any empty directories
+        # (in case the 'OSF parent' dataset did not have any downloadable files)
+        list_of_empty_dirs = [
+            dirpath
+            for (dirpath, dirnames, filenames) in os.walk(current_dir)
+            if len(dirnames) == 0 and len(filenames) == 0
+        ]
+        for empty_dir in list_of_empty_dirs:
+            os.rmdir(empty_dir)
+
+    def _get_contributors(self, link):
+        r = self._get_request_with_bearer_token(link)
+        contributors = [
+            contributor["embeds"]["users"]["data"]["attributes"]["full_name"]
+            for contributor in r.json()["data"]
+        ]
+        return contributors
+
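As a reading aid, here is the rough shape of the OSF JSON:API payload that _get_contributors() above unwraps, reconstructed from the keys it indexes; the values are invented and only the nesting is meaningful.

```python
# Invented example payload: only the key nesting mirrors what _get_contributors()
# expects from the OSF contributors endpoint (with embedded user records).
sample_response = {
    "data": [
        {
            "embeds": {
                "users": {
                    "data": {"attributes": {"full_name": "Jane Example"}},
                },
            },
        },
    ],
}

names = [
    contributor["embeds"]["users"]["data"]["attributes"]["full_name"]
    for contributor in sample_response["data"]
]
assert names == ["Jane Example"]
```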
def _get_license(self, link): + r = self._get_request_with_bearer_token(link) + return r.json()["data"]["attributes"]["name"] + + def _get_components(self, link): + r = self._get_request_with_bearer_token(link) + return r.json()["data"] + + def _get_wiki(self, link) -> Optional[str]: + r = self._get_request_with_bearer_token(link) + data = r.json()["data"] + if len(data) > 0: + return self._get_request_with_bearer_token( + data[0]["links"]["download"] + ).content.decode() + + def _get_institutions(self, link): + r = self._get_request_with_bearer_token(link) + if r.json()["data"]: + institutions = [ + institution["attributes"]["name"] for institution in r.json()["data"] + ] + return institutions + + def _get_identifier(self, link): + r = self._get_request_with_bearer_token(link) + return r.json()["data"][0]["attributes"]["value"] if r.json()["data"] else False + + def get_all_dataset_description(self): + osf_dois = [] + datasets = self._query_osf() + for dataset in datasets: + # skip datasets that have a parent since the files' components will + # go into the parent dataset. + # print("parent" in dataset["relationships"].keys()) + if "parent" in dataset["relationships"].keys(): + print(dataset["relationships"]["parent"]) + # continue + + attributes = dataset["attributes"] + + # Retrieve keywords/tags + keywords = list(map(lambda x: {"value": x}, attributes["tags"])) + + # Retrieve contributors/creators + contributors = self._get_contributors( + dataset["relationships"]["contributors"]["links"]["related"]["href"], + ) + + # Retrieve license + license_ = "None" + if "license" in dataset["relationships"].keys(): + license_ = self._get_license( + dataset["relationships"]["license"]["links"]["related"]["href"], + ) + + # Retrieve institution information + institutions = self._get_institutions( + dataset["relationships"]["affiliated_institutions"]["links"]["related"][ + "href" + ], + ) + + # Retrieve identifier information + identifier = self._get_identifier( + dataset["relationships"]["identifiers"]["links"]["related"]["href"], + ) + + # Get link for the dataset files + files_link = dataset["relationships"]["files"]["links"]["related"]["href"] + + # Get components list + components_list = self._get_components( + dataset["relationships"]["children"]["links"]["related"]["href"], + ) + + # Get wiki to put in README + wiki: Optional[str] = None + try: + wiki = self._get_wiki( + dataset["relationships"]["wikis"]["links"]["related"]["href"] + ) + except Exception as e: + print(f'Error getting wiki for {attributes["title"]} because of {e}') + + # Gather extra properties + extra_properties = [ + { + "category": "logo", + "values": [ + { + "value": "https://osf.io/static/img/institutions/shields/cos-shield.png", + }, + ], + }, + ] + if institutions: + extra_properties.append( + { + "category": "origin_institution", + "values": list( + map(lambda x: {"value": x}, institutions), + ), + }, + ) + + # Retrieve dates + date_created = datetime.datetime.strptime( + attributes["date_created"], + "%Y-%m-%dT%H:%M:%S.%f", + ) + date_modified = datetime.datetime.strptime( + attributes["date_modified"], + "%Y-%m-%dT%H:%M:%S.%f", + ) + + dataset_dats_content = { + "title": attributes["title"], + "files": files_link, + "components_list": components_list, + "homepage": dataset["links"]["html"], + "creators": list( + map(lambda x: {"name": x}, contributors), + ), + "description": attributes["description"], + "wiki": wiki, + "version": attributes["date_modified"], + "licenses": [ + { + "name": license_, + }, + ], + 
"dates": [ + { + "date": date_created.strftime("%Y-%m-%d %H:%M:%S"), + "type": { + "value": "date created", + }, + }, + { + "date": date_modified.strftime("%Y-%m-%d %H:%M:%S"), + "type": { + "value": "date modified", + }, + }, + ], + "keywords": keywords, + "distributions": [ + { + "size": 0, + "unit": {"value": "B"}, + "access": { + "landingPage": dataset["links"]["html"], + "authorizations": [ + { + "value": "public" + if attributes["public"] + else "private", + }, + ], + }, + }, + ], + "extraProperties": extra_properties, + } + + if identifier: + source = "OSF DOI" if "OSF.IO" in identifier else "DOI" + dataset_dats_content["identifier"] = { + "identifier": identifier, + "identifierSource": source, + } + + osf_dois.append(dataset_dats_content) + + if self.verbose: + print("Retrieved OSF DOIs: ") + for osf_doi in osf_dois: + print( + "- Title: {}, Last modified: {}".format( + osf_doi["title"], + osf_doi["version"], + ), + ) + + return osf_dois + + def add_new_dataset(self, dataset: Dict[str, Any], dataset_dir: str): + d: Dataset = self.datalad.Dataset(dataset_dir) + d.no_annex(".conp-osf-crawler.json") + d.save() + annex: Callable = Repo(dataset_dir).git.annex + dataset_size: List[int] = [] + + # Setup private OSF dataset if the dataset is private + is_private: bool = self._setup_private_dataset( + dataset["files"], + dataset_dir, + annex, + d, + ) + self._download_files( + dataset["files"], + dataset_dir, + "", + d, + annex, + dataset_size, + is_private, + ) + if dataset["components_list"]: + self._download_components( + dataset["components_list"], + dataset_dir, + "", + d, + annex, + dataset_size, + is_private, + ) + dataset_size_num, dataset_unit = humanize.naturalsize(sum(dataset_size)).split( + " ", + ) + dataset["distributions"][0]["size"] = float(dataset_size_num) + dataset["distributions"][0]["unit"]["value"] = dataset_unit + + # Add .conp-osf-crawler.json tracker file + _create_osf_tracker( + os.path.join(dataset_dir, ".conp-osf-crawler.json"), + dataset, + ) + # Tenter de publier sur le remote 'origin' + try: + d.publish(to="origin") + except IncompleteResultsError as e: + print(f"Skipping publication due to error: {e}") + except Exception as e: + print(f"An unexpected error occurred during publication: {e}") + + def update_if_necessary(self, dataset_description, dataset_dir): + tracker_path = os.path.join(dataset_dir, ".conp-osf-crawler.json") + if not os.path.isfile(tracker_path): + print("{} does not exist in dataset, skipping".format(tracker_path)) + return False + with open(tracker_path) as f: + tracker = json.load(f) + if tracker["version"] == dataset_description["version"]: + # Same version, no need to update + if self.verbose: + print( + "{}, version {} same as OSF version DOI ({}), no need to update".format( + dataset_description["title"], + dataset_description["version"], + tracker["version"], + ), + ) + return False + + # Update dataset + if self.verbose: + print( + "{}, version {} different from OSF version DOI {}, updating".format( + dataset_description["title"], + tracker["version"], + dataset_description["version"], + ), + ) + + # Remove all data and DATS.json files + for file_name in os.listdir(dataset_dir): + if file_name[0] == ".": + continue + self.datalad.remove(os.path.join(dataset_dir, file_name), check=False) + + d = self.datalad.Dataset(dataset_dir) + annex = Repo(dataset_dir).git.annex + + dataset_size = [] + is_private: bool = self._is_private_dataset(dataset_description["files"]) + self._download_files( + dataset_description["files"], + dataset_dir, + 
"", + d, + annex, + dataset_size, + is_private, + ) + if dataset_description["components_list"]: + self._download_components( + dataset_description["components_list"], + dataset_dir, + "", + d, + annex, + dataset_size, + is_private, + ) + dataset_size, dataset_unit = humanize.naturalsize(sum(dataset_size)).split( + " ", + ) + dataset_description["distributions"][0]["size"] = float(dataset_size) + dataset_description["distributions"][0]["unit"]["value"] = dataset_unit + + # Add .conp-osf-crawler.json tracker file + _create_osf_tracker( + os.path.join(dataset_dir, ".conp-osf-crawler.json"), + dataset_description, + ) + + return True + + def get_readme_content(self, dataset): + readme_content = ( + f'# {dataset["title"]}\n\nCrawled from [OSF]({dataset["homepage"]})' + ) + + if "description" in dataset and dataset["description"]: + readme_content += f'\n\n## Description\n\n{dataset["description"]}' + + if "identifier" in dataset and dataset["identifier"]: + readme_content += f'\n\n## DOI: {dataset["identifier"]["identifier"]}' + + if "wiki" in dataset and dataset["wiki"]: + readme_content += f'\n\n## WIKI\n\n{dataset["wiki"]}' + + return readme_content + + def _setup_private_dataset( + self, + files_url: str, + dataset_dir: str, + annex: Callable, + dataset: Dataset, + ) -> bool: + # Check if the dataset is indeed private + if self._is_private_dataset(files_url): + if self.verbose: + print( + "Dataset is private, creating OSF provider and make git annex autoenable datalad remote", + ) + + # Create OSF provider file and needed directories and don't annex the file + datalad_dir: str = os.path.join(dataset_dir, ".datalad") + if not os.path.exists(datalad_dir): + os.mkdir(datalad_dir) + providers_dir: str = os.path.join(datalad_dir, "providers") + if not os.path.exists(providers_dir): + os.mkdir(providers_dir) + osf_config_path: str = os.path.join(providers_dir, "OSF.cfg") + with open(osf_config_path, "w") as f: + f.write( + """[provider:OSF] +url_re = .*osf\\.io.* +authentication_type = bearer_token +credential = OSF + +[credential:OSF] +# If known, specify URL or email to how/where to request credentials +# url = ??? 
+type = token""" + ) + dataset.no_annex(os.path.join("**", "OSF.cfg")) + + # Make git annex autoenable datalad remote + annex( + "initremote", + "datalad", + "externaltype=datalad", + "type=external", + "encryption=none", + "autoenable=true", + ) + + # Set OSF token as a environment variable for authentication + os.environ["DATALAD_OSF_token"] = self.osf_token + + # Save changes + dataset.save() + + return True + + return False + + def _is_private_dataset(self, files_url) -> bool: + return True if requests.get(files_url).status_code == 401 else False diff --git a/scripts/Crawlers/ZenodoCrawler.py b/scripts/Crawlers/ZenodoCrawler.py index 8878b8e82..4a8c8f44e 100644 --- a/scripts/Crawlers/ZenodoCrawler.py +++ b/scripts/Crawlers/ZenodoCrawler.py @@ -17,10 +17,10 @@ def _create_zenodo_tracker(path, dataset): with open(path, "w") as f: data = { "zenodo": { - "concept_doi": dataset["concept_doi"], - "version": dataset["latest_version"], + "concept_doi": dataset.get("concept_doi"), + "version": dataset.get("latest_version"), }, - "title": dataset["title"], + "title": dataset.get("title"), } json.dump(data, f, indent=4) @@ -39,7 +39,7 @@ def _get_tokens(self): with open(self.config_path) as f: data = json.load(f) if "zenodo_tokens" in data.keys(): - return data["zenodo_tokens"] + return data.get("zenodo_tokens") else: return {} @@ -50,9 +50,9 @@ def _query_zenodo(self): 'q=keywords:"canadian-open-neuroscience-platform"' ) r_json = requests.get(query).json() - results = r_json["hits"]["hits"] + results = r_json.get("hits", {}).get("hits") - if r_json["links"]["next"]: + if r_json and r_json.get("links", {}).get("next"): next_page = r_json["links"]["next"] while next_page is not None: next_page_json = requests.get(next_page).json() @@ -77,7 +77,7 @@ def _download_file(self, bucket, d, is_private): file_size: int = bucket.get("size", 0) if self.verbose: print(f"Downloading {link} as {file_name} of size {file_size}") - d.download_url(link, archive=True if bucket["type"] == "zip" else False) + d.download_url(link, archive=True if bucket.get("type") == "zip" else False) def get_all_dataset_description(self): zenodo_dois = [] @@ -124,21 +124,25 @@ def get_all_dataset_description(self): for bucket in dataset["files"]: files.append(bucket) - latest_version_doi = metadata["relations"]["version"][0]["last_child"][ - "pid_value" - ] + latest_version_doi = None + version = metadata.get("relations", {}).get("version", []) + if len(version): + latest_version_doi = version[0].get("last_child", {}).get("pid_value") # Retrieve and clean file formats/extensions file_formats = ( - list(set(map(lambda x: x["type"], files))) if len(files) > 0 else None + list(set(map(lambda x: os.path.splitext(x.get("key"))[1][1:], files))) + if len(files) > 0 + else [] ) + if "" in file_formats: file_formats.remove("") # Retrieve and clean file keywords keywords = [] if "keywords" in metadata.keys(): - keywords = list(map(lambda x: {"value": x}, metadata["keywords"])) + keywords = list(map(lambda x: {"value": x}, metadata.get("keywords"))) # Retrieve subject annotations from Zenodo and clean the annotated # subjects to insert in isAbout of DATS file @@ -231,12 +235,12 @@ def get_all_dataset_description(self): file_format.upper() for file_format in file_formats # Do not modify specific file formats. 
- if file_formats not in ["NIfTI", "BigWig"] + if file_format not in ["NIfTI", "BigWig"] ], "size": dataset_size, "unit": {"value": dataset_unit}, "access": { - "landingPage": dataset["links"]["html"], + "landingPage": dataset.get("links", {}).get("html"), "authorizations": [ { "value": "public"