diff --git a/scripts/Crawlers/BaseCrawlerTest.py b/scripts/Crawlers/BaseCrawlerTest.py
new file mode 100644
index 000000000..f05ab9b92
--- /dev/null
+++ b/scripts/Crawlers/BaseCrawlerTest.py
@@ -0,0 +1,627 @@
+import abc
+import json
+import os
+import re
+import shutil
+
+import git
+import requests
+from datalad import api
+
+from scripts.Crawlers.constants import DATS_FIELDS
+from scripts.Crawlers.constants import LICENSE_CODES
+from scripts.Crawlers.constants import MODALITIES
+from scripts.Crawlers.constants import NO_ANNEX_FILE_PATTERNS
+from scripts.Crawlers.constants import REQUIRED_DATS_FIELDS
+
+
+class BaseCrawler:
+    """
+    Interface to extend conp-dataset crawlers.
+
+    ==================
+    Overview
+    ==================
+
+    Any crawler created from this interface crawls datasets from a specific
+    remote platform. This base class implements the functions common to all
+    crawler backends, in particular:
+    (1) verify that the correct fork of conp-dataset is used,
+    (2) create and switch to a new branch for each new dataset,
+    (3) keep README and DATS files out of the annex,
+    (4) create new datalad datasets,
+    (5) publish to the GitHub repository,
+    (6) create pull requests.
+
+    Method run(), implemented in the base class, is the entry point to any crawler.
+    It does the following things:
+    (1) It calls abstract method get_all_dataset_description(), which must be implemented
+        by the crawler in the child class. get_all_dataset_description() retrieves from
+        the remote platform all the information needed for each dataset that should be
+        added to or updated in conp-dataset.
+    (2) It iterates through each dataset description and switches to a dedicated git
+        branch for each dataset.
+    (2.a) If the dataset is new, the base class will create a new branch,
+          an empty datalad repository, unannex DATS.json and README.md, and create an
+          empty GitHub repository. It will then call abstract method add_new_dataset(),
+          which adds/downloads all dataset files under the given directory.
+          The crawler will then add a custom DATS.json and README.md if those weren't added.
+          Creating the README.md requires get_readme_content() to be implemented, which
+          returns the content of the README.md in markdown format. The crawler will then save
+          and publish all changes to the newly created repository. It will also handle adding
+          a new submodule to .gitmodules and creating a pull request to CONP-PCNO/conp-dataset.
+    (2.b) If the dataset already exists, verified by the existence of its corresponding branch,
+          the base class will call abstract method update_if_necessary(), which verifies
+          whether the dataset requires updating and updates it if so. If the dataset was updated,
+          this method returns True, which triggers saving and publishing the new content to the
+          dataset's repository, creating a new DATS.json if it doesn't exist, and creating a pull
+          request to CONP-PCNO/conp-dataset.
+
+    ==================
+    How to implement a new crawler
+    ==================
+
+    (1) Create a class deriving from BaseCrawler.
+    (2) Implement the four abstract methods:
+        * get_all_dataset_description,
+        * add_new_dataset,
+        * update_if_necessary,
+        * get_readme_content.
+        See docstrings of each method for specifications.
+    (3) In crawl.py, locate the comment where it says to instantiate new crawlers,
+        instantiate this new Crawler and call run() on it.
+    (A minimal skeleton illustrating these steps is sketched right after this class docstring.)
+    """
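As an illustration of the implementation steps above, here is a minimal, hypothetical crawler skeleton. The class name ExamplePlatformCrawler, the query URL, and the response fields (record["name"], record["archive_url"], etc.) are invented for this sketch; only the BaseCrawler API it relies on (the four abstract methods, self.datalad, Dataset.download_url) comes from the class in this patch.

```python
# Hypothetical sketch: "ExamplePlatformCrawler" and its REST API are invented to
# illustrate the four abstract methods; adapt the query and field mapping to the
# real platform being crawled.
import requests

from scripts.Crawlers.BaseCrawlerTest import BaseCrawler


class ExamplePlatformCrawler(BaseCrawler):
    def get_all_dataset_description(self):
        # Query the (hypothetical) platform and map each record to the keys
        # needed to build a valid DATS file.
        records = requests.get("https://example.org/api/datasets?tag=conp").json()
        return [
            {
                "title": record["name"],
                "identifier": record["id"],
                "creators": [{"name": author} for author in record["authors"]],
                "description": record["summary"],
                "version": record["updated"],
                "licenses": [{"name": record.get("license", "None")}],
                "keywords": [{"value": tag} for tag in record.get("tags", [])],
                "types": [{"value": "dataset"}],
                # Extra, non-DATS keys are allowed; later methods can reuse them.
                "download_url": record["archive_url"],
            }
            for record in records
        ]

    def add_new_dataset(self, dataset_description, dataset_dir):
        # Download and extract the platform archive into the datalad dataset
        # (a fuller sketch follows the add_new_dataset() docstring below).
        d = self.datalad.Dataset(dataset_dir)
        d.download_url(dataset_description["download_url"], archive=True)

    def update_if_necessary(self, dataset_description, dataset_dir):
        # A real crawler would compare a tracker file against the remote version.
        return False

    def get_readme_content(self, dataset_description):
        return f"# {dataset_description['title']}\n\nCrawled from Example Platform."
```

Step (3) then amounts to instantiating this class in crawl.py next to the existing crawlers, with the same constructor arguments as the other crawlers (github_token, config_path, verbose, force, no_pr, basedir), and calling run() on it.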
+
+    def __init__(self, github_token, config_path, verbose, force, no_pr, basedir):
+        self.basedir = basedir
+        self.repo = git.Repo(self.basedir)
+        self.username = self._check_requirements()
+        self.github_token = github_token
+        self.config_path = config_path
+        self.verbose = verbose
+        self.force = force
+        self.git = git
+        self.datalad = api
+        self.no_pr = no_pr
+        if self.verbose:
+            print(f"Using base directory {self.basedir}")
+
+    @abc.abstractmethod
+    def get_all_dataset_description(self):
+        """
+        Get the descriptions of relevant datasets from the platform.
+
+        Retrieves the descriptions of datasets that need to be in CONP-datasets
+        from the platform specific to each crawler, like Zenodo, OSF, etc. It is up
+        to the crawler to identify which datasets on the platform should be crawled.
+        The Zenodo crawler uses keywords for this purpose, but other mechanisms
+        could work too.
+
+        Each description is required to have the necessary information in order
+        to build a valid DATS file from it. The following keys are necessary in
+        each description:
+        description["title"]: The name of the dataset, usually one sentence or short description of the dataset
+        description["identifier"]: The identifier of the dataset
+        description["creators"]: The person(s) or organization(s) which contributed to the creation of the dataset
+        description["description"]: A textual narrative comprised of one or more statements describing the dataset
+        description["version"]: A release point for the dataset when applicable
+        description["licenses"]: The terms of use of the dataset
+        description["keywords"]: Tags associated with the dataset, which will help in its discovery
+        description["types"]: A term, ideally from a controlled terminology, identifying the dataset type or nature
+                              of the data, placing it in a typology
+        More fields can be added as long as they comply with the DATS schema available at
+        https://github.com/CONP-PCNO/schema/blob/master/dataset_schema.json
+
+        Any fields/keys not in the schema will be ignored when creating the dataset's DATS.
+        It is fine to add extra information that other methods will use.
+
+        Here are some examples of valid DATS.json files:
+        https://github.com/conp-bot/conp-dataset-Learning_Naturalistic_Structure__Processed_fMRI_dataset/blob/476a1ee3c4df59aca471499b2e492a65bd389a88/DATS.json
+        https://github.com/conp-bot/conp-dataset-MRI_and_unbiased_averages_of_wild_muskrats__Ondatra_zibethicus__and_red_squirrels__Tami/blob/c9e9683fbfec71f44a5fc3576515011f6cd024fe/DATS.json
+        https://github.com/conp-bot/conp-dataset-PERFORM_Dataset__one_control_subject/blob/0b1e271fb4dcc03f9d15f694cc3dfae5c7c2d358/DATS.json
+
+        Returns:
+            List of descriptions of relevant datasets. Each description is a
+            dictionary. For example:
+
+            [{
+                "title": "PERFORM Dataset Example",
+                "description": "PERFORM dataset description",
+                "version": "0.0.1",
+                ...
+            },
+            {
+                "title": "SIMON dataset example",
+                "description": "SIMON dataset description",
+                "version": "1.4.2",
+                ...
+            },
+            ...
+            ]
+        """
+        return []
+
+    @abc.abstractmethod
+    def add_new_dataset(self, dataset_description, dataset_dir):
+        """
+        Configure and add a newly created dataset to the local CONP git repository.
+
+        The BaseCrawler will take care of a few overhead tasks before
+        add_new_dataset() is called, namely:
+        (1) Creating and checking out a dedicated branch for this dataset
+        (2) Initialising a GitHub repository for this dataset
+        (3) Creating an empty datalad dataset for files to be added
+        (4) Keeping README.md and DATS.json out of the annex
+
+        After add_new_dataset() is called, BaseCrawler will:
+        (1) Create a custom DATS.json if it wasn't added in add_new_dataset()
+        (2) Create a custom README.md with get_readme_content() if that file doesn't exist
+        (3) Save and publish all changes
+        (4) Add this dataset as a submodule
+        (5) Create a pull request to CONP-PCNO/conp-dataset
+        (6) Switch back to the master branch
+
+        This means that add_new_dataset() only has to implement a few tasks in the
+        previously initialized datalad dataset directory:
+        (1) Adding any one-time configuration, such as excluding files from the annex
+            or adding a dataset version tracker
+        (2) Downloading and unarchiving relevant archives using datalad.download_url(link, archive=True)
+        (3) Adding file links as symlinks using annex("addurl", link, "--fast", "--file", filename)
+        (A sketch of steps (2) and (3) follows right after this method.)
+
+        There is no need to save/push/publish/create a pull request,
+        as those are done after this function has finished.
+
+        Parameters:
+            dataset_description (dict): Dictionary containing information on the
+                                        dataset retrieved from the platform. Element of
+                                        the list returned by get_all_dataset_description.
+            dataset_dir (str): Local directory path where the newly
+                               created datalad dataset is located.
+        """
+        return
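To make steps (2) and (3) of the task list above concrete, here is a hedged sketch of an add_new_dataset() body, expanding the hypothetical ExamplePlatformCrawler from the previous sketch; the tracker file name, the archive link, and the extra file URL are invented placeholders.

```python
# Hypothetical sketch expanding ExamplePlatformCrawler.add_new_dataset();
# ".example-crawler.json" and the example.org URLs are placeholders.
from git import Repo

from scripts.Crawlers.BaseCrawlerTest import BaseCrawler


class ExamplePlatformCrawler(BaseCrawler):
    # (the other three abstract methods are as in the previous sketch)

    def add_new_dataset(self, dataset_description, dataset_dir):
        d = self.datalad.Dataset(dataset_dir)
        annex = Repo(dataset_dir).git.annex

        # (1) One-time configuration: keep a crawler tracker file out of the annex.
        d.no_annex(".example-crawler.json")

        # (2) Download and unarchive an archive; archive=True extracts it and keeps
        #     the archive registered as a source for the extracted files.
        d.download_url(dataset_description["download_url"], archive=True)

        # (3) Register a remote file by URL only: --fast creates an annexed symlink
        #     without downloading the content, which `datalad get` fetches later.
        annex(
            "addurl",
            "https://example.org/files/participants.csv",
            "--fast",
            "--file",
            "participants.csv",
        )
```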
+    @abc.abstractmethod
+    def update_if_necessary(self, dataset_description, dataset_dir):
+        """
+        Update the dataset if it has been modified on the remote platform.
+
+        Determines whether the local dataset identified by 'identifier'
+        needs to be updated and, if so, updates it.
+
+        Similarly to add_new_dataset(), update_if_necessary() will need to
+        take care of updating the dataset if required:
+        (1) Downloading and unarchiving relevant archives using datalad.download_url(link, archive=True)
+        (2) Adding new file links as symlinks using annex("addurl", link, "--fast", "--file", filename)
+        (3) Updating any tracker files used to determine whether the dataset needs to be updated
+
+        There is no need to save/push/publish/create a pull request,
+        as those are done after this function has finished.
+
+        Parameters:
+            dataset_description (dict): Dictionary containing information on the
+                                        dataset retrieved from the platform.
+                                        Element of the list returned by
+                                        get_all_dataset_description.
+            dataset_dir (str): Directory path of the
+                               previously created datalad dataset
+
+        Returns:
+            bool: True if the dataset was modified, False otherwise
+        """
+        return False
+
+    @abc.abstractmethod
+    def get_readme_content(self, dataset_description):
+        """
+        Returns the content of the README.md in markdown.
+
+        Given the dataset description provided by
+        get_all_dataset_description(), return the content of the
+        README.md in markdown.
+
+        Parameters:
+            dataset_description (dict): Dictionary containing information on the
+                                        dataset retrieved from the platform.
+                                        Element of the list returned by
+                                        get_all_dataset_description.
+
+        Returns:
+            string: Content of the README.md
+        """
+        return ""
+
+    def run(self):
+        """
+        DO NOT OVERRIDE THIS METHOD
+        This method is the entry point for all Crawler classes.
+        It loops through the dataset descriptions collected from get_all_dataset_description(),
+        verifies whether each of those datasets is present locally, and creates a new dataset with add_new_dataset()
+        if not.
If dataset is already existing locally, verify if dataset needs updating + with update_if_necessary() and update if so + """ + dataset_description_list = self.get_all_dataset_description() + for dataset_description in dataset_description_list: + clean_title = self._clean_dataset_title(dataset_description["title"]) + branch_name = "conp-bot/" + clean_title + dataset_dir = os.path.join(self.basedir, "projects", clean_title) + d = self.datalad.Dataset(dataset_dir) + if branch_name not in self.repo.remotes.origin.refs: # New dataset + self.repo.git.checkout("-b", branch_name) + repo_title = ("conp-dataset-" + dataset_description["title"])[0:100] + try: + d.create() + r = d.create_sibling_github( + repo_title, + name="origin", + github_login=self.github_token, + github_passwd=self.github_token, + ) + except Exception as error: + # handle the exception + print("An exception occurred:", error) + + # Add github token to dataset origin remote url + try: + origin = self.repo.remote("origin") + origin_url = next(origin.urls) + if "@" not in origin_url: + origin.set_url( + origin_url.replace( + "https://", + "https://" + self.github_token + "@", + ), + ) + except git.exc.NoSuchPathError: + pass + + self._add_github_repo_description(repo_title, dataset_description) + for pattern in NO_ANNEX_FILE_PATTERNS: + d.no_annex(pattern) + self.add_new_dataset(dataset_description, dataset_dir) + + # Create DATS.json if it exists in directory and 1 level deep subdir + dats_path: str = os.path.join(dataset_dir, "DATS.json") + if existing_dats_path := self._check_file_present( + dataset_dir, "dats.json" + ): + if self.verbose: + print(f"Found existing DATS.json at {existing_dats_path}") + if existing_dats_path != dats_path: + shutil.copy(existing_dats_path, dats_path) + self._add_source_data_submodule_if_derived_from_conp_dataset( + dats_path, dataset_dir + ) + else: + self._create_new_dats( + dataset_dir, + dats_path, + dataset_description, + d, + ) + # Move the logo into the root directory if found in 1 level deep subdir + logo_path = os.path.join(dataset_dir, "logo.png") + if existing_logo_path := self._check_file_present( + dataset_dir, "logo.png" + ): + if self.verbose: + print(f"Found logo at {existing_logo_path}") + if existing_logo_path != logo_path: + os.rename(existing_logo_path, logo_path) + + # Create README.md if it doesn't exist + if not os.path.isfile(os.path.join(dataset_dir, "README.md")): + readme = self.get_readme_content(dataset_description) + self._create_readme(readme, os.path.join(dataset_dir, "README.md")) + d.save() + try: + d.publish(to="origin") + self.repo.git.submodule( + "add", + r[0][1].replace(self.github_token + "@", ""), + dataset_dir, + ) + except Exception as e: + print(f"Skipping publication due to an error: {e}") + modified = True + commit_msg = "Created " + dataset_description["title"] + else: # Dataset already existing locally + self.repo.git.checkout("-f", branch_name) + try: + self.repo.git.merge("-n", "--no-verify", "master") + except Exception as e: + print(f"Error while merging master into {branch_name}: {e}") + print("Skipping this dataset") + self.repo.git.merge("--abort") + self.repo.git.checkout("-f", "master") + continue + + modified = self.update_if_necessary(dataset_description, dataset_dir) + if modified: + # Create DATS.json if it exists in directory and 1 level deep subdir + dats_path: str = os.path.join(dataset_dir, "DATS.json") + if existing_dats_path := self._check_file_present( + dataset_dir, "dats.json" + ): + if self.verbose: + print(f"Found existing 
DATS.json at {existing_dats_path}") + if existing_dats_path != dats_path: + os.rename(existing_dats_path, dats_path) + self._add_source_data_submodule_if_derived_from_conp_dataset( + dats_path, dataset_dir + ) + else: + self._create_new_dats( + dataset_dir, + dats_path, + dataset_description, + d, + ) + # Move the logo into the root directory if found in 1 level deep subdir + logo_path = os.path.join(dataset_dir, "logo.png") + if existing_logo_path := self._check_file_present( + dataset_dir, "logo.png" + ): + if self.verbose: + print(f"Found logo at {existing_logo_path}") + if existing_logo_path != logo_path: + os.rename(existing_logo_path, logo_path) + # Create README.md if it doesn't exist + if not os.path.isfile(os.path.join(dataset_dir, "README.md")): + readme = self.get_readme_content(dataset_description) + self._create_readme( + readme, + os.path.join(dataset_dir, "README.md"), + ) + d.save() + try: + d.publish(to="origin") + except Exception as e: + print(f"Skipping publication due to an error: {e}") + commit_msg = "Updated " + dataset_description["title"] + + # If modification detected in dataset, push to branch and create PR + if modified: + self._push_and_pull_request( + commit_msg, + dataset_dir, + dataset_description["title"], + ) + + # Go back to master + self.repo.git.checkout("master") + + def _add_github_repo_description(self, repo_title, dataset_description): + url = "https://api.github.com/repos/{}/{}".format( + self.username, + repo_title, + ) + head = {"Authorization": "token {}".format(self.github_token)} + description = "Please don't submit any PR to this repository. " + if "creators" in dataset_description.keys(): + description += ( + "If you want to request modifications, please contact " + f"{dataset_description['creators'][0]['name']}" + ) + payload = {"description": description} + r = requests.patch(url, data=json.dumps(payload), headers=head) + if not r.ok: + print( + "Problem adding description to repository {}:".format(repo_title), + ) + print(r.content) + + def _check_requirements(self): + # GitHub user must have a fork of https://github.com/CONP-PCNO/conp-dataset + # Script must be run in the directory of a local clone of this fork + # Git remote 'origin' of local Git clone must point to that fork + # Local Git clone must be set to branch 'master' + if "origin" not in self.repo.remotes: + raise Exception("Remote 'origin' does not exist in current reposition") + origin_url = next(self.repo.remote("origin").urls) + full_name = re.search("github.com[/,:](.*).git", origin_url).group(1) + r = requests.get("http://api.github.com/repos/" + full_name).json() + if not r["fork"] or r["parent"]["full_name"] != "CONP-PCNO/conp-dataset": + raise Exception("Current repository not a fork of CONP-PCNO/conp-dataset") + branch = self.repo.active_branch.name + if branch != "master": + raise Exception("Local git clone active branch not set to 'master'") + + # Return username + return full_name.split("/")[0] + + def _push_and_pull_request(self, msg, dataset_dir, title): + self.repo.git.add(dataset_dir) + self.repo.git.add(".gitmodules") + self.repo.git.commit("-m", "[conp-bot] " + msg) + clean_title = self._clean_dataset_title(title) + origin = self.repo.remote("origin") + origin_url = next(origin.urls) + if "@" not in origin_url: + origin.set_url( + origin_url.replace("https://", "https://" + self.github_token + "@"), + ) + self.repo.git.push("--set-upstream", "origin", "conp-bot/" + clean_title) + + # Create PR + print("Creating PR for " + title) + if not self.no_pr: + r = 
requests.post( + "https://api.github.com/repos/CONP-PCNO/conp-dataset/pulls", + json={ + "title": "Crawler result ({})".format(title), + "body": """## Description +{} + +## Checklist + +Mandatory files and elements: +- [x] A `README.md` file, at the root of the dataset +- [x] A `DATS.json` file, at the root of the dataset +- [ ] If configuration is required (for instance to enable a special remote), + a `config.sh` script at the root of the dataset +- [x] A DOI (see instructions in [contribution guide] +(https://github.com/CONP-PCNO/conp-dataset/blob/master/.github/CONTRIBUTING.md), and corresponding badge in `README.md` + +Functional checks: +- [x] Dataset can be installed using DataLad, recursively if it has sub-datasets +- [x] Every data file has a URL +- [x] Every data file can be retrieved or requires authentication +- [ ] `DATS.json` is a valid DATs model +- [ ] If dataset is derived data, raw data is a sub-dataset +""".format( + msg + "\n", + ), + "head": self.username + ":conp-bot/" + clean_title, + "base": "master", + }, + headers={"Authorization": "token {}".format(self.github_token)}, + ) + if r.status_code != 201: + raise Exception("Error while creating pull request: " + r.text) + + def _clean_dataset_title(self, title): + return re.sub(r"\W|^(?=\d)", "_", title) + + def _create_new_dats(self, dataset_dir, dats_path, dataset, d): + # Helper recursive function + def retrieve_license_path_in_dir(dir, paths): + for f_name in os.listdir(dir): + f_path = os.path.join(dir, f_name) + if os.path.isdir(f_path): + retrieve_license_path_in_dir(f_path, paths) + continue + elif "license" not in f_name.lower(): + continue + elif os.path.islink(f_path): + d.get(f_path) + paths.append(f_path) + + # Check required properties + for field in REQUIRED_DATS_FIELDS: + if field not in dataset.keys(): + print( + "Warning: required property {} not found in dataset description".format( + field, + ), + ) + + # Add all dats properties from dataset description + data = {key: value for key, value in dataset.items() if key in DATS_FIELDS} + + # Check for license code in dataset if a license was not specified from the platform + if "licenses" not in data or ( + len(data["licenses"]) == 1 and data["licenses"][0]["name"].lower() == "none" + ): + # Collect all license file paths + license_f_paths = [] + retrieve_license_path_in_dir(dataset_dir, license_f_paths) + + # If found some license files, for each, check for first valid license code and add to DATS + if license_f_paths: + licenses = set() + for f_path in license_f_paths: + with open(f_path) as f: + text = f.read().lower() + for code in LICENSE_CODES: + if code.lower() in text: + licenses.add(code) + break + data["licenses"] = [{"name": code} for code in licenses] + + # Add file count + num = 0 + for file in os.listdir(dataset_dir): + file_path = os.path.join(dataset_dir, file) + if ( + file[0] == "." 
+ or file == "DATS.json" + or file == "README.md" + or file == "logo.png" + ): + continue + elif os.path.isdir(file_path): + num += sum([len(files) for r, d, files in os.walk(file_path)]) + else: + num += 1 + if "extraProperties" not in data.keys(): + data["extraProperties"] = [ + {"category": "files", "values": [{"value": str(num)}]}, + ] + else: + data["extraProperties"].append( + {"category": "files", "values": [{"value": str(num)}]}, + ) + + # Retrieve modalities from files + file_paths = map( + lambda x: x.split(" ")[-1], + filter( + lambda x: " " in x, + git.Repo(dataset_dir).git.annex("list").split("\n"), + ), + ) # Get file paths + file_names = list( + map(lambda x: x.split("/")[-1] if "/" in x else x, file_paths), + ) # Get file names from path + modalities = {self._guess_modality(file_name) for file_name in file_names} + if len(modalities) == 0: + modalities.add("unknown") + elif len(modalities) > 1 and "unknown" in modalities: + modalities.remove("unknown") + if "types" not in data.keys(): + data["types"] = [{"value": modality} for modality in modalities] + else: + for modality in modalities: + data["types"].append({"value": modality}) + + # Create file + with open(dats_path, "w") as f: + json.dump(data, f, indent=4) + + def _guess_modality(self, file_name): + # Associate file types to substrings found in the file name + for m in MODALITIES: + for s in MODALITIES[m]: + if s in file_name: + return m + return "unknown" + + def _create_readme(self, content, path): + with open(path, "w") as f: + f.write(content) + + def _check_file_present(self, directory, filename): + for file_name in os.listdir(directory): + file_path: str = os.path.join(directory, file_name) + if os.path.isdir(file_path): + for subfile_name in os.listdir(file_path): + if subfile_name.lower() == filename.lower(): + return os.path.join(file_path, subfile_name) + elif file_name.lower() == filename.lower(): + return file_path + + def _add_source_data_submodule_if_derived_from_conp_dataset( + self, dats_json, dataset_dir + ): + with open(dats_json) as f: + metadata = json.loads(f.read()) + + source_dataset_link = None + source_dataset_id = None + if "extraProperties" not in metadata.keys(): + return + for property in metadata["extraProperties"]: + if property["category"] == "derivedFrom": + try: + source_dataset_link = property["values"][0]["value"] + except (KeyError, IndexError): + continue + if property["category"] == "parent_dataset_id": + try: + source_dataset_id = property["values"][0]["value"] + except (KeyError, IndexError): + continue + + if source_dataset_link is not None and "github.com" in source_dataset_link: + d = self.datalad.Dataset(os.path.join(dataset_dir, source_dataset_id)) + d.create() diff --git a/scripts/Crawlers/OSFCrawlerTest.py b/scripts/Crawlers/OSFCrawlerTest.py new file mode 100644 index 000000000..5bb16d935 --- /dev/null +++ b/scripts/Crawlers/OSFCrawlerTest.py @@ -0,0 +1,661 @@ +import datetime +import json +import os +import time +from typing import Any +from typing import Callable +from typing import Dict +from typing import List +from typing import Optional + +import humanize +import requests +from datalad.distribution.dataset import Dataset +from datalad.support.exceptions import IncompleteResultsError +from git import Repo +from requests.exceptions import HTTPError + +from scripts.Crawlers.BaseCrawlerTest import BaseCrawler + + +def _create_osf_tracker(path, dataset): + with open(path, "w") as f: + data = { + "version": dataset["version"], + "title": dataset["title"], + } + 
json.dump(data, f, indent=4)
+
+
+class OSFCrawler(BaseCrawler):
+    def __init__(self, github_token, config_path, verbose, force, no_pr, basedir):
+        super().__init__(github_token, config_path, verbose, force, no_pr, basedir)
+        self.osf_token = self._get_token()
+
+    def _get_token(self):
+        if os.path.isfile(self.config_path):
+            with open(self.config_path) as f:
+                data = json.load(f)
+                if "osf_token" in data.keys():
+                    return data["osf_token"]
+
+    def _get_request_with_bearer_token(self, link, redirect=True, retries=5):
+        header = {"Authorization": f"Bearer {self.osf_token}"}
+        attempt = 0
+        while attempt < retries:
+            try:
+                r = requests.get(link, headers=header, allow_redirects=redirect)
+                r.raise_for_status()  # Raises an exception for 4xx and 5xx responses
+                return r  # Return the response if everything went well
+            except HTTPError as http_err:
+                print(f"HTTP error occurred: {http_err} - Response: {r.text}")
+                if r.status_code == 503:  # Handle 503 errors specifically
+                    print(
+                        f"Request to {r.url} failed with 503 Service Unavailable, retrying..."
+                    )
+                    attempt += 1
+                    time.sleep(2**attempt)  # Exponential backoff
+                    continue
+                if r.status_code == 502:  # Handle 502 errors specifically
+                    print(
+                        f"Request to {r.url} failed with 502 Bad Gateway, skipping download."
+                    )
+                    return None  # Return None so the calling code can continue
+                else:
+                    raise Exception(
+                        f"HTTP error occurred: {http_err} - {r.status_code}"
+                    )  # Re-raise for any other HTTP error
+            except Exception as err:
+                raise Exception(f"An error occurred: {err}")
+
+    def _query_osf(self):
+        query = "https://api.osf.io/v2/nodes/?filter[tags]=canadian-open-neuroscience-platform"
+        r_json = self._get_request_with_bearer_token(query).json()
+        results = r_json["data"]
+
+        # Retrieve results from other pages
+        if r_json["links"]["meta"]["total"] > r_json["links"]["meta"]["per_page"]:
+            next_page = r_json["links"]["next"]
+            while next_page is not None:
+                next_page_json = self._get_request_with_bearer_token(next_page).json()
+                results.extend(next_page_json["data"])
+                next_page = next_page_json["links"]["next"]
+
+        if self.verbose:
+            print("OSF query: {}".format(query))
+        return results
+
+    def _download_files(
+        self,
+        link,
+        current_dir,
+        inner_path,
+        d,
+        annex,
+        sizes,
+        is_private=False,
+    ):
+        response = self._get_request_with_bearer_token(link)
+        if response is None:
+            print(f"Skipping download for {link} due to a failed request.")
+            return
+        print("first download", response)
+        r_json = response.json()
+        files = r_json["data"]
+
+        # Retrieve the files in the other pages if there is more than one page
+        if (
+            "links" in r_json.keys()
+            and r_json["links"]["meta"]["total"] > r_json["links"]["meta"]["per_page"]
+        ):
+            print("Retrieving next pages of files")
+            next_page = r_json["links"]["next"]
+            while next_page is not None:
+                response = self._get_request_with_bearer_token(next_page)
+                if response is None:
+                    print(f"Skipping page {next_page} due to a failed request.")
+                    break
+
+                next_page_json = response.json()
+                files.extend(next_page_json["data"])
+                next_page = next_page_json["links"]["next"]
+
+        for file in files:
+            # Handle folders
+            if file["attributes"]["kind"] == "folder":
+                folder_path = os.path.join(current_dir, file["attributes"]["name"])
+                # Conditions added by Alex
+                if not os.path.exists(folder_path):
+                    os.mkdir(folder_path)
+                    self._download_files(
+                        file["relationships"]["files"]["links"]["related"]["href"],
+                        folder_path,
+                        os.path.join(inner_path, file["attributes"]["name"]),
+                        d,
+                        annex,
+                        sizes,
+                        is_private,
+                    )
+                else:
+                    print(f"The folder {folder_path} already exists.")
+
+            # Handle single files
+            elif file["attributes"]["kind"] == "file":
+                try:
+                    # Private dataset/files
+                    if is_private:
+                        correct_download_link = self._get_request_with_bearer_token(
+                            file["links"]["download"],
+                            redirect=False,
+                        )
+                        if correct_download_link is not None:
+                            correct_download_link = correct_download_link.headers[
+                                "location"
+                            ]
+                            if (
+                                "https://accounts.osf.io/login"
+                                not in correct_download_link
+                            ):
+                                zip_file = (
+                                    True
+                                    if file["attributes"]["name"].split(".")[-1]
+                                    == "zip"
+                                    else False
+                                )
+                                d.download_url(
+                                    correct_download_link,
+                                    path=os.path.join(inner_path, ""),
+                                    archive=zip_file,
+                                )
+                            else:  # Token did not work for downloading file, return
+                                file = file["links"]["download"]
+                                print(
+                                    f"Unable to download file {file} with current token, skipping file",
+                                )
+                                return
+
+                    # Public file
+                    else:
+                        # Handle zip files
+                        if file["attributes"]["name"].split(".")[-1] == "zip":
+                            d.download_url(
+                                file["links"]["download"],
+                                path=os.path.join(inner_path, ""),
+                                archive=True,
+                            )
+                        else:
+                            d.download_url(
+                                file["links"]["download"],
+                                path=os.path.join(inner_path, ""),
+                            )
+
+                except IncompleteResultsError as e:
+                    print(
+                        f"Skipping file {file['links']['download']} due to error: {e}"
+                    )
+                    continue  # Skip this file and move on to the next one
+
+                # append the size of the downloaded file to the sizes array
+                file_size = file["attributes"]["size"]
+                if not file_size:
+                    # if the file size cannot be found in the OSF API response, then get it from git annex info
+                    inner_file_path = os.path.join(
+                        inner_path,
+                        file["attributes"]["name"],
+                    )
+                    annex_info_dict = json.loads(
+                        annex("info", "--bytes", "--json", inner_file_path),
+                    )
+                    file_size = int(annex_info_dict.get("size", 0))
+                sizes.append(file_size)
+
+    def _download_components(
+        self,
+        components_list,
+        current_dir,
+        inner_path,
+        d,
+        annex,
+        dataset_size,
+        is_private,
+    ):
+        # Loop through each available component and download its files
+        for component in components_list:
+            component_title = self._clean_dataset_title(
+                component["attributes"]["title"],
+            )
+            component_inner_path = os.path.join(
+                inner_path,
+                "components",
+                component_title,
+            )
+            os.makedirs(os.path.join(current_dir, component_inner_path))
+            self._download_files(
+                component["relationships"]["files"]["links"]["related"]["href"],
+                os.path.join(current_dir, component_inner_path),
+                component_inner_path,
+                d,
+                annex,
+                dataset_size,
+                is_private,
+            )
+
+            # Check if the component contains (sub)components; if so, download the (sub)components' data
+            subcomponents_list = self._get_components(
+                component["relationships"]["children"]["links"]["related"]["href"],
+            )
+            if subcomponents_list:
+                self._download_components(
+                    subcomponents_list,
+                    current_dir,
+                    os.path.join(component_inner_path),
+                    d,
+                    annex,
+                    dataset_size,
+                    is_private,
+                )
+
+        # Once all component files have been downloaded, remove any empty directories
+        # (in case the 'OSF parent' dataset did not have any downloadable files)
+        list_of_empty_dirs = [
+            dirpath
+            for (dirpath, dirnames, filenames) in os.walk(current_dir)
+            if len(dirnames) == 0 and len(filenames) == 0
+        ]
+        for empty_dir in list_of_empty_dirs:
+            os.rmdir(empty_dir)
+
+    def _get_contributors(self, link):
+        r = self._get_request_with_bearer_token(link)
+        contributors = [
+            contributor["embeds"]["users"]["data"]["attributes"]["full_name"]
+            for contributor in r.json()["data"]
+        ]
+        return contributors
+
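As a reading aid, here is the rough shape of the OSF JSON:API payload that _get_contributors() above unwraps, reconstructed from the keys it indexes; the values are invented and only the nesting is meaningful.

```python
# Invented example payload: only the key nesting mirrors what _get_contributors()
# expects from the OSF contributors endpoint (with embedded user records).
sample_response = {
    "data": [
        {
            "embeds": {
                "users": {
                    "data": {"attributes": {"full_name": "Jane Example"}},
                },
            },
        },
    ],
}

names = [
    contributor["embeds"]["users"]["data"]["attributes"]["full_name"]
    for contributor in sample_response["data"]
]
assert names == ["Jane Example"]
```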
def _get_license(self, link): + r = self._get_request_with_bearer_token(link) + return r.json()["data"]["attributes"]["name"] + + def _get_components(self, link): + r = self._get_request_with_bearer_token(link) + return r.json()["data"] + + def _get_wiki(self, link) -> Optional[str]: + r = self._get_request_with_bearer_token(link) + data = r.json()["data"] + if len(data) > 0: + return self._get_request_with_bearer_token( + data[0]["links"]["download"] + ).content.decode() + + def _get_institutions(self, link): + r = self._get_request_with_bearer_token(link) + if r.json()["data"]: + institutions = [ + institution["attributes"]["name"] for institution in r.json()["data"] + ] + return institutions + + def _get_identifier(self, link): + r = self._get_request_with_bearer_token(link) + return r.json()["data"][0]["attributes"]["value"] if r.json()["data"] else False + + def get_all_dataset_description(self): + osf_dois = [] + datasets = self._query_osf() + for dataset in datasets: + # skip datasets that have a parent since the files' components will + # go into the parent dataset. + # print("parent" in dataset["relationships"].keys()) + if "parent" in dataset["relationships"].keys(): + print(dataset["relationships"]["parent"]) + # continue + + attributes = dataset["attributes"] + + # Retrieve keywords/tags + keywords = list(map(lambda x: {"value": x}, attributes["tags"])) + + # Retrieve contributors/creators + contributors = self._get_contributors( + dataset["relationships"]["contributors"]["links"]["related"]["href"], + ) + + # Retrieve license + license_ = "None" + if "license" in dataset["relationships"].keys(): + license_ = self._get_license( + dataset["relationships"]["license"]["links"]["related"]["href"], + ) + + # Retrieve institution information + institutions = self._get_institutions( + dataset["relationships"]["affiliated_institutions"]["links"]["related"][ + "href" + ], + ) + + # Retrieve identifier information + identifier = self._get_identifier( + dataset["relationships"]["identifiers"]["links"]["related"]["href"], + ) + + # Get link for the dataset files + files_link = dataset["relationships"]["files"]["links"]["related"]["href"] + + # Get components list + components_list = self._get_components( + dataset["relationships"]["children"]["links"]["related"]["href"], + ) + + # Get wiki to put in README + wiki: Optional[str] = None + try: + wiki = self._get_wiki( + dataset["relationships"]["wikis"]["links"]["related"]["href"] + ) + except Exception as e: + print(f'Error getting wiki for {attributes["title"]} because of {e}') + + # Gather extra properties + extra_properties = [ + { + "category": "logo", + "values": [ + { + "value": "https://osf.io/static/img/institutions/shields/cos-shield.png", + }, + ], + }, + ] + if institutions: + extra_properties.append( + { + "category": "origin_institution", + "values": list( + map(lambda x: {"value": x}, institutions), + ), + }, + ) + + # Retrieve dates + date_created = datetime.datetime.strptime( + attributes["date_created"], + "%Y-%m-%dT%H:%M:%S.%f", + ) + date_modified = datetime.datetime.strptime( + attributes["date_modified"], + "%Y-%m-%dT%H:%M:%S.%f", + ) + + dataset_dats_content = { + "title": attributes["title"], + "files": files_link, + "components_list": components_list, + "homepage": dataset["links"]["html"], + "creators": list( + map(lambda x: {"name": x}, contributors), + ), + "description": attributes["description"], + "wiki": wiki, + "version": attributes["date_modified"], + "licenses": [ + { + "name": license_, + }, + ], + 
"dates": [ + { + "date": date_created.strftime("%Y-%m-%d %H:%M:%S"), + "type": { + "value": "date created", + }, + }, + { + "date": date_modified.strftime("%Y-%m-%d %H:%M:%S"), + "type": { + "value": "date modified", + }, + }, + ], + "keywords": keywords, + "distributions": [ + { + "size": 0, + "unit": {"value": "B"}, + "access": { + "landingPage": dataset["links"]["html"], + "authorizations": [ + { + "value": "public" + if attributes["public"] + else "private", + }, + ], + }, + }, + ], + "extraProperties": extra_properties, + } + + if identifier: + source = "OSF DOI" if "OSF.IO" in identifier else "DOI" + dataset_dats_content["identifier"] = { + "identifier": identifier, + "identifierSource": source, + } + + osf_dois.append(dataset_dats_content) + + if self.verbose: + print("Retrieved OSF DOIs: ") + for osf_doi in osf_dois: + print( + "- Title: {}, Last modified: {}".format( + osf_doi["title"], + osf_doi["version"], + ), + ) + + return osf_dois + + def add_new_dataset(self, dataset: Dict[str, Any], dataset_dir: str): + d: Dataset = self.datalad.Dataset(dataset_dir) + d.no_annex(".conp-osf-crawler.json") + d.save() + annex: Callable = Repo(dataset_dir).git.annex + dataset_size: List[int] = [] + + # Setup private OSF dataset if the dataset is private + is_private: bool = self._setup_private_dataset( + dataset["files"], + dataset_dir, + annex, + d, + ) + self._download_files( + dataset["files"], + dataset_dir, + "", + d, + annex, + dataset_size, + is_private, + ) + if dataset["components_list"]: + self._download_components( + dataset["components_list"], + dataset_dir, + "", + d, + annex, + dataset_size, + is_private, + ) + dataset_size_num, dataset_unit = humanize.naturalsize(sum(dataset_size)).split( + " ", + ) + dataset["distributions"][0]["size"] = float(dataset_size_num) + dataset["distributions"][0]["unit"]["value"] = dataset_unit + + # Add .conp-osf-crawler.json tracker file + _create_osf_tracker( + os.path.join(dataset_dir, ".conp-osf-crawler.json"), + dataset, + ) + # Tenter de publier sur le remote 'origin' + try: + d.publish(to="origin") + except IncompleteResultsError as e: + print(f"Skipping publication due to error: {e}") + except Exception as e: + print(f"An unexpected error occurred during publication: {e}") + + def update_if_necessary(self, dataset_description, dataset_dir): + tracker_path = os.path.join(dataset_dir, ".conp-osf-crawler.json") + if not os.path.isfile(tracker_path): + print("{} does not exist in dataset, skipping".format(tracker_path)) + return False + with open(tracker_path) as f: + tracker = json.load(f) + if tracker["version"] == dataset_description["version"]: + # Same version, no need to update + if self.verbose: + print( + "{}, version {} same as OSF version DOI ({}), no need to update".format( + dataset_description["title"], + dataset_description["version"], + tracker["version"], + ), + ) + return False + + # Update dataset + if self.verbose: + print( + "{}, version {} different from OSF version DOI {}, updating".format( + dataset_description["title"], + tracker["version"], + dataset_description["version"], + ), + ) + + # Remove all data and DATS.json files + for file_name in os.listdir(dataset_dir): + if file_name[0] == ".": + continue + self.datalad.remove(os.path.join(dataset_dir, file_name), check=False) + + d = self.datalad.Dataset(dataset_dir) + annex = Repo(dataset_dir).git.annex + + dataset_size = [] + is_private: bool = self._is_private_dataset(dataset_description["files"]) + self._download_files( + dataset_description["files"], + dataset_dir, + 
"", + d, + annex, + dataset_size, + is_private, + ) + if dataset_description["components_list"]: + self._download_components( + dataset_description["components_list"], + dataset_dir, + "", + d, + annex, + dataset_size, + is_private, + ) + dataset_size, dataset_unit = humanize.naturalsize(sum(dataset_size)).split( + " ", + ) + dataset_description["distributions"][0]["size"] = float(dataset_size) + dataset_description["distributions"][0]["unit"]["value"] = dataset_unit + + # Add .conp-osf-crawler.json tracker file + _create_osf_tracker( + os.path.join(dataset_dir, ".conp-osf-crawler.json"), + dataset_description, + ) + + return True + + def get_readme_content(self, dataset): + readme_content = ( + f'# {dataset["title"]}\n\nCrawled from [OSF]({dataset["homepage"]})' + ) + + if "description" in dataset and dataset["description"]: + readme_content += f'\n\n## Description\n\n{dataset["description"]}' + + if "identifier" in dataset and dataset["identifier"]: + readme_content += f'\n\n## DOI: {dataset["identifier"]["identifier"]}' + + if "wiki" in dataset and dataset["wiki"]: + readme_content += f'\n\n## WIKI\n\n{dataset["wiki"]}' + + return readme_content + + def _setup_private_dataset( + self, + files_url: str, + dataset_dir: str, + annex: Callable, + dataset: Dataset, + ) -> bool: + # Check if the dataset is indeed private + if self._is_private_dataset(files_url): + if self.verbose: + print( + "Dataset is private, creating OSF provider and make git annex autoenable datalad remote", + ) + + # Create OSF provider file and needed directories and don't annex the file + datalad_dir: str = os.path.join(dataset_dir, ".datalad") + if not os.path.exists(datalad_dir): + os.mkdir(datalad_dir) + providers_dir: str = os.path.join(datalad_dir, "providers") + if not os.path.exists(providers_dir): + os.mkdir(providers_dir) + osf_config_path: str = os.path.join(providers_dir, "OSF.cfg") + with open(osf_config_path, "w") as f: + f.write( + """[provider:OSF] +url_re = .*osf\\.io.* +authentication_type = bearer_token +credential = OSF + +[credential:OSF] +# If known, specify URL or email to how/where to request credentials +# url = ??? 
+type = token""" + ) + dataset.no_annex(os.path.join("**", "OSF.cfg")) + + # Make git annex autoenable datalad remote + annex( + "initremote", + "datalad", + "externaltype=datalad", + "type=external", + "encryption=none", + "autoenable=true", + ) + + # Set OSF token as a environment variable for authentication + os.environ["DATALAD_OSF_token"] = self.osf_token + + # Save changes + dataset.save() + + return True + + return False + + def _is_private_dataset(self, files_url) -> bool: + return True if requests.get(files_url).status_code == 401 else False diff --git a/scripts/Crawlers/ZenodoCrawler.py b/scripts/Crawlers/ZenodoCrawler.py index 8878b8e82..4a8c8f44e 100644 --- a/scripts/Crawlers/ZenodoCrawler.py +++ b/scripts/Crawlers/ZenodoCrawler.py @@ -17,10 +17,10 @@ def _create_zenodo_tracker(path, dataset): with open(path, "w") as f: data = { "zenodo": { - "concept_doi": dataset["concept_doi"], - "version": dataset["latest_version"], + "concept_doi": dataset.get("concept_doi"), + "version": dataset.get("latest_version"), }, - "title": dataset["title"], + "title": dataset.get("title"), } json.dump(data, f, indent=4) @@ -39,7 +39,7 @@ def _get_tokens(self): with open(self.config_path) as f: data = json.load(f) if "zenodo_tokens" in data.keys(): - return data["zenodo_tokens"] + return data.get("zenodo_tokens") else: return {} @@ -50,9 +50,9 @@ def _query_zenodo(self): 'q=keywords:"canadian-open-neuroscience-platform"' ) r_json = requests.get(query).json() - results = r_json["hits"]["hits"] + results = r_json.get("hits", {}).get("hits") - if r_json["links"]["next"]: + if r_json and r_json.get("links", {}).get("next"): next_page = r_json["links"]["next"] while next_page is not None: next_page_json = requests.get(next_page).json() @@ -77,7 +77,7 @@ def _download_file(self, bucket, d, is_private): file_size: int = bucket.get("size", 0) if self.verbose: print(f"Downloading {link} as {file_name} of size {file_size}") - d.download_url(link, archive=True if bucket["type"] == "zip" else False) + d.download_url(link, archive=True if bucket.get("type") == "zip" else False) def get_all_dataset_description(self): zenodo_dois = [] @@ -124,21 +124,25 @@ def get_all_dataset_description(self): for bucket in dataset["files"]: files.append(bucket) - latest_version_doi = metadata["relations"]["version"][0]["last_child"][ - "pid_value" - ] + latest_version_doi = None + version = metadata.get("relations", {}).get("version", []) + if len(version): + latest_version_doi = version[0].get("last_child", {}).get("pid_value") # Retrieve and clean file formats/extensions file_formats = ( - list(set(map(lambda x: x["type"], files))) if len(files) > 0 else None + list(set(map(lambda x: os.path.splitext(x.get("key"))[1][1:], files))) + if len(files) > 0 + else [] ) + if "" in file_formats: file_formats.remove("") # Retrieve and clean file keywords keywords = [] if "keywords" in metadata.keys(): - keywords = list(map(lambda x: {"value": x}, metadata["keywords"])) + keywords = list(map(lambda x: {"value": x}, metadata.get("keywords"))) # Retrieve subject annotations from Zenodo and clean the annotated # subjects to insert in isAbout of DATS file @@ -231,12 +235,12 @@ def get_all_dataset_description(self): file_format.upper() for file_format in file_formats # Do not modify specific file formats. 
- if file_formats not in ["NIfTI", "BigWig"] + if file_format not in ["NIfTI", "BigWig"] ], "size": dataset_size, "unit": {"value": dataset_unit}, "access": { - "landingPage": dataset["links"]["html"], + "landingPage": dataset.get("links", {}).get("html"), "authorizations": [ { "value": "public"