From 510bbc82ca374048d055153c524bc3b6512d013f Mon Sep 17 00:00:00 2001
From: Andrey Fedorov <andrey.fedorov@gmail.com>
Date: Fri, 4 Oct 2024 14:01:52 -0400
Subject: [PATCH 1/2] ENH: support download by crdc_series_uuid

Aims to resolve #117
---
 idc_index/cli.py   | 20 ++++++++++++-
 idc_index/index.py | 71 ++++++++++++++++++++++++++++++++++++----------
 tests/idcindex.py  |  9 ++++++
 3 files changed, 84 insertions(+), 16 deletions(-)

diff --git a/idc_index/cli.py b/idc_index/cli.py
index 3171276b..011227f8 100644
--- a/idc_index/cli.py
+++ b/idc_index/cli.py
@@ -83,6 +83,13 @@ def set_log_level(log_level):
     default=None,
     help="DICOM SeriesInstanceUID(s) to filter by.",
 )
+@click.option(
+    "--crdc-series-uuid",
+    type=str,
+    multiple=True,
+    default=None,
+    help="crdc_series_uuid(s) to filter by.",
+)
 @click.option(
     "--quiet",
     type=bool,
@@ -122,6 +129,7 @@ def download_from_selection(
     patient_id,
     study_instance_uid,
     series_instance_uid,
+    crdc_series_uuid,
     quiet,
     show_progress_bar,
     use_s5cmd_sync,
@@ -159,11 +167,17 @@ def download_from_selection(
         if series_instance_uid
         else None
     )
+    crdc_series_uuid = (
+        [uid.strip() for uid in (",".join(crdc_series_uuid)).split(",")]
+        if crdc_series_uuid
+        else None
+    )
     logger_cli.debug("Inputs received from cli download:")
     logger_cli.debug(f"collection_id: {collection_id}")
     logger_cli.debug(f"patient_id: {patient_id}")
     logger_cli.debug(f"study_instance_uid: {study_instance_uid}")
     logger_cli.debug(f"series_instance_uid: {series_instance_uid}")
+    logger_cli.debug(f"crdc_series_uuid: {crdc_series_uuid}")
     logger_cli.debug(f"dry_run: {dry_run}")
     logger_cli.debug(f"quiet: {quiet}")
     logger_cli.debug(f"show_progress_bar: {show_progress_bar}")
@@ -177,6 +191,7 @@ def download_from_selection(
         patientId=patient_id,
         studyInstanceUID=study_instance_uid,
         seriesInstanceUID=series_instance_uid,
+        crdc_series_uuid=crdc_series_uuid,
         quiet=quiet,
         show_progress_bar=show_progress_bar,
         use_s5cmd_sync=use_s5cmd_sync,
@@ -346,9 +361,12 @@ def check_and_download(column_name, item_ids, download_dir, kwarg_name):
         matches_found += check_and_download(
             "SeriesInstanceUID", item_ids, download_dir, "seriesInstanceUID"
         )
+        matches_found += check_and_download(
+            "crdc_series_uuid", item_ids, download_dir, "crdc_series_uuid"
+        )
         if not matches_found:
             logger_cli.error(
-                "None of the values passed matched any of the identifiers: collection_id, PatientID, StudyInstanceUID, SeriesInstanceUID."
+                "None of the values passed matched any of the identifiers: collection_id, PatientID, StudyInstanceUID, SeriesInstanceUID, crdc_series_uuid."
             )
 
 
diff --git a/idc_index/index.py b/idc_index/index.py
index ce44f696..410476d4 100644
--- a/idc_index/index.py
+++ b/idc_index/index.py
@@ -64,6 +64,15 @@ def __init__(self):
         logger.debug(f"Reading index file v{idc_index_data.__version__}")
         self.index = pd.read_parquet(file_path)
 
+        # initialize crdc_series_uuid for the index
+        # TODO: in the future, after https://github.com/ImagingDataCommons/idc-index/pull/113
+        # is merged (to minimize disruption), it will make more sense to change
+        # idc-index-data to separate bucket from crdc_series_uuid, add support for GCP,
+        # and consequently simplify the code here
+        self.index["crdc_series_uuid"] = (
+            self.index["series_aws_url"].str.split("/").str[3]
+        )
+
         self.previous_versions_index_path = (
             idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH
         )
@@ -136,7 +145,12 @@ def _filter_dataframe_by_id(key, dataframe, _id):
 
     @staticmethod
     def _safe_filter_by_selection(
-        df_index, collection_id, patientId, studyInstanceUID, seriesInstanceUID
+        df_index,
+        collection_id,
+        patientId,
+        studyInstanceUID,
+        seriesInstanceUID,
+        crdc_series_uuid,
     ):
         if collection_id is not None:
             if not isinstance(collection_id, str) and not isinstance(
@@ -156,26 +170,48 @@ def _safe_filter_by_selection(
                 seriesInstanceUID, list
             ):
                 raise TypeError("seriesInstanceUID must be a string or list of strings")
-
-        if collection_id is not None:
-            result_df = IDCClient._filter_by_collection_id(df_index, collection_id)
-        else:
-            result_df = df_index
-
-        if patientId is not None:
-            result_df = IDCClient._filter_by_patient_id(result_df, patientId)
-
-        if studyInstanceUID is not None:
-            result_df = IDCClient._filter_by_dicom_study_uid(
-                result_df, studyInstanceUID
+        if crdc_series_uuid is not None:
+            if not isinstance(crdc_series_uuid, str) and not isinstance(
+                crdc_series_uuid, list
+            ):
+                raise TypeError("crdc_series_uuid must be a string or list of strings")
+
+        # Here we go down-up the hierarchy of filtering, taking into
+        # account the direction of one-to-many relationships
+        #   one crdc_series_uuid can be associated with one and only one SeriesInstanceUID
+        #   one SeriesInstanceUID can be associated with one and only one StudyInstanceUID
+        #   one StudyInstanceUID can be associated with one and only one PatientID
+        #   one PatientID can be associated with one and only one collection_id
+        # because of this we do not need to apply attributes above the given defined
+        # attribute in the hierarchy
+        # The earlier implemented behavior was a relic of the API from a different system
+        # that influenced the API of SlicerIDCIndex, and propagated into idc-index. Unfortunately.
+
+        if crdc_series_uuid is not None:
+            result_df = IDCClient._filter_dataframe_by_id(
+                "crdc_series_uuid", df_index, crdc_series_uuid
             )
+            return result_df
 
         if seriesInstanceUID is not None:
             result_df = IDCClient._filter_by_dicom_series_uid(
-                result_df, seriesInstanceUID
+                df_index, seriesInstanceUID
             )
+            return result_df
+
+        if studyInstanceUID is not None:
+            result_df = IDCClient._filter_by_dicom_study_uid(df_index, studyInstanceUID)
+            return result_df
+
+        if patientId is not None:
+            result_df = IDCClient._filter_by_patient_id(df_index, patientId)
+            return result_df
+
+        if collection_id is not None:
+            result_df = IDCClient._filter_by_collection_id(df_index, collection_id)
+            return result_df
 
-        return result_df
+        return None
 
     @staticmethod
     def _filter_by_collection_id(df_index, collection_id):
@@ -1399,6 +1435,7 @@ def citations_from_selection(
         patientId=None,
         studyInstanceUID=None,
         seriesInstanceUID=None,
+        crdc_series_uuid=None,
         citation_format=CITATION_FORMAT_APA,
     ):
         """Get the list of publications that should be cited/attributed for the specific collection, patient (case) ID, study or series UID.
@@ -1419,6 +1456,7 @@ def citations_from_selection(
             patientId=patientId,
             studyInstanceUID=studyInstanceUID,
             seriesInstanceUID=seriesInstanceUID,
+            crdc_series_uuid=crdc_series_uuid,
         )
 
         citations = []
@@ -1469,6 +1507,7 @@ def download_from_selection(
         patientId=None,
         studyInstanceUID=None,
         seriesInstanceUID=None,
+        crdc_series_uuid=None,
         quiet=True,
         show_progress_bar=True,
         use_s5cmd_sync=False,
@@ -1484,6 +1523,7 @@ def download_from_selection(
             patientId: string or list of strings containing the values of PatientID to filter by
             studyInstanceUID: string or list of strings containing the values of DICOM StudyInstanceUID to filter by
             seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by
+            crdc_series_uuid: string or list of strings containing the values of crdc_series_uuid to filter by
             quiet (bool): If True, suppresses the output of the subprocess. Defaults to True
             show_progress_bar (bool): If True, tracks the progress of download
             use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded
@@ -1499,6 +1539,7 @@ def download_from_selection(
             patientId=patientId,
             studyInstanceUID=studyInstanceUID,
             seriesInstanceUID=seriesInstanceUID,
+            crdc_series_uuid=crdc_series_uuid,
         )
 
         total_size = round(result_df["series_size_MB"].sum(), 2)
diff --git a/tests/idcindex.py b/tests/idcindex.py
index 73e9a1f3..4d12a71a 100644
--- a/tests/idcindex.py
+++ b/tests/idcindex.py
@@ -469,10 +469,19 @@ def test_cli_download(self):
         with runner.isolated_filesystem():
             result = runner.invoke(
                 self.download,
+                # StudyInstanceUID:
                 ["1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462"],
             )
             assert len(os.listdir(Path.cwd())) != 0
 
+        with runner.isolated_filesystem():
+            result = runner.invoke(
+                self.download,
+                # crdc_series_uuid:
+                ["e5c5c71d-62c4-4c50-a8a9-b6799c7f8dea"],
+            )
+            assert len(os.listdir(Path.cwd())) != 0
+
     def test_prior_version_manifest(self):
         # Define the values for each optional parameter
         quiet_values = [True, False]

From 51242bf055298681ac7d7a668f2aec6c2a17ae0b Mon Sep 17 00:00:00 2001
From: Andrey Fedorov <andrey.fedorov@gmail.com>
Date: Fri, 4 Oct 2024 16:21:04 -0400
Subject: [PATCH 2/2] ENH: look at prior versions index when crdc_series_uuid
 is used

---
 idc_index/index.py | 66 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 61 insertions(+), 5 deletions(-)

diff --git a/idc_index/index.py b/idc_index/index.py
index 410476d4..f4abc06d 100644
--- a/idc_index/index.py
+++ b/idc_index/index.py
@@ -76,6 +76,9 @@ def __init__(self):
         self.previous_versions_index_path = (
             idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH
         )
+        file_path = idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH
+
+        self.previous_versions_index = pd.read_parquet(file_path)
 
         # self.index = self.index.astype(str).replace("nan", "")
         self.index["series_size_MB"] = self.index["series_size_MB"].astype(float)
@@ -678,7 +681,28 @@ def _validate_update_manifest_and_get_download_size(
         manifest_df.columns = ["manifest_cp_cmd"]
 
         # create a copy of the index
-        index_df_copy = self.index
+        index_df_copy = self.index[
+            [
+                "SeriesInstanceUID",
+                "series_aws_url",
+                "series_size_MB",
+                "PatientID",
+                "collection_id",
+                "Modality",
+                "StudyInstanceUID",
+            ]
+        ]
+        previous_versions_index_df_copy = self.previous_versions_index[
+            [
+                "SeriesInstanceUID",
+                "series_aws_url",
+                "series_size_MB",
+                "PatientID",
+                "collection_id",
+                "Modality",
+                "StudyInstanceUID",
+            ]
+        ]
 
         # use default hierarchy
         if dirTemplate is not None:
@@ -773,7 +797,7 @@ def _validate_update_manifest_and_get_download_size(
                 series_size_MB,
                  {hierarchy} AS path,
             FROM
-                '{self.previous_versions_index_path}' pvip
+                previous_versions_index_df_copy pvip
 
             ),
             index_temp AS (
@@ -1435,7 +1459,6 @@ def citations_from_selection(
         patientId=None,
         studyInstanceUID=None,
         seriesInstanceUID=None,
-        crdc_series_uuid=None,
         citation_format=CITATION_FORMAT_APA,
     ):
         """Get the list of publications that should be cited/attributed for the specific collection, patient (case) ID, study or series UID.
@@ -1450,13 +1473,14 @@ def citations_from_selection(
         Returns:
             List of citations in the requested format.
         """
+
         result_df = self._safe_filter_by_selection(
             self.index,
             collection_id=collection_id,
             patientId=patientId,
             studyInstanceUID=studyInstanceUID,
             seriesInstanceUID=seriesInstanceUID,
-            crdc_series_uuid=crdc_series_uuid,
+            crdc_series_uuid=None,
         )
 
         citations = []
@@ -1533,8 +1557,40 @@ def download_from_selection(
 
         downloadDir = self._check_create_directory(downloadDir)
 
+        if crdc_series_uuid is not None:
+            download_df = pd.concat(
+                [
+                    self.index[
+                        [
+                            "PatientID",
+                            "collection_id",
+                            "Modality",
+                            "StudyInstanceUID",
+                            "SeriesInstanceUID",
+                            "crdc_series_uuid",
+                            "series_aws_url",
+                            "series_size_MB",
+                        ]
+                    ],
+                    self.previous_versions_index[
+                        [
+                            "PatientID",
+                            "collection_id",
+                            "Modality",
+                            "StudyInstanceUID",
+                            "SeriesInstanceUID",
+                            "crdc_series_uuid",
+                            "series_aws_url",
+                            "series_size_MB",
+                        ]
+                    ],
+                ],
+            )
+        else:
+            download_df = self.index
+
         result_df = self._safe_filter_by_selection(
-            self.index,
+            download_df,
             collection_id=collection_id,
             patientId=patientId,
             studyInstanceUID=studyInstanceUID,