From 510bbc82ca374048d055153c524bc3b6512d013f Mon Sep 17 00:00:00 2001 From: Andrey Fedorov Date: Fri, 4 Oct 2024 14:01:52 -0400 Subject: [PATCH 1/2] ENH: support download by crdc_series_uuid Aims to resolve #117 --- idc_index/cli.py | 20 ++++++++++++- idc_index/index.py | 71 ++++++++++++++++++++++++++++++++++++---------- tests/idcindex.py | 9 ++++++ 3 files changed, 84 insertions(+), 16 deletions(-) diff --git a/idc_index/cli.py b/idc_index/cli.py index 3171276b..011227f8 100644 --- a/idc_index/cli.py +++ b/idc_index/cli.py @@ -83,6 +83,13 @@ def set_log_level(log_level): default=None, help="DICOM SeriesInstanceUID(s) to filter by.", ) +@click.option( + "--crdc-series-uuid", + type=str, + multiple=True, + default=None, + help="crdc_series_uuid(s) to filter by.", +) @click.option( "--quiet", type=bool, @@ -122,6 +129,7 @@ def download_from_selection( patient_id, study_instance_uid, series_instance_uid, + crdc_series_uuid, quiet, show_progress_bar, use_s5cmd_sync, @@ -159,11 +167,17 @@ def download_from_selection( if series_instance_uid else None ) + crdc_series_uuid = ( + [uid.strip() for uid in (",".join(crdc_series_uuid)).split(",")] + if crdc_series_uuid + else None + ) logger_cli.debug("Inputs received from cli download:") logger_cli.debug(f"collection_id: {collection_id}") logger_cli.debug(f"patient_id: {patient_id}") logger_cli.debug(f"study_instance_uid: {study_instance_uid}") logger_cli.debug(f"series_instance_uid: {series_instance_uid}") + logger_cli.debug(f"crdc_series_uuid: {crdc_series_uuid}") logger_cli.debug(f"dry_run: {dry_run}") logger_cli.debug(f"quiet: {quiet}") logger_cli.debug(f"show_progress_bar: {show_progress_bar}") @@ -177,6 +191,7 @@ def download_from_selection( patientId=patient_id, studyInstanceUID=study_instance_uid, seriesInstanceUID=series_instance_uid, + crdc_series_uuid=crdc_series_uuid, quiet=quiet, show_progress_bar=show_progress_bar, use_s5cmd_sync=use_s5cmd_sync, @@ -346,9 +361,12 @@ def check_and_download(column_name, item_ids, download_dir, kwarg_name): matches_found += check_and_download( "SeriesInstanceUID", item_ids, download_dir, "seriesInstanceUID" ) + matches_found += check_and_download( + "crdc_series_uuid", item_ids, download_dir, "crdc_series_uuid" + ) if not matches_found: logger_cli.error( - "None of the values passed matched any of the identifiers: collection_id, PatientID, StudyInstanceUID, SeriesInstanceUID." + "None of the values passed matched any of the identifiers: collection_id, PatientID, StudyInstanceUID, SeriesInstanceUID, crdc_series_uuid." ) diff --git a/idc_index/index.py b/idc_index/index.py index ce44f696..410476d4 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -64,6 +64,15 @@ def __init__(self): logger.debug(f"Reading index file v{idc_index_data.__version__}") self.index = pd.read_parquet(file_path) + # initialize crdc_series_uuid for the index + # TODO: in the future, after https://github.com/ImagingDataCommons/idc-index/pull/113 + # is merged (to minimize disruption), it will make more sense to change + # idc-index-data to separate bucket from crdc_series_uuid, add support for GCP, + # and consequently simplify the code here + self.index["crdc_series_uuid"] = ( + self.index["series_aws_url"].str.split("/").str[3] + ) + self.previous_versions_index_path = ( idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH ) @@ -136,7 +145,12 @@ def _filter_dataframe_by_id(key, dataframe, _id): @staticmethod def _safe_filter_by_selection( - df_index, collection_id, patientId, studyInstanceUID, seriesInstanceUID + df_index, + collection_id, + patientId, + studyInstanceUID, + seriesInstanceUID, + crdc_series_uuid, ): if collection_id is not None: if not isinstance(collection_id, str) and not isinstance( @@ -156,26 +170,48 @@ def _safe_filter_by_selection( seriesInstanceUID, list ): raise TypeError("seriesInstanceUID must be a string or list of strings") - - if collection_id is not None: - result_df = IDCClient._filter_by_collection_id(df_index, collection_id) - else: - result_df = df_index - - if patientId is not None: - result_df = IDCClient._filter_by_patient_id(result_df, patientId) - - if studyInstanceUID is not None: - result_df = IDCClient._filter_by_dicom_study_uid( - result_df, studyInstanceUID + if crdc_series_uuid is not None: + if not isinstance(crdc_series_uuid, str) and not isinstance( + crdc_series_uuid, list + ): + raise TypeError("crdc_series_uuid must be a string or list of strings") + + # Here we go down-up the hierarchy of filtering, taking into + # account the direction of one-to-many relationships + # one crdc_series_uuid can be associated with one and only one SeriesInstanceUID + # one SeriesInstanceUID can be associated with one and only one StudyInstanceUID + # one StudyInstanceUID can be associated with one and only one PatientID + # one PatientID can be associated with one and only one collection_id + # because of this we do not need to apply attributes above the given defined + # attribute in the hierarchy + # The earlier implemented behavior was a relic of the API from a different system + # that influenced the API of SlicerIDCIndex, and propagated into idc-index. Unfortunately. + + if crdc_series_uuid is not None: + result_df = IDCClient._filter_dataframe_by_id( + "crdc_series_uuid", df_index, crdc_series_uuid ) + return result_df if seriesInstanceUID is not None: result_df = IDCClient._filter_by_dicom_series_uid( - result_df, seriesInstanceUID + df_index, seriesInstanceUID ) + return result_df + + if studyInstanceUID is not None: + result_df = IDCClient._filter_by_dicom_study_uid(df_index, studyInstanceUID) + return result_df + + if patientId is not None: + result_df = IDCClient._filter_by_patient_id(df_index, patientId) + return result_df + + if collection_id is not None: + result_df = IDCClient._filter_by_collection_id(df_index, collection_id) + return result_df - return result_df + return None @staticmethod def _filter_by_collection_id(df_index, collection_id): @@ -1399,6 +1435,7 @@ def citations_from_selection( patientId=None, studyInstanceUID=None, seriesInstanceUID=None, + crdc_series_uuid=None, citation_format=CITATION_FORMAT_APA, ): """Get the list of publications that should be cited/attributed for the specific collection, patient (case) ID, study or series UID. @@ -1419,6 +1456,7 @@ def citations_from_selection( patientId=patientId, studyInstanceUID=studyInstanceUID, seriesInstanceUID=seriesInstanceUID, + crdc_series_uuid=crdc_series_uuid, ) citations = [] @@ -1469,6 +1507,7 @@ def download_from_selection( patientId=None, studyInstanceUID=None, seriesInstanceUID=None, + crdc_series_uuid=None, quiet=True, show_progress_bar=True, use_s5cmd_sync=False, @@ -1484,6 +1523,7 @@ def download_from_selection( patientId: string or list of strings containing the values of PatientID to filter by studyInstanceUID: string or list of strings containing the values of DICOM StudyInstanceUID to filter by seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by + crdc_series_uuid: string or list of strings containing the values of crdc_series_uuid to filter by quiet (bool): If True, suppresses the output of the subprocess. Defaults to True show_progress_bar (bool): If True, tracks the progress of download use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded @@ -1499,6 +1539,7 @@ def download_from_selection( patientId=patientId, studyInstanceUID=studyInstanceUID, seriesInstanceUID=seriesInstanceUID, + crdc_series_uuid=crdc_series_uuid, ) total_size = round(result_df["series_size_MB"].sum(), 2) diff --git a/tests/idcindex.py b/tests/idcindex.py index 73e9a1f3..4d12a71a 100644 --- a/tests/idcindex.py +++ b/tests/idcindex.py @@ -469,10 +469,19 @@ def test_cli_download(self): with runner.isolated_filesystem(): result = runner.invoke( self.download, + # StudyInstanceUID: ["1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462"], ) assert len(os.listdir(Path.cwd())) != 0 + with runner.isolated_filesystem(): + result = runner.invoke( + self.download, + # crdc_series_uuid: + ["e5c5c71d-62c4-4c50-a8a9-b6799c7f8dea"], + ) + assert len(os.listdir(Path.cwd())) != 0 + def test_prior_version_manifest(self): # Define the values for each optional parameter quiet_values = [True, False] From 51242bf055298681ac7d7a668f2aec6c2a17ae0b Mon Sep 17 00:00:00 2001 From: Andrey Fedorov Date: Fri, 4 Oct 2024 16:21:04 -0400 Subject: [PATCH 2/2] ENH: look at prior versions index when crdc_series_uuid is used --- idc_index/index.py | 66 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 61 insertions(+), 5 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index 410476d4..f4abc06d 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -76,6 +76,9 @@ def __init__(self): self.previous_versions_index_path = ( idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH ) + file_path = idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH + + self.previous_versions_index = pd.read_parquet(file_path) # self.index = self.index.astype(str).replace("nan", "") self.index["series_size_MB"] = self.index["series_size_MB"].astype(float) @@ -678,7 +681,28 @@ def _validate_update_manifest_and_get_download_size( manifest_df.columns = ["manifest_cp_cmd"] # create a copy of the index - index_df_copy = self.index + index_df_copy = self.index[ + [ + "SeriesInstanceUID", + "series_aws_url", + "series_size_MB", + "PatientID", + "collection_id", + "Modality", + "StudyInstanceUID", + ] + ] + previous_versions_index_df_copy = self.previous_versions_index[ + [ + "SeriesInstanceUID", + "series_aws_url", + "series_size_MB", + "PatientID", + "collection_id", + "Modality", + "StudyInstanceUID", + ] + ] # use default hierarchy if dirTemplate is not None: @@ -773,7 +797,7 @@ def _validate_update_manifest_and_get_download_size( series_size_MB, {hierarchy} AS path, FROM - '{self.previous_versions_index_path}' pvip + previous_versions_index_df_copy pvip ), index_temp AS ( @@ -1435,7 +1459,6 @@ def citations_from_selection( patientId=None, studyInstanceUID=None, seriesInstanceUID=None, - crdc_series_uuid=None, citation_format=CITATION_FORMAT_APA, ): """Get the list of publications that should be cited/attributed for the specific collection, patient (case) ID, study or series UID. @@ -1450,13 +1473,14 @@ def citations_from_selection( Returns: List of citations in the requested format. """ + result_df = self._safe_filter_by_selection( self.index, collection_id=collection_id, patientId=patientId, studyInstanceUID=studyInstanceUID, seriesInstanceUID=seriesInstanceUID, - crdc_series_uuid=crdc_series_uuid, + crdc_series_uuid=None, ) citations = [] @@ -1533,8 +1557,40 @@ def download_from_selection( downloadDir = self._check_create_directory(downloadDir) + if crdc_series_uuid is not None: + download_df = pd.concat( + [ + self.index[ + [ + "PatientID", + "collection_id", + "Modality", + "StudyInstanceUID", + "SeriesInstanceUID", + "crdc_series_uuid", + "series_aws_url", + "series_size_MB", + ] + ], + self.previous_versions_index[ + [ + "PatientID", + "collection_id", + "Modality", + "StudyInstanceUID", + "SeriesInstanceUID", + "crdc_series_uuid", + "series_aws_url", + "series_size_MB", + ] + ], + ], + ) + else: + download_df = self.index + result_df = self._safe_filter_by_selection( - self.index, + download_df, collection_id=collection_id, patientId=patientId, studyInstanceUID=studyInstanceUID,