Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: support download by crdc_series_uuid #130

Merged
merged 2 commits into from
Oct 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion idc_index/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,13 @@ def set_log_level(log_level):
default=None,
help="DICOM SeriesInstanceUID(s) to filter by.",
)
@click.option(
"--crdc-series-uuid",
type=str,
multiple=True,
default=None,
help="crdc_series_uuid(s) to filter by.",
)
@click.option(
"--quiet",
type=bool,
Expand Down Expand Up @@ -122,6 +129,7 @@ def download_from_selection(
patient_id,
study_instance_uid,
series_instance_uid,
crdc_series_uuid,
quiet,
show_progress_bar,
use_s5cmd_sync,
Expand Down Expand Up @@ -159,11 +167,17 @@ def download_from_selection(
if series_instance_uid
else None
)
crdc_series_uuid = (
[uid.strip() for uid in (",".join(crdc_series_uuid)).split(",")]
if crdc_series_uuid
else None
)
logger_cli.debug("Inputs received from cli download:")
logger_cli.debug(f"collection_id: {collection_id}")
logger_cli.debug(f"patient_id: {patient_id}")
logger_cli.debug(f"study_instance_uid: {study_instance_uid}")
logger_cli.debug(f"series_instance_uid: {series_instance_uid}")
logger_cli.debug(f"crdc_series_uuid: {crdc_series_uuid}")
logger_cli.debug(f"dry_run: {dry_run}")
logger_cli.debug(f"quiet: {quiet}")
logger_cli.debug(f"show_progress_bar: {show_progress_bar}")
Expand All @@ -177,6 +191,7 @@ def download_from_selection(
patientId=patient_id,
studyInstanceUID=study_instance_uid,
seriesInstanceUID=series_instance_uid,
crdc_series_uuid=crdc_series_uuid,
quiet=quiet,
show_progress_bar=show_progress_bar,
use_s5cmd_sync=use_s5cmd_sync,
Expand Down Expand Up @@ -346,9 +361,12 @@ def check_and_download(column_name, item_ids, download_dir, kwarg_name):
matches_found += check_and_download(
"SeriesInstanceUID", item_ids, download_dir, "seriesInstanceUID"
)
matches_found += check_and_download(
"crdc_series_uuid", item_ids, download_dir, "crdc_series_uuid"
)
if not matches_found:
logger_cli.error(
"None of the values passed matched any of the identifiers: collection_id, PatientID, StudyInstanceUID, SeriesInstanceUID."
"None of the values passed matched any of the identifiers: collection_id, PatientID, StudyInstanceUID, SeriesInstanceUID, crdc_series_uuid."
)


Expand Down
133 changes: 115 additions & 18 deletions idc_index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,21 @@ def __init__(self):
logger.debug(f"Reading index file v{idc_index_data.__version__}")
self.index = pd.read_parquet(file_path)

# initialize crdc_series_uuid for the index
# TODO: in the future, after https://github.com/ImagingDataCommons/idc-index/pull/113
# is merged (to minimize disruption), it will make more sense to change
# idc-index-data to separate bucket from crdc_series_uuid, add support for GCP,
# and consequently simplify the code here
self.index["crdc_series_uuid"] = (
self.index["series_aws_url"].str.split("/").str[3]
)

self.previous_versions_index_path = (
idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH
)
file_path = idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH

self.previous_versions_index = pd.read_parquet(file_path)

# self.index = self.index.astype(str).replace("nan", "")
self.index["series_size_MB"] = self.index["series_size_MB"].astype(float)
Expand Down Expand Up @@ -136,7 +148,12 @@ def _filter_dataframe_by_id(key, dataframe, _id):

@staticmethod
def _safe_filter_by_selection(
df_index, collection_id, patientId, studyInstanceUID, seriesInstanceUID
df_index,
collection_id,
patientId,
studyInstanceUID,
seriesInstanceUID,
crdc_series_uuid,
):
if collection_id is not None:
if not isinstance(collection_id, str) and not isinstance(
Expand All @@ -156,26 +173,48 @@ def _safe_filter_by_selection(
seriesInstanceUID, list
):
raise TypeError("seriesInstanceUID must be a string or list of strings")

if collection_id is not None:
result_df = IDCClient._filter_by_collection_id(df_index, collection_id)
else:
result_df = df_index

if patientId is not None:
result_df = IDCClient._filter_by_patient_id(result_df, patientId)

if studyInstanceUID is not None:
result_df = IDCClient._filter_by_dicom_study_uid(
result_df, studyInstanceUID
if crdc_series_uuid is not None:
if not isinstance(crdc_series_uuid, str) and not isinstance(
crdc_series_uuid, list
):
raise TypeError("crdc_series_uuid must be a string or list of strings")

# Here we go down-up the hierarchy of filtering, taking into
# account the direction of one-to-many relationships
# one crdc_series_uuid can be associated with one and only one SeriesInstanceUID
# one SeriesInstanceUID can be associated with one and only one StudyInstanceUID
# one StudyInstanceUID can be associated with one and only one PatientID
# one PatientID can be associated with one and only one collection_id
# because of this we do not need to apply attributes above the given defined
# attribute in the hierarchy
# The earlier implemented behavior was a relic of the API from a different system
# that influenced the API of SlicerIDCIndex, and propagated into idc-index. Unfortunately.

if crdc_series_uuid is not None:
result_df = IDCClient._filter_dataframe_by_id(
"crdc_series_uuid", df_index, crdc_series_uuid
)
return result_df

if seriesInstanceUID is not None:
result_df = IDCClient._filter_by_dicom_series_uid(
result_df, seriesInstanceUID
df_index, seriesInstanceUID
)
return result_df

if studyInstanceUID is not None:
result_df = IDCClient._filter_by_dicom_study_uid(df_index, studyInstanceUID)
return result_df

if patientId is not None:
result_df = IDCClient._filter_by_patient_id(df_index, patientId)
return result_df

return result_df
if collection_id is not None:
result_df = IDCClient._filter_by_collection_id(df_index, collection_id)
return result_df

return None

@staticmethod
def _filter_by_collection_id(df_index, collection_id):
Expand Down Expand Up @@ -642,7 +681,28 @@ def _validate_update_manifest_and_get_download_size(
manifest_df.columns = ["manifest_cp_cmd"]

# create a copy of the index
index_df_copy = self.index
index_df_copy = self.index[
[
"SeriesInstanceUID",
"series_aws_url",
"series_size_MB",
"PatientID",
"collection_id",
"Modality",
"StudyInstanceUID",
]
]
previous_versions_index_df_copy = self.previous_versions_index[
[
"SeriesInstanceUID",
"series_aws_url",
"series_size_MB",
"PatientID",
"collection_id",
"Modality",
"StudyInstanceUID",
]
]

# use default hierarchy
if dirTemplate is not None:
Expand Down Expand Up @@ -737,7 +797,7 @@ def _validate_update_manifest_and_get_download_size(
series_size_MB,
{hierarchy} AS path,
FROM
'{self.previous_versions_index_path}' pvip
previous_versions_index_df_copy pvip

),
index_temp AS (
Expand Down Expand Up @@ -1413,12 +1473,14 @@ def citations_from_selection(
Returns:
List of citations in the requested format.
"""

result_df = self._safe_filter_by_selection(
self.index,
collection_id=collection_id,
patientId=patientId,
studyInstanceUID=studyInstanceUID,
seriesInstanceUID=seriesInstanceUID,
crdc_series_uuid=None,
)

citations = []
Expand Down Expand Up @@ -1469,6 +1531,7 @@ def download_from_selection(
patientId=None,
studyInstanceUID=None,
seriesInstanceUID=None,
crdc_series_uuid=None,
quiet=True,
show_progress_bar=True,
use_s5cmd_sync=False,
Expand All @@ -1484,6 +1547,7 @@ def download_from_selection(
patientId: string or list of strings containing the values of PatientID to filter by
studyInstanceUID: string or list of strings containing the values of DICOM StudyInstanceUID to filter by
seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by
crdc_series_uuid: string or list of strings containing the values of crdc_series_uuid to filter by
quiet (bool): If True, suppresses the output of the subprocess. Defaults to True
show_progress_bar (bool): If True, tracks the progress of download
use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded
Expand All @@ -1493,12 +1557,45 @@ def download_from_selection(

downloadDir = self._check_create_directory(downloadDir)

if crdc_series_uuid is not None:
download_df = pd.concat(
[
self.index[
[
"PatientID",
"collection_id",
"Modality",
"StudyInstanceUID",
"SeriesInstanceUID",
"crdc_series_uuid",
"series_aws_url",
"series_size_MB",
]
],
self.previous_versions_index[
[
"PatientID",
"collection_id",
"Modality",
"StudyInstanceUID",
"SeriesInstanceUID",
"crdc_series_uuid",
"series_aws_url",
"series_size_MB",
]
],
],
)
else:
download_df = self.index

result_df = self._safe_filter_by_selection(
self.index,
download_df,
collection_id=collection_id,
patientId=patientId,
studyInstanceUID=studyInstanceUID,
seriesInstanceUID=seriesInstanceUID,
crdc_series_uuid=crdc_series_uuid,
)

total_size = round(result_df["series_size_MB"].sum(), 2)
Expand Down
9 changes: 9 additions & 0 deletions tests/idcindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,10 +469,19 @@ def test_cli_download(self):
with runner.isolated_filesystem():
result = runner.invoke(
self.download,
# StudyInstanceUID:
["1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462"],
)
assert len(os.listdir(Path.cwd())) != 0

with runner.isolated_filesystem():
result = runner.invoke(
self.download,
# crdc_series_uuid:
["e5c5c71d-62c4-4c50-a8a9-b6799c7f8dea"],
)
assert len(os.listdir(Path.cwd())) != 0

def test_prior_version_manifest(self):
# Define the values for each optional parameter
quiet_values = [True, False]
Expand Down
Loading