From 2f43ff8ea0ff889f8650a1b9800cda6a05ceb686 Mon Sep 17 00:00:00 2001 From: vkt1414 Date: Fri, 1 Dec 2023 12:08:53 -0500 Subject: [PATCH] remove * in s5cmd urls --- idc_index/index.py | 19 ++++++++++--------- queries/idc_index.sql | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index 22457747..daed8113 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -191,7 +191,7 @@ def download_dicom_series(self, seriesInstanceUID, downloadDir, dry_run=False, q logging.debug('AWS Bucket Location: '+series_url) cmd = [self.s5cmdPath, '--no-sign-request', '--endpoint-url', 'https://s3.amazonaws.com', 'cp', '--show-progress', - series_url, downloadDir] + series_url+'*', downloadDir] if not dry_run: process = subprocess.run(cmd, capture_output=(not quiet), text=(not quiet)) @@ -253,13 +253,14 @@ def download_from_selection(self, downloadDir=None, dry_run=True, collection_id= logging.info("Dry run. Not downloading files. Rerun with dry_run=False to download the files.") return - # Download the files - # make temporary file to store the list of files to download - manifest_file = os.path.join(downloadDir, 'download_manifest.s5cmd') - for index, row in result_df.iterrows(): - with open(manifest_file, 'a') as f: - f.write("cp --show-progress "+row['series_aws_url'] + " "+downloadDir+"\n") - self.download_from_manifest(manifest_file, downloadDir) + else: + # Download the files + # make temporary file to store the list of files to download + manifest_file = os.path.join(downloadDir, 'download_manifest.s5cmd') + for index, row in result_df.iterrows(): + with open(manifest_file, 'a') as f: + f.write("cp --show-progress "+row['series_aws_url'] + " "+downloadDir+"\n") + self.download_from_manifest(manifest_file, downloadDir) """Download the files corresponding to the manifest file from IDC. The manifest file should be a text file with each line containing the s5cmd command to download the file. The URLs in the file must correspond to those in the AWS buckets! @@ -273,7 +274,7 @@ def download_from_selection(self, downloadDir=None, dry_run=True, collection_id= """ def download_from_manifest(self, manifest_file, downloadDir): cmd = [self.s5cmdPath, '--no-sign-request', '--endpoint-url', 'https://s3.amazonaws.com', 'run', - manifest_file, downloadDir] + manifest_file] process = subprocess.run(cmd, capture_output=True, text=True) logging.info(process.stderr) if process.returncode == 0: diff --git a/queries/idc_index.sql b/queries/idc_index.sql index 6718adbb..f6e17c75 100644 --- a/queries/idc_index.sql +++ b/queries/idc_index.sql @@ -22,7 +22,7 @@ SELECT COUNT(dicom_all.SOPInstanceUID) AS instanceCount, ANY_VALUE(license_short_name) as license_short_name, # download related attributes - ANY_VALUE(CONCAT("s3://", SPLIT(aws_url,"/")[SAFE_OFFSET(2)], "/", crdc_series_uuid, "/*")) AS series_aws_url, + ANY_VALUE(CONCAT("s3://", SPLIT(aws_url,"/")[SAFE_OFFSET(2)], "/", crdc_series_uuid, "/")) AS series_aws_url, ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB, FROM bigquery-public-data.idc_v16.dicom_all AS dicom_all