Skip to content

Commit

Permalink
remove * in s5cmd urls
Browse files — browse the repository at this point in the history
  • Loading branch information
vkt1414 committed Dec 1, 2023
1 parent 9370a49 commit 2f43ff8
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 10 deletions.
19 changes: 10 additions & 9 deletions idc_index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def download_dicom_series(self, seriesInstanceUID, downloadDir, dry_run=False, q
logging.debug('AWS Bucket Location: '+series_url)

cmd = [self.s5cmdPath, '--no-sign-request', '--endpoint-url', 'https://s3.amazonaws.com', 'cp', '--show-progress',
series_url, downloadDir]
series_url+'*', downloadDir]

if not dry_run:
process = subprocess.run(cmd, capture_output=(not quiet), text=(not quiet))
Expand Down Expand Up @@ -253,13 +253,14 @@ def download_from_selection(self, downloadDir=None, dry_run=True, collection_id=
logging.info("Dry run. Not downloading files. Rerun with dry_run=False to download the files.")
return

# Download the files
# make temporary file to store the list of files to download
manifest_file = os.path.join(downloadDir, 'download_manifest.s5cmd')
for index, row in result_df.iterrows():
with open(manifest_file, 'a') as f:
f.write("cp --show-progress "+row['series_aws_url'] + " "+downloadDir+"\n")
self.download_from_manifest(manifest_file, downloadDir)
else:
# Download the files
# make temporary file to store the list of files to download
manifest_file = os.path.join(downloadDir, 'download_manifest.s5cmd')
for index, row in result_df.iterrows():
with open(manifest_file, 'a') as f:
f.write("cp --show-progress "+row['series_aws_url'] + " "+downloadDir+"\n")
self.download_from_manifest(manifest_file, downloadDir)

"""Download the files corresponding to the manifest file from IDC. The manifest file should be a text file with each line containing the s5cmd command to download the file. The URLs in the file must correspond to those in the AWS buckets!
Expand All @@ -273,7 +274,7 @@ def download_from_selection(self, downloadDir=None, dry_run=True, collection_id=
"""
def download_from_manifest(self, manifest_file, downloadDir):
cmd = [self.s5cmdPath, '--no-sign-request', '--endpoint-url', 'https://s3.amazonaws.com', 'run',
manifest_file, downloadDir]
manifest_file]
process = subprocess.run(cmd, capture_output=True, text=True)
logging.info(process.stderr)
if process.returncode == 0:
Expand Down
2 changes: 1 addition & 1 deletion queries/idc_index.sql
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ SELECT
COUNT(dicom_all.SOPInstanceUID) AS instanceCount,
ANY_VALUE(license_short_name) as license_short_name,
# download related attributes
ANY_VALUE(CONCAT("s3://", SPLIT(aws_url,"/")[SAFE_OFFSET(2)], "/", crdc_series_uuid, "/*")) AS series_aws_url,
ANY_VALUE(CONCAT("s3://", SPLIT(aws_url,"/")[SAFE_OFFSET(2)], "/", crdc_series_uuid, "/")) AS series_aws_url,
ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
FROM
bigquery-public-data.idc_v16.dicom_all AS dicom_all
Expand Down

0 comments on commit 2f43ff8

Please sign in to comment.