Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pipeline.get_sample_names() fix for plate-replicates #108

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
59 changes: 46 additions & 13 deletions sequence_processing_pipeline/Pipeline.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from json import load as json_load
from json import loads as json_loads
from json.decoder import JSONDecodeError
from os import makedirs, listdir
from os.path import join, exists, isdir, basename
Expand Down Expand Up @@ -453,28 +452,62 @@ def get_sample_names(self, project_name=None):
# test for self.mapping_file, since self.sample_sheet will be
# defined in both cases.
if self.mapping_file is not None:
return self._get_sample_names_from_mapping_file(project_name)
results = self._get_sample_names_from_mapping_file(project_name)
else:
return self._get_sample_names_from_sample_sheet(project_name)
results = self._get_sample_names_from_sample_sheet(project_name)

# sort all results before returning. sets will automatically be
# converted to lists.
return sorted(results)

def _get_sample_names_from_sample_sheet(self, project_name):
    '''
    Return the sample-names found in self.sample_sheet.
    :param project_name: If not None, only samples whose Sample_Project
     value contains project_name are returned.
    :return: A set of orig_name values when the first sample in the
     sheet defines 'orig_name' (replicate suffixes removed, duplicates
     collapsed); otherwise a list of Sample_Name values. An empty list
     is returned when the sample-sheet contains no samples.
    '''
    samples = self.sample_sheet.samples

    if not samples:
        # without this guard, inspecting samples[0] below would raise
        # an IndexError on a sample-sheet that has no samples.
        return []

    # the first sample of the entire sheet (not of the filtered subset)
    # decides whether orig_name or Sample_Name is reported.
    use_orig_name = 'orig_name' in samples[0]

    if project_name is not None:
        # NOTE(review): this is a substring test, so a project_name that
        # happens to be contained in another project's Sample_Project
        # value will also match. Confirm this is what callers expect.
        samples = [x for x in samples
                   if project_name in x['Sample_Project']]

    if use_orig_name:
        # sample-names w/out replicate suffix will naturally appear more
        # than once; a set removes the duplicates.
        return {x.orig_name for x in samples}

    return [x.Sample_Name for x in samples]

def _get_sample_names_from_mapping_file(self, project_name):
    '''
    Return the sample-names found in self.mapping_file.
    :param project_name: If not None, only rows whose project_name
     column equals project_name are considered.
    :return: A set of orig_name values when the selected rows are all
     marked as replicates; otherwise a list of sample_name values.
    :raises ValueError: If the contains_replicates column of the
     selected rows holds both True and False.
    '''
    df = self.mapping_file

    if project_name is not None:
        df = df[df['project_name'] == project_name]

    # since orig_name is currently a required column whether or not
    # the pre-prep file contains replicates, search for the presence
    # of contains_replicates column instead, as it's not required for
    # non-replicate pre-prep files. The same test is applied whether or
    # not a project_name was given, so both paths agree on which column
    # is reported.
    if 'contains_replicates' in df.columns:
        values = set(df['contains_replicates'])

        if values == {True, False}:
            # this is a sanity-check. A pre-prep file with both
            # values should fail validation during Pipeline creation.
            raise ValueError("'contains_replicates' column must either"
                             " be all True or all False")

        if values == {True}:
            # contains_replicates should be either all True or all
            # False. We need not assume non-boolean data-types.
            # With replicates, assume there are duplicate entries in
            # orig_name column; a set removes them.
            return set(df['orig_name'])

    # if values == {False, } or is a legacy mapping-file w/out the
    # column, simply return sample-names. A plain list is returned
    # rather than a pandas Series so callers get a built-in container.
    return list(df['sample_name'])

def _parse_project_name(self, project_name, short_names):
'''
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
orig_name sample_name contains_replicates barcode primer primer_plate well_id_96 well_id_384 plating extractionkit_lot extraction_robot tm1000_8_tool primer_date mastermix_lot water_lot processing_robot tm300_8_tool tm50_8_tool sample_plate project_name well_description experiment_design_description library_construction_protocol linker platform run_center run_date run_prefix pcr_primers sequencing_meth target_gene target_subfragment center_name center_project_name instrument_model runid tm10_8_tool
0 9.18.19.RK.ST.900 9.18.19.RK.ST.900.A1 True ATGTTAGGGAAT GTGYCAGCMGCCGCGGTAA 5 A1 A1 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
1 9.19.19.RK.ST.1100 9.19.19.RK.ST.1100.A3 False AAGTGGCTATCC GTGYCAGCMGCCGCGGTAA 5 A2 A3 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
2 9.20.19.RK.ST.700 9.20.19.RK.ST.700.A5 True GTCGTTACCCGC GTGYCAGCMGCCGCGGTAA 5 A3 A5 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
3 9.20.19.RK.ST.1100 9.20.19.RK.ST.1100.A7 True AGTATATGTTTC GTGYCAGCMGCCGCGGTAA 5 A4 A7 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
4 9.21.19.RK.ST.500 9.21.19.RK.ST.500.A9 True GGCTCGTCGGAG GTGYCAGCMGCCGCGGTAA 5 A5 A9 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
5 9.22.19.RK.ST.700 9.22.19.RK.ST.700.A11 True GACATCTGACAC GTGYCAGCMGCCGCGGTAA 5 A6 A11 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
6 9.16.19.RK.ST.700 9.16.19.RK.ST.700.A13 True AATTTCCTAACA GTGYCAGCMGCCGCGGTAA 5 A7 A13 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
7 9.16.19.RK.ST.1100 9.16.19.RK.ST.1100.A15 True ATAAACGGACAT GTGYCAGCMGCCGCGGTAA 5 A8 A15 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
8 9.18.19.RK.ST.900 9.18.19.RK.ST.900.B1 True ATGTTAGGGAAT GTGYCAGCMGCCGCGGTAA 5 A1 B1 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
9 9.19.19.RK.ST.1100 9.19.19.RK.ST.1100.B3 True AAGTGGCTATCC GTGYCAGCMGCCGCGGTAA 5 A2 B3 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
10 9.20.19.RK.ST.700 9.20.19.RK.ST.700.B5 True GTCGTTACCCGC GTGYCAGCMGCCGCGGTAA 5 A3 B5 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
11 9.20.19.RK.ST.1100 9.20.19.RK.ST.1100.B7 True AGTATATGTTTC GTGYCAGCMGCCGCGGTAA 5 A4 B7 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
12 9.21.19.RK.ST.500 9.21.19.RK.ST.500.B9 True GGCTCGTCGGAG GTGYCAGCMGCCGCGGTAA 5 A5 B9 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
13 9.22.19.RK.ST.700 9.22.19.RK.ST.700.B11 True GACATCTGACAC GTGYCAGCMGCCGCGGTAA 5 A6 B11 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
14 9.16.19.RK.ST.700 9.16.19.RK.ST.700.B13 True AATTTCCTAACA GTGYCAGCMGCCGCGGTAA 5 A7 B13 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
15 9.16.19.RK.ST.1100 9.16.19.RK.ST.1100.B15 True ATAAACGGACAT GTGYCAGCMGCCGCGGTAA 5 A8 B15 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
16 9.18.19.RK.ST.900 9.18.19.RK.ST.900.A2 True ATGTTAGGGAAT GTGYCAGCMGCCGCGGTAA 5 A1 A2 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
17 9.19.19.RK.ST.1100 9.19.19.RK.ST.1100.A4 True AAGTGGCTATCC GTGYCAGCMGCCGCGGTAA 5 A2 A4 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
18 9.20.19.RK.ST.700 9.20.19.RK.ST.700.A6 True GTCGTTACCCGC GTGYCAGCMGCCGCGGTAA 5 A3 A6 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
19 9.20.19.RK.ST.1100 9.20.19.RK.ST.1100.A8 True AGTATATGTTTC GTGYCAGCMGCCGCGGTAA 5 A4 A8 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
20 9.21.19.RK.ST.500 9.21.19.RK.ST.500.A10 True GGCTCGTCGGAG GTGYCAGCMGCCGCGGTAA 5 A5 A10 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
21 9.22.19.RK.ST.700 9.22.19.RK.ST.700.A12 True GACATCTGACAC GTGYCAGCMGCCGCGGTAA 5 A6 A12 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
22 9.16.19.RK.ST.700 9.16.19.RK.ST.700.A14 True AATTTCCTAACA GTGYCAGCMGCCGCGGTAA 5 A7 A14 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
23 9.16.19.RK.ST.1100 9.16.19.RK.ST.1100.A16 True ATAAACGGACAT GTGYCAGCMGCCGCGGTAA 5 A8 A16 SF 163051748 Carmen_HOWE_KF3 109379Z 12/28/22 1266015 RNBL1950 Echo550 not applicable not applicable ABTX_Plate_174 ABTX_11052 description exp design description Illumina EMP protocol 515fbc, 806r amplification of 16S rRNA V4 GT Illumina UCSDMI 3/2/23 FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT Sequencing by synthesis 16S rRNA V4 UCSDMI Rob ABTX Illumina MiSeq
Loading
Loading