Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] explore all overlapping genomes, not just minimum metagenome cover #124

Open
wants to merge 21 commits into
base: latest
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 148 additions & 4 deletions genome_grist/conf/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ if not base_tempdir:
print("Please set 'tempdir' in the config.", file=sys.stderr)
sys.exit(-1)

# prefetch mapping analysis selection: list of substrings used to select
# prefetch matches by name (a match is kept if any substring occurs in its
# 'match_name'; an empty list selects nothing).
prefetch_select = config.get('prefetch_select', [])

ABUNDTRIM_MEMORY = float(config.get('metagenome_trim_memory', '1e9'))

GENBANK_CACHE = config.get('genbank_cache', './genbank_cache/')
Expand Down Expand Up @@ -208,6 +212,79 @@ rule summarize_sample_info:
input:
expand(outdir + '/{sample}.info.yaml', sample=SAMPLES)

# checkpoint marking that the per-sample prefetch CSV exists; instances of
# Checkpoint_PrefetchResults wait on this before reading the CSV.
checkpoint prefetch_genbank_wc:
    input:
        prefetch_csv = f'{outdir}/gather/{{sample}}.x.prefetch.csv'
    output:
        # checkpoints need an output ;) -- a touch-file is enough.
        touch(f"{outdir}/.prefetch.{{sample}}")

class Checkpoint_PrefetchResults:
    """Snakemake input-function class for prefetch-based genome selection.

    Given a filename pattern containing an '{ident}' wildcard, expand it
    over the genome identifiers selected from each sample's prefetch CSV
    (filtered by the 'prefetch_select' config substrings).  Evaluation is
    deferred until the 'prefetch_genbank_wc' checkpoint has run.
    """
    def __init__(self, pattern, samples=None):
        # pattern: filename pattern with an '{ident}' wildcard.
        # samples: optional list of samples to expand over; when given,
        # the consuming rule must not itself carry a 'sample' wildcard.
        self.pattern = pattern
        self.samples = samples

    def get_genome_idents(self, sample):
        """Return identifiers from the sample's prefetch CSV whose
        'match_name' contains any 'prefetch_select' substring."""
        # NOTE(review): the checkpoint input is '{sample}.x.prefetch.csv'
        # but this reads '{sample}.prefetch.csv' -- confirm which filename
        # is correct.
        prefetch_csv = f'{outdir}/gather/{sample}.prefetch.csv'
        assert os.path.exists(prefetch_csv)

        genome_idents = []
        n_nomatch = 0
        with open(prefetch_csv, 'rt') as fp:
            for row in csv.DictReader(fp):
                name = row['match_name']
                if any(substr in name for substr in prefetch_select):
                    # the identifier is the first whitespace-separated token
                    genome_idents.append(name.split(' ')[0])
                else:
                    n_nomatch += 1

        print(f'prefetch: loaded {len(genome_idents)} identifiers from {prefetch_csv}.',
              file=sys.stderr)
        print(f"skipped {n_nomatch} because they did not match 'prefetch_select' configuration.",
              file=sys.stderr)

        return genome_idents

    def __call__(self, w):
        global checkpoints

        if self.samples:
            # expanding over an explicit sample list is incompatible with a
            # 'sample' wildcard on the consuming rule.
            assert not hasattr(w, 'sample')

            ret = []
            for sample in self.samples:
                d = dict(sample=sample)
                w = snakemake.io.Wildcards(fromdict=d)
                ret.extend(self.do_sample(w))
            return ret
        else:
            return self.do_sample(w)

    def do_sample(self, w):
        # wait for the results of 'prefetch_genbank_wc'; this will trigger
        # an exception until that checkpoint has been run.
        checkpoints.prefetch_genbank_wc.get(**w)

        # parse the prefetch CSV and expand the pattern over the idents.
        genome_idents = self.get_genome_idents(w.sample)

        return expand(self.pattern, ident=genome_idents, **w)

@toplevel
checkpoint gather_reads:
input:
Expand Down Expand Up @@ -251,7 +328,10 @@ class Checkpoint_GatherResults:
def __call__(self, w):
# get 'sample' from wildcards?
if self.samples is None:
return self.do_sample(w)
print('XYZ')
x = self.do_sample(w)
print(x)
return x
else:
assert not hasattr(w, 'sample'), "if 'samples' provided to constructor, cannot also be in rule inputs"

Expand All @@ -262,6 +342,7 @@ class Checkpoint_GatherResults:

x = self.do_sample(w)
ret.extend(x)
print('YYY', x)

return ret

Expand Down Expand Up @@ -336,7 +417,7 @@ class ListGatherGenomes(Checkpoint_GatherResults):
genome_filenames.append(info['genome_filename'])
genome_filenames.append(info['info_filename'])

# genbank: point at genbank_genomes
# genbank: point at genbank_cache
else:
genome_filenames.append(f'{GENBANK_CACHE}/{ident}_genomic.fna.gz')
genome_filenames.append(f'{GENBANK_CACHE}/{ident}.info.csv')
Expand Down Expand Up @@ -383,12 +464,26 @@ rule combine_genome_info:
expand(f"{outdir}/gather/{{sample}}.genomes.info.csv",
sample=SAMPLES)

# toplevel target: build the combined prefetch genomes info CSV for every
# sample (produced by 'make_combined_prefetch_info_csv').
@toplevel
rule download_prefetch_genomes_info:
    input:
        expand(f"{outdir}/gather/{{sample}}.genomes.prefetch.info.csv",
               sample=SAMPLES)

# toplevel target: genome files for all gather matches, placed in the
# genbank cache directory.
@toplevel
rule download_genbank_genomes:
    input:
        Checkpoint_GatherResults(f"{GENBANK_CACHE}/{{ident}}_genomic.fna.gz",
                                 samples=SAMPLES)

# @CTB switch to using /genomes/?
# toplevel target: genome + info files for all prefetch-selected matches,
# placed in the genbank cache directory.
@toplevel
rule download_prefetch_genomes:
    input:
        Checkpoint_PrefetchResults(f"{GENBANK_CACHE}/{{ident}}_genomic.fna.gz",
                                   SAMPLES),
        Checkpoint_PrefetchResults(f"{GENBANK_CACHE}/{{ident}}.info.csv", SAMPLES)

@toplevel
rule retrieve_genomes:
input:
Expand All @@ -400,6 +495,16 @@ rule map_reads:
expand(f"{outdir}/mapping/{{sample}}.summary.csv", sample=SAMPLES),
expand(f"{outdir}/leftover/{{sample}}.summary.csv", sample=SAMPLES)

# toplevel target: per-sample mapping summary CSVs for prefetch results
# (produced by 'summarize_samtools_depth_prefetch_wc').
@toplevel
rule map_prefetch:
    input:
        expand(f"{outdir}/prefetch/{{sample}}.summary2.csv", sample=SAMPLES),

# toplevel target: per-genome VCFs for every prefetch-selected match of
# every sample.
@toplevel
rule prefetch_to_vcf:
    input:
        Checkpoint_PrefetchResults(outdir + f"/prefetch/{{sample}}.x.{{ident}}.vcf.gz", samples=SAMPLES)

@toplevel
rule build_consensus:
input:
Expand Down Expand Up @@ -440,7 +545,8 @@ rule zip:
rm -f transfer.zip
zip -r transfer.zip {outdir}/leftover/*.summary.csv \
{outdir}/mapping/*.summary.csv {outdir}/*.yaml \
{outdir}/gather/*.csv {outdir}/reports/
{outdir}/gather/*.csv {outdir}/reports/ \
{outdir}/prefetch/*.summary2.csv
"""


Expand Down Expand Up @@ -637,6 +743,20 @@ rule minimap_wc:
samtools view -b -F 4 - | samtools sort - > {output.bam}
"""

# map abundtrim reads and produce a bam for prefetch results; same
# minimap2/samtools pipeline as 'minimap_wc', but writing under
# {outdir}/prefetch/ instead of {outdir}/mapping/.
rule minimap_prefetch_wc:
    input:
        # ancient(): ignore the genome file's timestamp so re-copies don't
        # force a re-map.
        query = ancient(f"{outdir}/genomes/{{ident}}_genomic.fna.gz"),
        metagenome = outdir + "/abundtrim/{sample}.abundtrim.fq.gz",
    output:
        bam = outdir + "/prefetch/{sample}.x.{ident}.bam",
    conda: "env/minimap2.yml"
    threads: 4
    # -ax sr: short-read preset; -F 4 drops unmapped reads before sorting.
    shell: """
        minimap2 -ax sr -t {threads} {input.query} {input.metagenome} | \
            samtools view -b -F 4 - | samtools sort - > {output.bam}
    """

# extract FASTQ from BAM
rule bam_to_fastq_wc:
input:
Expand Down Expand Up @@ -724,6 +844,18 @@ rule summarize_samtools_depth_wc:
{input.depth} -o {output.csv}
"""

# summarize depth into a CSV for prefetch results
rule summarize_samtools_depth_prefetch_wc:
    input:
        depth = Checkpoint_PrefetchResults(outdir + f"/{{dir}}/{{sample}}.x.{{ident}}.depth.txt"),
        # NOTE(review): the VCFs are required but never referenced in the
        # shell command -- presumably listed only to force their creation;
        # confirm this is intentional.
        vcf = Checkpoint_PrefetchResults(outdir + f"/{{dir}}/{{sample}}.x.{{ident}}.vcf.gz")
    output:
        csv = f"{outdir}/{{dir}}/{{sample}}.summary2.csv"
    shell: """
        python -m genome_grist.summarize_mapping {wildcards.sample} \
            {input.depth} -o {output.csv}
    """

# compute sourmash signature from abundtrim reads
rule smash_abundtrim_wc:
input:
Expand Down Expand Up @@ -960,7 +1092,7 @@ checkpoint copy_sample_genomes_to_output_wc:
input:
# note: a key thing here is that the filenames themselves are correct,
# so we are simply copying from (multiple) directories into one.
# this is why the genome filenames need to be {acc}_genomic.fna.gz.
# this is why the genome filenames need to be {ident}_genomic.fna.gz.
genomes = ListGatherGenomes(),
output:
touch(f"{outdir}/genomes/.genomes.{{sample}}")
Expand All @@ -976,11 +1108,23 @@ rule make_combined_info_csv_wc:
output:
genomes_info_csv = f"{outdir}/gather/{{sample}}.genomes.info.csv",
shell: """
echo hello, world
echo FOO {input.csvs}
python -Werror -m genome_grist.combine_csvs \
--fields ident,display_name \
{input.csvs} > {output}
"""

# combined info.csv for prefetch: concatenate the per-genome info CSVs of
# all prefetch-selected matches into one per-sample CSV.
rule make_combined_prefetch_info_csv:
    input:
        Checkpoint_PrefetchResults(f'{outdir}/genomes/{{ident}}.info.csv')
    output:
        genomes_info_csv = f"{outdir}/gather/{{sample}}.genomes.prefetch.info.csv",
    shell: """
        python -Werror -m genome_grist.combine_csvs {input} > {output}
    """

# download actual genomes from genbank!
rule download_matching_genome_wc:
input:
Expand Down
Loading