Skip to content

Commit

Permalink
Merge pull request #108 from genxnetwork/dummy_simulated_samples
Browse files Browse the repository at this point in the history
Dummy simulated samples
  • Loading branch information
pavelnikonorov authored Jul 28, 2023
2 parents 23e423c + a997db9 commit fdf5742
Show file tree
Hide file tree
Showing 26 changed files with 661 additions and 251 deletions.
28 changes: 19 additions & 9 deletions analysis/GRAPE Article Visualisations.ipynb

Large diffs are not rendered by default.

19 changes: 12 additions & 7 deletions containers/snakemake/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,15 @@ ENV SHELL /bin/bash
RUN apt-get update && apt-get install -y wget bzip2 gnupg2 git libgomp1 libarchive13 && \
wget -nv https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
rm Miniconda3-latest-Linux-x86_64.sh && \
conda install -c conda-forge mamba
rm Miniconda3-latest-Linux-x86_64.sh
# conda install -n base --override-channels -c conda-forge mamba==1.4.5 'python_abi=*=*cp*'

RUN for e in envs/*; do mamba env create -f $e ; done && \
# RUN for e in envs/*; do mamba env create -f $e ; done && \
# conda clean --all -y

RUN conda install -n base conda-libmamba-solver && \
conda config --set solver libmamba && \
conda env create -f envs/big_env.yaml && \
conda clean --all -y

# Intall Minimac3
Expand All @@ -29,10 +34,6 @@ ENV PATH "$PATH:/opt/Minimac3Executable/bin"
# Install Minimac4
RUN apt-get install -y minimac4

# Install Eagle
# RUN wget "https://data.broadinstitute.org/alkesgroup/Eagle/downloads/dev/eagle_v2.4.1" -O /usr/bin/eagle
# RUN chmod a+x /usr/bin/eagle

# Install Germline
# conda version of germline has a problem with non-zero error code
# https://github.com/gusevlab/germline/issues/8
Expand All @@ -48,6 +49,10 @@ RUN apt-get install -y make g++ git && \
ENV PATH "$PATH:/opt/germline/bin"
WORKDIR /

# Install Eagle
RUN wget "https://data.broadinstitute.org/alkesgroup/Eagle/downloads/dev/eagle_v2.4.1" -O /usr/bin/eagle
RUN chmod a+x /usr/bin/eagle

# Workaround of NonWritableError when conda tries to create environments for the first time
# funnel launches docker containers with --read-only and snakemake cannot create conda envs
# because it has to do something with urls.txt
Expand Down
47 changes: 47 additions & 0 deletions envs/big_env.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
name: snakemake
channels:
- conda-forge
- bioconda
- biocore
- b3bg
- alex_genxt
- defaults
dependencies:
- python>=3.9
- matplotlib
- pandas
- numpy
- seaborn
- biom-format
- scikit-bio
- docutils
- mmh3
- bcftools
- samtools
- plink
- unzip
- wget
- openssl
- parallel
- ibis
- openjdk
- picard-slim
- ped-sim
- networkx
- scipy
- statsmodels
- rapid-ibd
- scikit-allel
- scikit-learn
- king
- snakemake
- pip
- pip:
- polars
- pydot
- "--editable=git+https://github.com/alex-medvedev-msc/ersa.git#egg=ersa"
# - pip:
# - "--editable=git+https://github.com/Jahysama/snakemake.git#egg=snakemake"
#Fork of a snakemake is used here because of not working conda envs inside python scripts
#Please check out https://github.com/snakemake/snakemake/pull/1812 for more info

25 changes: 24 additions & 1 deletion launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,27 @@ def get_parser_args():
type=int,
help='Random seed for Ped-sim pedigree simulation. The default value is randomly generated.')

parser.add_argument(
'--sim-samples-number',
default=1000,
type=int,
help='Number of samples to simulate in Ped-sim pedigree simulation using simbig workflow.'
)

parser.add_argument(
'--background',
default='1kg',
type=str,
help='Founders for simulation. The default value is 1kg. In this case, it will use 1000 genomes founders'
)

parser.add_argument(
'--augment-background',
default=0,
type=int,
help='How many samples to add to the background. The default value is 0. In this case, it will not add any samples to the background'
)

args = parser.parse_args()

valid_commands = [
Expand Down Expand Up @@ -493,8 +514,10 @@ def copy_input(input_dir, working_dir, samples_file):
config_dict['alt_hom_samples'] = args.alt_hom_samples
config_dict['het_samples'] = args.het_samples
config_dict['iqr_alpha'] = args.iqr_alpha

config_dict['sim_samples_number'] = args.sim_samples_number
config_dict['seed'] = args.seed
config_dict['background'] = args.background
config_dict['augment_background'] = args.augment_background

if args.weight_mask:
config_dict['weight_mask'] = os.path.join(args.directory, args.weight_mask)
Expand Down
22 changes: 12 additions & 10 deletions rules/filter.smk
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
rule annotate_snp_ids:
input:
vcf = 'vcf/{batch}_merged_lifted.vcf.gz'
output:
vcf = 'vcf/{batch}_merged_annotated.vcf.gz'
shell:
'''
bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" {input.vcf} -O z -o {output.vcf}
'''

rule select_bad_samples:
input:
vcf='vcf/{batch}_merged_lifted.vcf.gz'
vcf='vcf/{batch}_merged_annotated.vcf.gz'
output:
bad_samples='vcf/{batch}_lifted_vcf.badsamples',
report='results/{batch}_bad_samples_report.tsv',
Expand All @@ -17,22 +27,18 @@ rule select_bad_samples:
psc='stats/{batch}_lifted_vcf.psc',
keep_samples='stats/{batch}_keep_samples.list',

conda:
'evaluation'
script:
'../scripts/select_bad_samples.py'


rule plink_filter:
input:
vcf='vcf/{batch}_merged_lifted.vcf.gz',
vcf='vcf/{batch}_merged_annotated.vcf.gz',
bad_samples=rules.select_bad_samples.output.bad_samples
output:
bed = temp('plink/{batch}_merged_filter.bed'),
bim = temp('plink/{batch}_merged_filter.bim'),
fam = temp('plink/{batch}_merged_filter.fam')
conda:
'plink'
params:
input = '{batch}_merged',
out = '{batch}_merged_filter',
Expand Down Expand Up @@ -78,8 +84,6 @@ rule plink_clean_up:
params:
input = 'plink/{batch}_merged_filter',
out = 'plink/{batch}_merged_mapped'
conda:
'plink'
log:
'logs/plink/{batch}_plink_clean_up.log'
benchmark:
Expand Down Expand Up @@ -116,8 +120,6 @@ rule prepare_vcf:
params:
input = 'plink/{batch}_merged_mapped',
vcf = 'vcf/{batch}_merged_mapped_sorted.vcf.gz'
conda:
'bcf_plink'
log:
plink='logs/plink/{batch}_prepare_vcf.log',
vcf='logs/vcf/{batch}_prepare_vcf.log'
Expand Down
27 changes: 21 additions & 6 deletions rules/phasing.smk
Original file line number Diff line number Diff line change
@@ -1,8 +1,23 @@
rule index_for_eagle:
input:
bcf='vcf/{batch}_merged_mapped_sorted.bcf.gz'
output:
idx='vcf/{batch}_merged_mapped_sorted.bcf.gz.tbi'
log:
'logs/vcf/{batch}_merged_mapped_sorted.bcf.gz.tbi.log'
conda:
'bcftools'
shell:
'''
bcftools index -f -t {input.bcf} |& tee {log}
'''

rule phase:
input:
vcf='vcf/{batch}_merged_mapped_sorted.vcf.gz',
vcf='vcf/{batch}_merged_mapped_sorted.bcf.gz',
idx='vcf/{batch}_merged_mapped_sorted.bcf.gz.tbi',
vcfRef=REF_VCF
output: temp('phase/{batch}_chr{chrom}.phased.vcf.gz')
output: temp('phase/{batch}_chr{chrom}.phased.bcf.gz')
log:
'logs/phase/{batch}_eagle-{chrom}.log'
benchmark:
Expand All @@ -13,20 +28,20 @@ rule phase:
--vcfTarget {input.vcf} \
--geneticMapFile {GENETIC_MAP} \
--chrom {wildcards.chrom} \
--vcfOutFormat z \
--vcfOutFormat b \
--pbwtIters 2 \
--Kpbwt 20000 \
--outPrefix phase/{wildcards.batch}_chr{wildcards.chrom}.phased |& tee {log}
'''


phase = ['chr{i}.phased.vcf.gz'.format(i=chr) for chr in CHROMOSOMES]
phase = ['chr{i}.phased.bcf.gz'.format(i=chr) for chr in CHROMOSOMES]
phase_batch = [ 'phase/{batch}_' + line for line in phase]
rule merge_phased:
input:
phase_batch
output:
'phase/{batch}_merged_phased.vcf.gz'
'phase/{batch}_merged_phased.bcf.gz'
params:
list='vcf/{batch}_phased.merge.list',
mode=config['mode']
Expand Down Expand Up @@ -55,7 +70,7 @@ rule merge_phased:
# check if there is a background data and merge it
if [ -f "background/{wildcards.batch}_merged_imputed.vcf.gz" && {params.mode} = "client" ]; then
mv {output} {output}.client
bcftools merge --force-samples background/{wildcards.batch}_merged_imputed.vcf.gz {output}.client -O z -o {output} |& tee -a {log}
bcftools merge --force-samples background/{wildcards.batch}_merged_imputed.vcf.gz {output}.client -O b -o {output} |& tee -a {log}
bcftools index -f {output} |& tee -a {log}
fi
'''
32 changes: 5 additions & 27 deletions rules/preprocessing.smk
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ if NUM_BATCHES > 1:
temp(expand('vcf/batch{i}.txt',i=BATCHES))
params:
num_batches=NUM_BATCHES
conda:
'bcftools'
shell:
'''
bcftools query --list-samples input.vcf.gz >> vcf/samples.txt
Expand All @@ -37,8 +35,6 @@ if NUM_BATCHES > 1:
samples='vcf/{batch}.txt'
output:
vcf='vcf/{batch}.vcf.gz'
conda:
'bcftools'
shell:
'''
bcftools view -S {input.samples} {input.vcf} -O z -o {output.vcf} --force-samples
Expand Down Expand Up @@ -81,8 +77,6 @@ if assembly == 'hg38':
vcf='vcf/{batch}_imputation_removed.vcf.gz'
output:
vcf=temp('vcf/{batch}_merged_lifted.vcf.gz')
conda:
'liftover'
log:
'logs/liftover/liftover{batch}.log'
params:
Expand All @@ -91,7 +85,7 @@ if assembly == 'hg38':
mem_mb=_mem_gb_for_ram_hungry_jobs() * 1024
shell:
'''
JAVA_OPTS="-Xmx{params.mem_gb}g" picard LiftoverVcf WARN_ON_MISSING_CONTIG=true MAX_RECORDS_IN_RAM=5000 I={input.vcf} O={output.vcf} CHAIN={LIFT_CHAIN} REJECT=vcf/chr{wildcards.batch}_rejected.vcf.gz R={GRCH37_FASTA} |& tee -a {log}
JAVA_OPTS="-Xmx{params.mem_gb}g" picard -Xmx12g LiftoverVcf WARN_ON_MISSING_CONTIG=true MAX_RECORDS_IN_RAM=5000 I={input.vcf} O={output.vcf} CHAIN={LIFT_CHAIN} REJECT=vcf/chr{wildcards.batch}_rejected.vcf.gz R={GRCH37_FASTA} |& tee -a {log}
'''
else:
rule copy_liftover:
Expand All @@ -110,11 +104,11 @@ if flow == 'rapid' or flow == 'germline-king':
vcf = 'vcf/{batch}_imputation_removed.vcf.gz'
output:
bcf = 'vcf/{batch}_merged_mapped_sorted.bcf.gz'
conda:
'bcftools'
shell:
'''
bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" {input.vcf} | bcftools view --min-af 0.05 -O b -o {output.bcf}
bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" {input.vcf} | \
bcftools view -t ^8:10428647-13469693,21:16344186-19375168,10:44555093-53240188,22:16051881-25095451,2:85304243-99558013,1:118434520-153401108,15:20060673-25145260,17:77186666-78417478,15:27115823-30295750,17:59518083-64970531,2:132695025-141442636,16:19393068-24031556,2:192352906-198110229 | \
bcftools view --min-af 0.05 --types snps -O b -o {output.bcf}
'''
else:
include: '../rules/filter.smk'
Expand Down Expand Up @@ -149,8 +143,6 @@ else:
bcf=bcf_input
output:
vcf=vcf_output
conda:
'bcftools'
shell:
'''
bcftools view {input.bcf} -O z -o {output.vcf}
Expand All @@ -168,8 +160,6 @@ if not flow == 'rapid':
bim='preprocessed/{batch}_data.bim'
params:
out='preprocessed/{batch}_data'
conda:
'plink'
log:
'logs/plink/convert_mapped_to_plink{batch}.log'
benchmark:
Expand Down Expand Up @@ -199,8 +189,6 @@ if not flow == 'rapid':
bim='preprocessed/data.bim'
threads:
workflow.cores
conda:
'plink'
shell:
'''
rm files_list.txt || true
Expand All @@ -222,8 +210,6 @@ if not flow == 'rapid':
batches_vcf='preprocessed/{batch}_data.vcf.gz'
output:
batches_vcf_index=temp('preprocessed/{batch}_data.vcf.gz.csi')
conda:
'bcftools'
shell:
'''
bcftools index -f {input.batches_vcf}
Expand All @@ -248,8 +234,6 @@ if not flow == 'rapid':
params:
batches_vcf=expand('batch{s}_data.vcf.gz',s=BATCHES),
vcf='data.vcf.gz'
conda:
'bcftools'
shell:
'''
rm complete_vcf_list.txt || true
Expand All @@ -274,8 +258,6 @@ if not flow == 'rapid':
bim='preprocessed/data.bim'
params:
out='preprocessed/data'
conda:
'plink'
log:
'logs/plink/convert_mapped_to_plink_batch1.log'
benchmark:
Expand All @@ -291,8 +273,6 @@ if not flow == 'rapid':
bim='preprocessed/data.bim'
params:
genetic_map_GRCh37=expand(GENETIC_MAP_GRCH37,chrom=CHROMOSOMES)
conda:
'ibis'
output:
'preprocessed/data_mapped.bim'
log:
Expand All @@ -306,11 +286,9 @@ if not flow == 'rapid':
else:
rule create_samples_list:
input:
bcf_input = 'phase/batch1_merged_phased.bcf.gz'
bcf = 'phase/batch1_merged_phased.bcf.gz'
output:
fam='preprocessed/data.fam'
conda:
'bcftools'
shell:
'''
bcftools query --list-samples {input.bcf} >> {output.fam}
Expand Down
Loading

0 comments on commit fdf5742

Please sign in to comment.