Merge pull request #69 from genxnetwork/develop
GRAPE v1.6 Release
josephkott committed May 19, 2022
2 parents 69c6ffd + 1818623 commit 6ca9635
Showing 42 changed files with 4,674 additions and 460 deletions.
67 changes: 67 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Changelog

## [v1.6] - 2022-05-20

### Added

- Added a new workflow for simulating a large dataset of relatives (~500k samples), available via the `simbig` command of the pipeline launcher.
- Added support for multiple cores in the preprocessing (`preprocess`) workflow.
- Added IBD segment weighting; see the `compute-weight-mask` workflow and the `--weight-mask` parameter of the pipeline launcher.
- Added several options for finer control of sample filtering: `--missing-samples`, `--alt-hom-samples`, `--het-samples`.
- Added a random seed parameter for the Ped-sim simulation.
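The new filtering, batching, and seed options are plain argparse flags; a minimal standalone sketch of their defaults (flag names and default values mirror the `launcher.py` diff in this commit, while the parser object itself is illustrative):

```python
import argparse
from random import randint

# Sketch of the new v1.6 launcher flags; defaults mirror launcher.py,
# but this parser is standalone and illustrative.
parser = argparse.ArgumentParser()
parser.add_argument('--num-batches', default=1, type=int,
                    help='Number of batches to split the input vcf.gz file into')
parser.add_argument('--missing-samples', default=15.0, type=float,
                    help='Upper bound of missing SNPs, percent')
parser.add_argument('--alt-hom-samples', default=1.0, type=float,
                    help='Lower bound of homozygous alternative SNPs, percent')
parser.add_argument('--het-samples', default=5.0, type=float,
                    help='Lower bound of heterozygous SNPs, percent')
parser.add_argument('--seed', default=randint(0, 10**7), type=int,
                    help='Random seed for Ped-sim pedigree simulation')

args = parser.parse_args([])  # no CLI input: defaults only
```

A run such as `launcher.py simulate --seed 42 --missing-samples 10` would then pin the Ped-sim seed and tighten the missingness filter.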

### Changed

- GRAPE flows were renamed in the pipeline launcher: `ibis_king` -> `ibis-king`, `germline` -> `germline-king`.
- Updated `readme.md` and the GRAPE workflow scheme.
- Removed Singularity support in favour of conda environments.
- Code refactoring and clean-up.

### Fixed

- Fixed the `germline-king` simulation flow.
- Fixed the `java command not found` error during the `reference` workflow run.

## [v1.5.2] - 2021-12-24

### Added

- With `--flow ibis_king`, GRAPE now calculates IBD1 and IBD2 shares from KING data for relationship degrees 0-3.

### Fixed

- Fixed a bug in parsing ERSA output for large datasets.
- Fixed a bug that set every value in some rows of `relatives.tsv` to 2.
- Fixed the `total_seg_len` and `total_seg_len_ibd2` calculation; `total_seg_len` now counts IBD1 segments only.
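A minimal sketch of the corrected semantics, assuming segments are given as `(ibd_type, length)` pairs (this representation is illustrative, not GRAPE's internal one):

```python
def total_segment_lengths(segments):
    """Sum IBD segment lengths per type.

    segments: iterable of (ibd_type, length_cm) pairs, ibd_type is 1 or 2.
    Returns (total_seg_len, total_seg_len_ibd2): IBD1-only and IBD2-only sums.
    """
    total_seg_len = sum(length for ibd_type, length in segments if ibd_type == 1)
    total_seg_len_ibd2 = sum(length for ibd_type, length in segments if ibd_type == 2)
    return total_seg_len, total_seg_len_ibd2
```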

## [v1.5.1] - 2021-12-13

### Fixed

- Bundle downloading hotfix.
- File verification hotfix for reference workflow.

## [v1.5] - 2021-12-01

### Changed

- Removed singularity from all workflows.
- Many intermediate files are now temporary. This significantly reduces working folder size.

### Fixed

- Fixed removal of duplicate SNPs.
- The ERSA-only workflow now correctly detects duplicates and monozygotic twins.

## Dockstore Release - 2021-09-28

### Added

- Dockstore support.
- Downloading of small and full reference bundles from Azure.

### Changed

- ERSA can now handle 100k samples.
- Preprocessing now preserves phasing information from the VCF input.
- The MAF filter is now consistent across different inputs.
1 change: 1 addition & 0 deletions config.yaml
@@ -5,6 +5,7 @@ ibis_min_snp: 500
zero_seg_count: 0.5
zero_seg_len: 5.0
alpha: 0.01
num_batches: 1
samples_file: samples.tsv
vcf_file: vcf/merged.vcf.gz
use_simulated_ibd: False
6 changes: 6 additions & 0 deletions envs/java.yaml
@@ -0,0 +1,6 @@
name: java
channels:
- conda-forge
dependencies:
- openjdk==11.0.9.1

5 changes: 5 additions & 0 deletions envs/remove_map.yaml
@@ -0,0 +1,5 @@
name: remove_map
channels:
- conda-forge
dependencies:
- pandas==1.1.1
6 changes: 6 additions & 0 deletions envs/remove_relatives.yaml
@@ -0,0 +1,6 @@
name: remove_relatives
channels:
- conda-forge
dependencies:
- networkx==2.5
- pandas==1.1.1
4 changes: 2 additions & 2 deletions envs/snakemake.yaml
@@ -6,12 +6,12 @@ channels:
- defaults
dependencies:
- python>=3.5
- snakemake==6.10.0-0
- snakemake==7.3.8
- matplotlib==3.3.1
- pandas==1.1.1
- numpy==1.19.1
- seaborn==0.10.1
- biom-format==2.1.8
- scikit-bio==0.5.6
- docutils==0.16
- mmh3==3.0.0
- mmh3==3.0.0
File renamed without changes
File renamed without changes
File renamed without changes
93 changes: 86 additions & 7 deletions launcher.py
@@ -4,16 +4,20 @@
import argparse
import shutil
import os
from random import randint


from inspect import getsourcefile
from weight.ibd_segments_weigher import IBDSegmentsWeigher



# Returns an integer value for total available memory, in GB.
def total_memory_gb():
n_bytes = psutil.virtual_memory().total
return int(n_bytes / (1024 * 1024 * 1024))
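The byte-to-GB conversion used above can be factored out and checked in isolation; a sketch (the helper name is illustrative, not part of the launcher):

```python
def bytes_to_gb(n_bytes: int) -> int:
    # Truncating conversion, same arithmetic as total_memory_gb above.
    return int(n_bytes / (1024 * 1024 * 1024))
```

In the launcher this would be applied to `psutil.virtual_memory().total`.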


def get_parser_args():
parser = argparse.ArgumentParser()

@@ -44,6 +48,12 @@ def get_parser_args():
type=int,
help="Number of CPU cores to use in this pipeline run (default %(default)s)")

parser.add_argument(
"--num-batches",
default=1,
type=int,
help="Number of batches to split the input vcf.gz file into (default is 1). Splitting can speed up the preprocessing stage for large datasets.")

parser.add_argument(
"--memory",
default=max(1, total_memory_gb() - 1),
@@ -220,15 +230,55 @@ def get_parser_args():
help='Download all references as single file'
)

parser.add_argument(
'--chip',
default='background.vcf.gz',
help='Path to chip file'
)

parser.add_argument(
'--weight-mask',
help='Mask of weights used to re-weight IBD segment lengths while using the ERSA algorithm '
'for `ibis` and `ibis-king` flows'
)

parser.add_argument(
'--missing-samples',
default=15.0,
type=float,
help='Upper bound of missing SNPs (%). Samples with higher values are removed from the relatedness detection analysis.')

parser.add_argument(
'--alt-hom-samples',
default=1.0,
type=float,
help='Lower bound of homozygous alternative SNPs (%). Samples with lower values are removed from the relatedness detection analysis.')

parser.add_argument(
'--het-samples',
default=5.0,
type=float,
help='Lower bound of heterozygous SNPs (%). Samples with lower values are removed from the relatedness detection analysis.')

parser.add_argument(
'--seed',
default=randint(0, 10**7),
type=int,
help='Random seed for Ped-sim pedigree simulation. The default value is randomly generated.')

args = parser.parse_args()

valid_commands = ['preprocess', 'find', 'simulate', 'reference', 'bundle', 'compute-weight-mask']
valid_commands = [
'preprocess',
'find',
'simulate',
'reference',
'bundle',
'compute-weight-mask',
'simbig',
'remove_relatives'
]

if args.command not in valid_commands:
raise RuntimeError(f'command {args.command} not in list of valid commands: {valid_commands}')

@@ -238,19 +288,23 @@ def get_parser_args():
if args.command != 'reference' and args.use_bundle:
raise ValueError('--bundle option only available for reference downloading')

if args.num_batches > args.cores:
raise ValueError('Number of batches is bigger than the number of cores; set --num-batches to a value lower than or equal to --cores')

if any((i < 0 or i > 100) for i in (args.het_samples, args.missing_samples, args.alt_hom_samples)):
raise ValueError('Percentage cannot be higher than 100 or lower than 0')

return args
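The new sanity checks above can be exercised outside the parser; a sketch of the same rules as a standalone function (the function name and signature are illustrative, not part of the launcher):

```python
def validate_args(num_batches: int, cores: int, het_samples: float,
                  missing_samples: float, alt_hom_samples: float) -> None:
    """Re-implements the launcher's batching and percentage-bound checks."""
    if num_batches > cores:
        raise ValueError('Number of batches is bigger than the number of cores; '
                         'set --num-batches lower than or equal to --cores')
    if any(p < 0 or p > 100 for p in (het_samples, missing_samples, alt_hom_samples)):
        raise ValueError('Percentage cannot be higher than 100 or lower than 0')
```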


def copy_file(working_dir, file_path):

samples_name = os.path.split(file_path)[-1]
samples_path = os.path.join(working_dir, samples_name)
if not os.path.exists(samples_path):
shutil.copy(file_path, os.path.join(working_dir, samples_name))
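`copy_file` copies only when the destination is absent, so repeated launcher runs never overwrite a file already in the working directory; a self-contained sketch of the same idempotent behaviour (the helper name is illustrative):

```python
import os
import shutil

def copy_if_absent(working_dir: str, file_path: str) -> str:
    """Copy file_path into working_dir unless a same-named file already exists."""
    name = os.path.split(file_path)[-1]
    dest = os.path.join(working_dir, name)
    if not os.path.exists(dest):
        shutil.copy(file_path, dest)
    return dest
```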


def copy_input(input_dir, working_dir, samples_file):

input_name = os.path.split(input_dir)[-1]
dest_path = os.path.join(working_dir, input_name)
if not os.path.exists(dest_path):
@@ -291,10 +345,23 @@ def copy_input(input_dir, working_dir, samples_file):
os.path.join(args.directory, 'config.yaml')
)

if args.command == 'simbig':
copy_input(
os.path.join(current_path, 'workflows/simbig/params'),
args.directory, os.path.join(current_path, 'workflows/simbig/', args.sim_samples_file)
)
# for some reason launching with docker from command line
# sets root directory for 'configfile' directive in bundle.Snakefile as snakemake.workdir
# therefore config.yaml must be in snakemake.workdir
shutil.copy(
os.path.join(current_path, 'workflows/simbig/config.yaml'),
os.path.join(args.directory, 'config.yaml')
)

if args.command == 'preprocess':
shutil.copy(args.vcf_file, os.path.join(args.directory, 'input.vcf.gz'))

if args.command in ['preprocess', 'find', 'reference', 'bundle', 'compute-weight-mask']:
if args.command in ['preprocess', 'find', 'reference', 'bundle', 'remove_relatives', 'compute-weight-mask']:
if args.directory != '.':
shutil.copy(os.path.join(current_path, 'config.yaml'), os.path.join(args.directory, 'config.yaml'))

Expand All @@ -304,6 +371,8 @@ def copy_input(input_dir, working_dir, samples_file):
'simulate': 'workflows/pedsim/Snakefile',
'reference': 'workflows/reference/Snakefile',
'bundle': 'workflows/bundle/Snakefile',
'simbig': 'workflows/simbig/Snakefile',
'remove_relatives': 'workflows/remove_relatives/Snakefile',
'compute-weight-mask': 'workflows/weight/Snakefile'
}

@@ -332,12 +401,15 @@ def copy_input(input_dir, working_dir, samples_file):
config_dict['sim_samples_file'] = args.sim_samples_file
config_dict['assembly'] = args.assembly
config_dict['mem_gb'] = args.memory
config_dict['num_batches'] = args.num_batches
if args.ref_directory != '':
config_dict['ref_dir'] = args.ref_directory
if args.chip:
config_dict['chip'] = args.chip
if args.flow not in ['ibis', 'ibis-king', 'germline-king']:
raise ValueError(f'--flow must be one of ["ibis", "ibis-king", "germline-king"], not {args.flow}')
config_dict['flow'] = args.flow
if args.command in ['preprocess', 'simulate', 'reference', 'bundle']:
if args.command in ['preprocess', 'simulate', 'reference', 'bundle', 'simbig', 'remove_relatives']:
config_dict['remove_imputation'] = args.remove_imputation
config_dict['impute'] = args.impute
config_dict['phase'] = args.phase
@@ -348,6 +420,12 @@ def copy_input(input_dir, working_dir, samples_file):
config_dict['ibis_seg_len'] = args.ibis_seg_len
config_dict['ibis_min_snp'] = args.ibis_min_snp

config_dict['missing_samples'] = args.missing_samples
config_dict['alt_hom_samples'] = args.alt_hom_samples
config_dict['het_samples'] = args.het_samples

config_dict['seed'] = args.seed

if args.weight_mask:
config_dict['weight_mask'] = os.path.join(args.directory, args.weight_mask)
config_dict['ersa_r'] = IBDSegmentsWeigher.from_json_mask_file(config_dict['weight_mask']) \
@@ -357,7 +435,7 @@ def copy_input(input_dir, working_dir, samples_file):
snakefile=snakefile,
configfiles=[args.configfile or 'config.yaml'],
config=config_dict,
workdir=args.directory,
workdir=args.directory if args.command != 'reference' else args.ref_directory,
cores=args.cores,
unlock=args.unlock,
printshellcmds=True,
@@ -368,7 +446,8 @@ def copy_input(input_dir, working_dir, samples_file):
until=[args.until] if args.until is not None else [],
use_conda=True,
conda_prefix=args.conda_prefix,
envvars=['CONDA_ENVS_PATH', 'CONDA_PKGS_DIRS']
envvars=['CONDA_ENVS_PATH', 'CONDA_PKGS_DIRS'],
keepgoing=True
):
raise ValueError("Pipeline failed; see the Snakemake error message for details")
