Merge branch 'master' of github.com:h3abionet/h3agwas

h3abionet · May 10, 2022 · 35785d0 · 35785d0
2 parents cee4161 + 9fa4786
commit 35785d0
Show file tree

Hide file tree

Showing 28 changed files with 642 additions and 193 deletions.
diff --git a/CITATION.cff b/CITATION.cff
@@ -4,19 +4,26 @@ authors:
 - family-names: "Brandenburg"
   given-names: "Jean-Tristan"
   orcid: https://orcid.org/0000-0003-0197-2648
+- family-names: "Clark"
+  given-names: "Lindsay"
+  orcid: https://orcid.org 0000-0002-3881-9252
+- family-names: "Botha"
+  given-names: "Gerrit"
+  orcid: 
+- family-names: "Panji"
+  given-names: "Sumir"
+  orcid:  https://orcid.org/0000-0003-0447-0598
+- family-names: "Baichoo"
+  given-names: "Shakuntala"
+  orcid:  https://orcid.org/0000-0002-9335-1939
+- family-names: "Fields"
+  given-names: "Christopher J."
+  orcid:  https://orcid.org/ 0000-0002-0581-149X
 - family-names: "Hazelhurst"
   given-names: Scott
   orcid: https://orcid.org/0000-0002-0581-149X
-- family-names: Magosi
-  given-names: Lerato
-- family-names: Clark
-  given-names: Lindsay
-- family-names: "De Beste"
-  given-names: Eugene
-- family-names: Clucas
-  given-names: Rob
-title: "H3AGWAS: Pan-African Bioinformatics Network for H3Africa Genome Wide Association Study Workflow"
-doi: doi.org/10.25375/uct.14405990.v2
+title: "H3AGWAS : A portable workflow for Genome Wide Association Studies"
+doi: https://doi.org/10.1101/2022.05.02.490206 
 version: 3.0
 date-released: 2019-12-18
 url: "https://github.com/h3abionet/h3agwas"
diff --git a/News.md b/News.md
@@ -2,6 +2,7 @@
 
 
 ## What's new :
+* 2022-05-04 : preprint manuscript can be found on [biorxiv](https://www.biorxiv.org/content/10.1101/2022.05.02.490206v1)
 * 2021-11-04 : association script include saige software
 * 2021-10-25 : new script to do a conditional analysis using gemma see [finemapping](finemapping/README.md)
 * 2021-09-15 :  add for script in finemapping `finemap_multi.nf`, extract positions using plink, do define independant position in function of minimum p-value and for each independant position apply finemapping [finemapping](finemapping/README.md)

diff --git a/README.md b/README.md
@@ -17,6 +17,8 @@ The major change from Version 2 to Version 3 is the reorganisation of the repo s
 This means that instead of running `nextflow run h3abionet/h3agwas/assoc.nf`, you should run `nextflow run h3abionet/h3agwas/assoc/main.nf`
 
 
+***Preprint manuscript can be found on [biorxiv](https://www.biorxiv.org/content/10.1101/2022.05.02.490206v1)***
+
 ## Brief introduction
 
 In addition to this README we have a detailed tutorial and videos 
@@ -837,6 +839,11 @@ the typical reason is that you have not allocated enough memory for the process
 
 If you use this workflow, please cite the following paper
 
+* Studies Jean-Tristan Brandenburg, Lindsay Clark, Gerrit Botha, Sumir Panji, Shakuntala Baichoo, Christopher J. Fields, Scott Hazelhurst. (2022). H3AGWAS : A portable workflow for Genome Wide Association Studies bioRxiv 2022.05.02.490206; doi: https://doi.org/10.1101/2022.05.02.490206 
+
+and 
+
+
 * Baichoo S, Souilmi Y, Panji S, Botha G, Meintjes A, Hazelhurst S, Bendou H, De Beste E, Mpangase P, Souiai O, Alghali M, Yi L, O'Connor B, Crusoe M, Armstrong D, Aron S, Joubert D, Ahmed A, Mbiyavanga M, Van Heusden P, Magosi, L, Zermeno, J, Mainzer L, Fadlelmola F, Jongeneel CV, and Mulder N. (2018) Developing reproducible bioinformatics analysis workflows for heterogenous computing environments to support African genomics, *BMC Bioinformatics* **19**, 457, 13 pages, doi:10.1186/s12859-018-2446-1.
 
 

diff --git a/Troubleshooting.md b/Troubleshooting.md
@@ -0,0 +1,7 @@
+<img src="auxfiles/H3ABioNetlogo2.jpg"/>
+
+
+# Trouble shooting and possible problems :
+ * clump function, if rs id between plink file and summary statistics,result  not completed
+  * ssee replication and finemapping pipeline
+
diff --git a/assoc/bin/format_saige_pheno.r b/assoc/bin/format_saige_pheno.r
@@ -0,0 +1,36 @@
+#!/usr/bin/env Rscript
+library("optparse")
+
+option_list = list(
+  make_option(c("--data"), type="character", default=NULL,
+              help="dataset file name", metavar="character"),
+  make_option(c( "--ind_vcf"), type="character", default=NULL,
+              help="dataset file name", metavar="character"),
+  make_option(c("--out"), type="character",
+              help="list of genes ", metavar="character")
+);
+
+opt_parser = OptionParser(option_list=option_list);
+opt = parse_args(opt_parser);
+
+Data<-read.table(opt[['data']], header=T)
+ndata<-names(Data)
+listind<-read.table(opt[['ind_vcf']])
+Data$FIDIID<-paste(Data[,1], Data[,2],sep='_')
+
+if(any(listind$V1 %in% listind[,1]) & any(listind$V2 %in% listind[,1])){
+write.table(Data[,ndata], file=opt[['out']], row.names=F, col.names=T, sep='\t', quote=F)
+write.table(Data[,c(1,2,1,2)], file=paste(opt[['out']],'_updateid',sep=''),  col.names=F, sep='\t', quote=F)
+}else if(any(listind$V1 %in% Data$FIDIID)){
+write.table(Data[,c('FID','IID','FIDIID','FIDIID')], file=paste(opt[['out']],'_updateid',sep=''),  col.names=F, sep='\t', quote=F, row.names=F)
+Data$FID<-Data$FIDIID
+Data$IID<-Data$FIDIID
+write.table(Data[,ndata], file=opt[['out']], row.names=F, col.names=T, sep='\t', quote=F)
+}else{
+cat(Data$FID)
+cat('\n')
+cat(Data$IID)
+cat('no same individual between vcf and pheno file\nexit\n')
+q('n', 2)
+}
+
diff --git a/assoc/main.nf b/assoc/main.nf
@@ -38,7 +38,7 @@ allowed_params+=ParamFast
 /*Gxe : */
 GxE_params=['gemma_gxe', "plink_gxe", "gxe"]
 allowed_params+=GxE_params
-Saige_params=['pheno_bin', "list_vcf", "saige_num_cores", "saige_mem_req"]
+Saige_params=['pheno_bin', "list_vcf", "saige_num_cores", "saige_mem_req", "vcf_field"]
 allowed_params+=Saige_params
 
 FastGWA_params=["fastgwa_mem_req", "fastgwa_num_cores", 'fastgwa', "gcta64_bin"]
@@ -402,9 +402,9 @@ else {
 
 
 
-
-num_assoc_cores = params.mperm == 0 ? 1 : Math.min(10,params.max_plink_cores)
-
+//scott, why are you limited at one core when you don't have permutation?
+//num_assoc_cores = params.mperm == 0 ? 1 : Math.min(10,params.max_plink_cores)
+num_assoc_cores = params.max_plink_cores
 supported_tests = ["assoc","fisher","model","cmh","linear","logistic"]
 
 requested_tests = supported_tests.findAll { entry -> params.get(entry) }
@@ -416,7 +416,7 @@ pheno     = ""
 
 /*Case where we sample builrelatdness*/
 balise_filers_rel=1
-if(params.boltlmm+params.gemma+params.fastlmm+params.fastgwa+params.saige+params.gemma_gxe>0){
+if(params.boltlmm+params.gemma+params.fastlmm+params.fastgwa+params.saige+params.gemma_gxe+params.saige>0){
  if(params.file_rs_buildrelat=="" && params.sample_snps_rel==1){
   process select_rs_format{
      cpus max_plink_cores
@@ -441,6 +441,7 @@ if(params.boltlmm+params.gemma+params.fastlmm+params.fastgwa+params.saige+params
    filers_matrel_mat_fast=Channel.fromPath("${dummy_dir}/0",checkIfExists:true) 
    filers_matrel_mat_GWA=Channel.fromPath("${dummy_dir}/0") 
    filers_matrel_mat_gem=channel.fromPath("${dummy_dir}/0") 
+   filers_her_saige=channel.fromPath("${dummy_dir}/00")
    if(params.boltlmm==1){
       //BoltNbMaxSnps=1000000
       process buildBoltFileSnpRel{
@@ -1565,7 +1566,8 @@ if(params.saige==1){
     println "No list_vcf given for saige -- set params.list_vcf";
     System.exit(-2);
   }
-  process subplink_heritability_saige{
+  if(balise_filers_rel==1){
+    process subplink_heritability_saige{
     input :
        file(rs) from filers_her_saige
        file(plk) from ch_saige_heritability
@@ -1577,16 +1579,46 @@ if(params.saige==1){
        """
        plink -bfile $bfile --extract $rs --make-bed -out $out --keep-allele-order
        """
+    }
+   }else{
+   ch_plk_saige=Channel.create()
+    Channel
+    .from(file(bed),file(bim),file(fam))
+    .buffer(size:3)
+    .map { a -> [checker(a[0]), checker(a[1]), checker(a[2])] }
+    .set { ch_plk_saige}
+
   }
+
+
   fam_ch_saige = Channel.create()
   plk_ch_saige_her = Channel.create()
   ch_plk_saige.separate (plk_ch_saige_her, fam_ch_saige) { a -> [ a, a[2]] }
 
   data_ch_saige = Channel.fromPath(params.data,checkIfExists:true)
+  firstvcf_ch=Channel.fromPath(file(params.list_vcf).readLines()[0], checkIfExists:true)
+  process checkidd_saige{
+    label 'R'
+    input :
+      path(covariates) from data_ch_saige
+      path(vcf) from  firstvcf_ch
+      tuple path(bed), path(bim), path(fam) from plk_ch_saige_her
+    output :
+      tuple path("${bfileupdate}.bed"), path("${bfileupdate}.bim"), path("${bfileupdate}.fam") into plk_ch_saige_her_idupd
+      tuple path(covariates_form),path("${bfileupdate}.fam") into data_ch_saige_form
+    script :
+      covariates_form=covariates.baseName+'_saige.ind'
+      bfile=bed.baseName
+      bfileupdate=bfile+'_idsaige'
+      """
+      zcat $vcf |head -1000 |grep "#"| awk '{for(Cmt=10;Cmt<=NF;Cmt++)print \$Cmt}' > fileind
+      format_saige_pheno.r --data $covariates --ind_vcf fileind --out ${covariates_form}
+      plink -bfile  $bfile --keep-allele-order -out $bfileupdate --make-bed --update-ids  ${covariates_form}"_updateid"
+      """
+  }
   process  getSaigePheno {
     input:
-      file(covariates) from data_ch_saige
-      file(fam) from fam_ch_saige
+      tuple path(covariates), path(fam) from data_ch_saige_form
     output:
       file(phef) into saige_data_format_ch
       stdout into pheno_cols_ch_saige
@@ -1609,8 +1641,8 @@ if(params.saige==1){
    cpus params.saige_num_cores
    label 'saige'
    input :
-      set file(bed), file(bim), file(fam) from plk_ch_saige_her
-      file(pheno) from saige_data_format_ch
+      tuple path(bed), path(bim), path(fam) from plk_ch_saige_her_idupd
+      path(pheno) from saige_data_format_ch
    publishDir "${params.output_dir}/saige/varexp/", overwrite:true, mode:'copy'
    each this_pheno from pheno_ch_saige
    output :
@@ -1634,7 +1666,7 @@ if(params.saige==1){
      """
 
   }
-  listvcf_ch=Channel.fromPath(file(params.list_vcf).readLines())
+  listvcf_ch=Channel.fromPath(file(params.list_vcf).readLines(), checkIfExists:true)
   process buildindex {
     label 'saige'
     input :

diff --git a/assoc/nextflow.config b/assoc/nextflow.config
@@ -98,7 +98,7 @@ params {
     AMI                  = "ami-710b9108"
     instanceType         = "m4.xlarge"
     bootStorageSize      = "20GB"
-    maxInstances         = "1"
+    maxInstances         = "5"
 
     plink_mem_req        = "750MB"
     other_mem_req        = "750MB"

diff --git a/finemapping/bin/change_genes_gencode.py b/finemapping/bin/change_genes_gencode.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+import sys
+import re
+args = sys.argv
+print(args)
+if len(sys.argv)==0 :
+  filegenes="/spaces/jeantristan/GWAS/Ressource/gencode.v19.annotation.gtf"
+else :
+ filegenes=args[1]
+
+readgene=open(sys.argv[1])
+writegene=open("gencode.v19.genes", 'w')
+writegene.write("\t".join(["CHR", "Type","BEGIN", "END","GENE"])+'\n')
+patrn='gene_name'
+for line in readgene :
+ if line[0] != "#" :
+   splline=line.split('\t')  
+   #['chr1', 'ENSEMBL', 'CDS', '6186632', '6186806', '.', '-', '0', 'gene_id "ENSG00000116254.13"; transcript_id "ENST00000378021.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "CHD5"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "CHD5-201"; exon_number 26; exon_id "ENSE00003551672.1"; level 3; protein_id "ENSP00000367260.1"; tag "basic"; havana_gene "OTTHUMG00000000952.6";\n']^C
+   if splline[2]=='CDS' or splline[2]=='gene' :
+      tmp=splline[8].split(';')
+      resgene=[x for x in tmp if patrn in x ]
+      if len(resgene)> 0:
+        resgene=resgene[0].replace(patrn, '').replace(' ','').replace('"',"")
+        chro=resgene[0].replace('chr','')
+        writegene.write("\t".join([chro, splline[2], splline[3], splline[4], resgene])+'\n')
+
+writegene.close()
+readgene.close()
+