From 778913c06f865a96da758643d53bf3efda2a5673 Mon Sep 17 00:00:00 2001 From: kopardev Date: Mon, 14 Aug 2023 17:30:51 -0400 Subject: [PATCH] feature: circRNA id is independent of strand, counts for both + and - strand are reported together, strand from all callers are reported, both + and - flanking sites are reported, rev-comp function updated --- .../scripts/_bam_filter_BSJ_for_HQonly.py | 5 +- .../_create_circExplorer_linear_bam.sh | 91 ++++---- ...filter_linear_spliced_readids_w_rid2jid.py | 38 ++-- workflow/scripts/_make_master_counts_table.py | 104 +--------- .../_merge_circExplorer_found_counts.py | 16 +- .../scripts/_merge_per_sample_counts_table.py | 196 ++++++++++-------- ...te_circExplorer_per_sample_counts_table.py | 17 +- 7 files changed, 200 insertions(+), 267 deletions(-) diff --git a/workflow/scripts/_bam_filter_BSJ_for_HQonly.py b/workflow/scripts/_bam_filter_BSJ_for_HQonly.py index 2cb6f19..8315b0b 100644 --- a/workflow/scripts/_bam_filter_BSJ_for_HQonly.py +++ b/workflow/scripts/_bam_filter_BSJ_for_HQonly.py @@ -74,7 +74,7 @@ def main(): RGlist = dict() for index,row in indf.iterrows(): - jid = row['chrom']+"##"+str(row['start'])+"##"+str(row['end'])+"##"+row['strand'] + jid = row['chrom']+"##"+str(row['start'])+"##"+str(row['end']) RGlist[jid]=1 print("Number of RGs: ",len(RGlist)) @@ -114,6 +114,9 @@ def main(): for read in samfile.fetch(): rg = read.get_tag("RG") + rg = rg.split("##") + rg = rg[:len(rg)-1] + rg = "##".join(rg) if rg in RGlist: regionname=_get_regionname_from_seqname(regions,read.reference_name) if regionname in hosts: diff --git a/workflow/scripts/_create_circExplorer_linear_bam.sh b/workflow/scripts/_create_circExplorer_linear_bam.sh index 00ec996..1435396 100644 --- a/workflow/scripts/_create_circExplorer_linear_bam.sh +++ b/workflow/scripts/_create_circExplorer_linear_bam.sh @@ -1,15 +1,18 @@ #!/usr/bin/env bash -# module load parallel -# module load python/3.7 -# module load bedtools -# module load ucsc -# module load samtools + module load parallel + module load python/3.7 + module load bedtools + module load ucsc + module load samtools set -e -x -o pipefail SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -# SCRIPT_DIR="/vf/users/Ziegelbauer_lab/Pipelines/circRNA/activeDev/workflow/scripts" +# only for debugging +SCRIPT_DIR2="/vf/users/Ziegelbauer_lab/Pipelines/circRNA/activeDev/workflow/scripts" +# add the following line when not debugging +SCRIPT_DIR2="$SCRIPT_DIR" SCRIPT_NAME=$( basename -- "${BASH_SOURCE[0]}" ) ARGPARSE_DESCRIPTION="create linear and splice BAMs (and BIGWIGs)" @@ -60,29 +63,6 @@ start0=$(date +%s.%N) # bedSort from ucsc tools # ucsc -#if [ "$#" != 18 ];then -# echo "$#" -# echo "18 arguments expected!" -# echo "#1 --> path to non-chimeric BAM file" -# echo "#2 --> sample name" -# echo "#3 --> PE or SE" -# echo "#4 --> known BSJs in bed.gz format" -# echo "#5 --> tmpdir" -# echo "#6 --> gzipped outputfilename eg.${sample_name}.rid2jid.tsv.gz" -# echo "#7 --> output filtered sample BAM" -# echo "#8 --> gzip-ed list of linear BSJ readids" -# echo "#9 --> gzip-ed list of linear spliced BSJ readids" -# echo "#10 --> jid counts (linear and linear-spliced) per jid or BSJ" -# echo "#11 --> linear BSJ readids in BAM" -# echo "#12 --> linear-spliced BSJ readids in BAM" -# echo "#13 --> .regions file eg. ref/ref.fa.regions" -# echo "#14 --> host list comma separated .. no spaces" -# echo "#15 --> additives list comma separated .. no spaces" -# echo "#16 --> viruses list comma separated .. no spaces" -# echo "#17 --> all linear readids in BAM" -# echo "#18 --> all spliced readids in BAM" -# exit -#fi set -exo pipefail @@ -139,27 +119,26 @@ start=$(date +%s.%N) if [ "$peorse" == "PE" ];then -python3 ${SCRIPT_DIR}/filter_bam.py \ +python3 ${SCRIPT_DIR2}/filter_bam.py \ --inbam $non_chimeric_star2p_bam \ --outbam $filtered_bam \ --pe else -python3 ${SCRIPT_DIR}/filter_bam.py \ +python3 ${SCRIPT_DIR2}/filter_bam.py \ --inbam $non_chimeric_star2p_bam \ --outbam $filtered_bam \ fi -nreads=$(samtools view /gpfs/gsfs8/users/CBLCCBR/kopardevn_tmp/issue57_testing_2/results/GI1_T/circExplorer/GI1_T.bam|wc -l) +nreads=$(samtools view $filtered_bam|wc -l) if [ "$nreads" == "0" ];then echo "SOMETHING WENT WRONG ...filtered BAM is empty!!" exit fi samtools index -@4 $filtered_bam - ################################################################################################### # convert BAM to read ends BED format ################################################################################################### @@ -169,7 +148,7 @@ start=$(date +%s.%N) bedtools bamtobed -split -i $filtered_bam > ${tmpdir}/${sample_name}.bed -python ${SCRIPT_DIR}/_process_bamtobed.py \ +python ${SCRIPT_DIR2}/_process_bamtobed.py \ --inbed ${tmpdir}/${sample_name}.bed \ --outbed ${tmpdir}/${sample_name}.readends.bed \ --linear ${tmpdir}/${sample_name}.linear.readids.gz \ @@ -188,7 +167,6 @@ zcat $bsjbedgz |awk -F"\t" -v OFS="\t" '{print $1, $2, $2, $1"##"$2"##"$3"##"$6, zcat $bsjbedgz |awk -F"\t" -v OFS="\t" '{print $1, $3, $3, $1"##"$2"##"$3"##"$6, $5, $6}' - >> ${tmpdir}/BSJ.ends.bed bedSort ${tmpdir}/BSJ.ends.bed ${tmpdir}/BSJ.ends.bed - ################################################################################################### # finding max readlength ################################################################################################### @@ -196,11 +174,12 @@ bedSort ${tmpdir}/BSJ.ends.bed ${tmpdir}/BSJ.ends.bed printtime $SCRIPT_NAME $start0 $start "finding max readlength" start=$(date +%s.%N) -python3 ${SCRIPT_DIR}/bam_get_max_readlen.py -i $filtered_bam > ${tmpdir}/${sample_name}.maxrl +python3 ${SCRIPT_DIR2}/bam_get_max_readlen.py -i $filtered_bam > ${tmpdir}/${sample_name}.maxrl maxrl=$(cat ${tmpdir}/${sample_name}.maxrl) two_maxrl=$((maxrl*2)) +echo $two_maxrl - +#two_maxrl="302" ################################################################################################### # find closest known BSJs with primary alignments and then keep alignments with some alignment within the "inclusion zone" # this is run in parallel @@ -217,7 +196,7 @@ bedSort $f $f && \ bedtools closest -nonamecheck \ -a $f \ -b ${tmpdir}/BSJ.ends.bed -d | \ - awk -F\"\\t\" -v OFS=\"\\t\" -v limit=$two_maxrl '{if (\$NF > 0 && \$NF <= limit) {print \$4,\$10}}' | \ + awk -F\"\\t\" -v OFS=\"\\t\" -v limit=$two_maxrl '{if (\$NF > 0 && \$NF <= limit) {print \$4,\$10,\$NF}}' | \ sort --buffer-size=2G --unique --parallel=2 > ${f}.rid2jid.closest.txt """ done > ${tmpdir}/para.tmp @@ -225,10 +204,12 @@ grep -v "^$" ${tmpdir}/para.tmp > ${tmpdir}/para parallel -j $threads < ${tmpdir}/para #cat ${tmpdir}/${sample_name}.readends.part.?????.rid2jid.closest.txt > ${rid2jidgzip%.*}.tmp -for f in $(ls ${tmpdir}/${sample_name}.readends.part.?????.rid2jid.closest.txt);do +for f in $(find $tmpdir -name "${sample_name}.readends.part.*.rid2jid.closest.txt");do +#for f in $(ls ${tmpdir}/${sample_name}.readends.part.?????.rid2jid.closest.txt);do cat $f done > ${rid2jidgzip%.*}.tmp -sort --buffer-size=20G --parallel=$threads --unique ${rid2jidgzip%.*}.tmp > ${rid2jidgzip%.*} +sort -k1,1 -k3,3n --buffer-size=20G --parallel=$threads --unique ${rid2jidgzip%.*}.tmp | \ + awk -F"\t" -v OFS="\t" '{a[$1]++;if (a[$1]==1) {print $1,$2}}' > ${rid2jidgzip%.*} pigz -p4 -f ${rid2jidgzip%.*} && rm -f ${rid2jidgzip%.*}.tmp @@ -239,7 +220,7 @@ pigz -p4 -f ${rid2jidgzip%.*} && rm -f ${rid2jidgzip%.*}.tmp printtime $SCRIPT_NAME $start0 $start "filter linear and spliced readids to those near BSJs only; creating counts table" start=$(date +%s.%N) -python3 ${SCRIPT_DIR}/_filter_linear_spliced_readids_w_rid2jid.py \ +python3 ${SCRIPT_DIR2}/_filter_linear_spliced_readids_w_rid2jid.py \ --linearin ${tmpdir}/${sample_name}.linear.readids.gz \ --splicedin ${tmpdir}/${sample_name}.spliced.readids.gz \ --rid2jid ${rid2jidgzip} \ @@ -247,7 +228,6 @@ python3 ${SCRIPT_DIR}/_filter_linear_spliced_readids_w_rid2jid.py \ --splicedout ${splicedrids} \ --jidcounts ${jidcounts} - # rm -rf ${tmpdir}/${sample_name}.readends.part.* # the above statement leads to /usr/bin/rm: Argument list too long # hence, @@ -267,10 +247,10 @@ splicedbam_bn=$(basename $splicedbam) linearbam_all_bn=$(basename $linearbam_all) splicedbam_all_bn=$(basename $splicedbam_all) -echo "python3 ${SCRIPT_DIR}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${linearbam_bn} --readids $linearrids" >> ${tmpdir}/para2 -echo "python3 ${SCRIPT_DIR}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${splicedbam_bn} --readids $splicedrids" >> ${tmpdir}/para2 -echo "python3 ${SCRIPT_DIR}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${linearbam_all_bn} --readids ${tmpdir}/${sample_name}.linear.readids.gz" >> ${tmpdir}/para2 -echo "python3 ${SCRIPT_DIR}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${splicedbam_all_bn} --readids ${tmpdir}/${sample_name}.spliced.readids.gz" >> ${tmpdir}/para2 +echo "python3 ${SCRIPT_DIR2}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${linearbam_bn} --readids $linearrids" >> ${tmpdir}/para2 +echo "python3 ${SCRIPT_DIR2}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${splicedbam_bn} --readids $splicedrids" >> ${tmpdir}/para2 +echo "python3 ${SCRIPT_DIR2}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${linearbam_all_bn} --readids ${tmpdir}/${sample_name}.linear.readids.gz" >> ${tmpdir}/para2 +echo "python3 ${SCRIPT_DIR2}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${splicedbam_all_bn} --readids ${tmpdir}/${sample_name}.spliced.readids.gz" >> ${tmpdir}/para2 parallel -j 4 < ${tmpdir}/para2 @@ -296,10 +276,10 @@ samtools sort -l 9 -T ${tmpdir}/sorttmp --write-index -@${threads} -O BAM -o ${s if [ -f ${tmpdir}/para3 ];then rm -f ${tmpdir}/para3;fi -echo "python3 ${SCRIPT_DIR}/bam_split_by_regions.py --inbam $linearbam --sample_name $sample_name --regions $regions --prefix linear_BSJ --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3 -echo "python3 ${SCRIPT_DIR}/bam_split_by_regions.py --inbam $splicedbam --sample_name $sample_name --regions $regions --prefix spliced_BSJ --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3 -echo "python3 ${SCRIPT_DIR}/bam_split_by_regions.py --inbam $linearbam_all --sample_name $sample_name --regions $regions --prefix linear --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3 -echo "python3 ${SCRIPT_DIR}/bam_split_by_regions.py --inbam $splicedbam_all --sample_name $sample_name --regions $regions --prefix spliced --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3 +echo "python3 ${SCRIPT_DIR2}/bam_split_by_regions.py --inbam $linearbam --sample_name $sample_name --regions $regions --prefix linear_BSJ --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3 +echo "python3 ${SCRIPT_DIR2}/bam_split_by_regions.py --inbam $splicedbam --sample_name $sample_name --regions $regions --prefix spliced_BSJ --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3 +echo "python3 ${SCRIPT_DIR2}/bam_split_by_regions.py --inbam $linearbam_all --sample_name $sample_name --regions $regions --prefix linear --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3 +echo "python3 ${SCRIPT_DIR2}/bam_split_by_regions.py --inbam $splicedbam_all --sample_name $sample_name --regions $regions --prefix spliced --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3 parallel -j 4 < ${tmpdir}/para3 @@ -317,16 +297,16 @@ for folder in $(ls ${tmpdir}/b2b*);do rm -rf $folder;done if [ -f ${tmpdir}/para3 ];then rm -f ${tmpdir}/para4;fi -echo "mkdir -p ${tmpdir}/b2b_1 && bash ${SCRIPT_DIR}/bam_to_bigwig.sh $linearbam ${tmpdir}/b2b_1" >> ${tmpdir}/para4 -echo "mkdir -p ${tmpdir}/b2b_2 && bash ${SCRIPT_DIR}/bam_to_bigwig.sh $splicedbam ${tmpdir}/b2b_2" >> ${tmpdir}/para4 -echo "mkdir -p ${tmpdir}/b2b_3 && bash ${SCRIPT_DIR}/bam_to_bigwig.sh $linearbam_all ${tmpdir}/b2b_3" >> ${tmpdir}/para4 -echo "mkdir -p ${tmpdir}/b2b_4 && bash ${SCRIPT_DIR}/bam_to_bigwig.sh $splicedbam_all ${tmpdir}/b2b_4" >> ${tmpdir}/para4 +echo "mkdir -p ${tmpdir}/b2b_1 && bash ${SCRIPT_DIR2}/bam_to_bigwig.sh $linearbam ${tmpdir}/b2b_1" >> ${tmpdir}/para4 +echo "mkdir -p ${tmpdir}/b2b_2 && bash ${SCRIPT_DIR2}/bam_to_bigwig.sh $splicedbam ${tmpdir}/b2b_2" >> ${tmpdir}/para4 +echo "mkdir -p ${tmpdir}/b2b_3 && bash ${SCRIPT_DIR2}/bam_to_bigwig.sh $linearbam_all ${tmpdir}/b2b_3" >> ${tmpdir}/para4 +echo "mkdir -p ${tmpdir}/b2b_4 && bash ${SCRIPT_DIR2}/bam_to_bigwig.sh $splicedbam_all ${tmpdir}/b2b_4" >> ${tmpdir}/para4 count=4 for prefix in "linear_BSJ" "spliced_BSJ" "linear" "spliced";do for b in $(echo "$host $viruses"|tr ',' ' ');do count=$((count+1)) bam="${outdir}/${sample_name}.${prefix}.${b}.bam" - echo "mkdir -p ${tmpdir}/b2b_${count} && bash ${SCRIPT_DIR}/bam_to_bigwig.sh $bam ${tmpdir}/b2b_${count}" >> ${tmpdir}/para4 + echo "mkdir -p ${tmpdir}/b2b_${count} && bash ${SCRIPT_DIR2}/bam_to_bigwig.sh $bam ${tmpdir}/b2b_${count}" >> ${tmpdir}/para4 done done @@ -334,3 +314,4 @@ parallel -j 12 < ${tmpdir}/para4 #rm -rf ${tmpdir}/* printtime $SCRIPT_NAME $start0 $start "Done!" + diff --git a/workflow/scripts/_filter_linear_spliced_readids_w_rid2jid.py b/workflow/scripts/_filter_linear_spliced_readids_w_rid2jid.py index bc60a00..7bbace0 100644 --- a/workflow/scripts/_filter_linear_spliced_readids_w_rid2jid.py +++ b/workflow/scripts/_filter_linear_spliced_readids_w_rid2jid.py @@ -50,34 +50,29 @@ def main(): jid=l[1] if jid==".": print(">>>>>>>>jid is dot:",l) - if not jid in scount: - scount[jid]=dict() - lcount[jid]=dict() - scount[jid]["SS"]=0 - scount[jid]["OS"]=0 - scount[jid]["Unknown"]=0 - lcount[jid]["SS"]=0 - lcount[jid]["OS"]=0 - lcount[jid]["Unknown"]=0 + # jchr,jstart,jend,jstrand=jid.split("##") + # jid2="##".join([jchr,jstart,jend]) + jid2=jid + if not jid2 in scount: + scount[jid2]=dict() + lcount[jid2]=dict() + scount[jid2]["+"]=0 + scount[jid2]["-"]=0 + scount[jid2]["."]=0 + lcount[jid2]["+"]=0 + lcount[jid2]["-"]=0 + lcount[jid2]["."]=0 if "##" in l[0]: rid,rstrand=l[0].split("##") else: rid=l[0] rstrand="." - jstrand=jid.split("##")[-1] - if rstrand=="+" or rstrand=="-": - if rstrand==jstrand: - strandinfo="OS" - else: - strandinfo="SS" - else: - strandinfo="Unknown" if rid in linridlist: linridlist[rid]+=1 - lcount[jid][strandinfo]+=1 + lcount[jid][rstrand]+=1 if rid in sinridlist: sinridlist[rid]+=1 - scount[jid][strandinfo]+=1 + scount[jid][rstrand]+=1 rid2jid.close() with gzip.open(args.linearout,'wt') as outrl: for k,v in linridlist.items(): @@ -90,13 +85,14 @@ def main(): outrl.write("%s\n"%k) outrl.close() countout=open(args.jidcounts,'w') - countout.write("#chrom\tstart\tend\tstrand\tlinear_same_strand\tspliced_same_strand\tlinear_opposite_strand\tspliced_opposite_strand\tlinear_unknown_strand\tspliced_unknown_strand\n") + # countout.write("#chrom\tstart\tend\tlinear_+\tspliced_+\tlinear_-\tspliced_-\tlinear_.\tspliced_.\n") + countout.write("#chrom\tstart\tend\tstrand\tlinear_+\tspliced_+\tlinear_-\tspliced_-\tlinear_.\tspliced_.\n") for k in lcount.keys(): v1=lcount[k] v2=scount[k] kstr=k.split("##") k="\t".join(kstr) - countout.write("%s\t%d\t%d\t%d\t%d\t%d\t%d\n"%(k,v1["SS"],v2["SS"],v1["OS"],v2["OS"],v1["Unknown"],v2["Unknown"])) + countout.write("%s\t%d\t%d\t%d\t%d\t%d\t%d\n"%(k,v1["+"],v2["+"],v1["-"],v2["-"],v1["."],v2["."])) countout.close() if __name__ == "__main__": diff --git a/workflow/scripts/_make_master_counts_table.py b/workflow/scripts/_make_master_counts_table.py index 19c00a8..1a81dbf 100644 --- a/workflow/scripts/_make_master_counts_table.py +++ b/workflow/scripts/_make_master_counts_table.py @@ -32,10 +32,10 @@ def main() : count += 1 if count==1: outdf = pd.read_csv(f,sep="\t",header=0,compression='gzip') - outdf.set_index(['chrom', 'start', 'end', 'strand', 'flanking_sites', 'sample_name']) + outdf.set_index(['chrom', 'start', 'end', 'sample_name']) else: tmpdf = pd.read_csv(f,sep="\t",header=0,compression='gzip') - tmpdf.set_index(['chrom', 'start', 'end', 'strand', 'flanking_sites', 'sample_name']) + tmpdf.set_index(['chrom', 'start', 'end', 'sample_name']) outdf = pd.concat([outdf , tmpdf],axis=0,join="outer",sort=False) outdf.reset_index(drop=True,inplace=True) outdf.fillna(-1,inplace=True) @@ -49,104 +49,8 @@ def main() : # print(strcols) outdf = _df_setcol_as_int(outdf,intcols) outdf = _df_setcol_as_str(outdf,strcols) - outdf = outdf.sort_values(by=['chrom','start','end','strand']) + outdf = outdf.sort_values(by=['chrom','start','end', 'sample_name']) - # Applying circExplorer_BWA fixes - indf=outdf - non_circExplorer_BWA_df=indf.loc[(indf['strand']!=".")] - ambiguous_strand_df=indf.loc[(indf['strand']==".") & (indf['circExplorer_bwa_read_count']>=args.minreads)] - # print(ambiguous_strand_df.head()) - - countlist = set(['circExplorer_read_count','ciri_read_count','findcirc_read_count','dcc_read_count','circrnafinder_read_count','mapsplice_read_count','nclscan_read_count']) - colnames = set(indf.columns) - excludelist = countlist - colnames - newcountlist = countlist - excludelist - # print("-2") - # non_circExplorer_BWA_df = non_circExplorer_BWA_df.reset_index() - lookup = dict() - for index,row in non_circExplorer_BWA_df.iterrows(): - circID = row['chrom'] + "##" + str(row['start']) + "##" + str(row['end']) - if not circID in lookup: - lookup[circID] = dict() - lookup[circID]["+"] = dict() - lookup[circID]["+"]['sum'] = 0 - lookup[circID]["-"] = dict() - lookup[circID]["-"]['sum'] = 0 - for c in newcountlist: - if int(row[c]) != -1: - lookup[circID][row['strand']]['sum'] += int(row[c]) - lookup[circID][row['strand']]['flanking_sites'] = row['flanking_sites'] - # print("-1") - # find winner - for k,v in lookup.items(): - if v["+"]['sum'] >= v["-"]['sum']: - lookup[k]['strand'] = "+" - else: - lookup[k]['strand'] = "-" - # print("0") - # ambiguous_strand_df = ambiguous_strand_df.reset_index() - for index,row in ambiguous_strand_df.iterrows(): - circID = row['chrom'] + "##" + str(row['start']) + "##" + str(row['end']) - if not circID in lookup: continue - ambiguous_strand_df.at[index,'strand']=lookup[circID]['strand'] - ambiguous_strand_df.at[index,'flanking_sites']=lookup[circID][lookup[circID]['strand']]['flanking_sites'] - - non_ambiguous_strand_df = ambiguous_strand_df.loc[ambiguous_strand_df['strand']!="."] - # print("1") - bwa_counts = dict() - circID2index = dict() - circIDlist=list() - for index2,row2 in non_ambiguous_strand_df.iterrows(): - circID2 = row2['chrom'] + "##" + str(row2['start']) + "##" + str(row2['end']) + "##" + row2['sample_name'] - circIDlist.append(circID2) - bwa_counts[circID2] = row2['circExplorer_bwa_read_count'] - circID2index[circID2] = index2 - - non_ambiguous_strand_df.insert(0,'circID',circIDlist) - - # print("2") - claimed=dict() - for index,row in non_circExplorer_BWA_df.iterrows(): - circID = row['chrom'] + "##" + str(row['start']) + "##" + str(row['end']) + "##" + row['sample_name'] - if circID in bwa_counts: - claimed[circID]=1 - non_circExplorer_BWA_df.at[index,'circExplorer_bwa_read_count']=bwa_counts[circID] - # print("3") - set1=set(bwa_counts.keys()) - set2=set(claimed.keys()) - unclaimed=list(set1-set2) - # print("4") - # unclaimed_index=list() - # for c in unclaimed: - # unclaimed_index.append(circID2index[c]) - if len(unclaimed) > 0: - # unclaimed_non_ambiguous_strand_df = non_ambiguous_strand_df.iloc[unclaimed_index] - unclaimed_non_ambiguous_strand_df = non_ambiguous_strand_df.loc[non_ambiguous_strand_df['circID'].isin(unclaimed)] - unclaimed_non_ambiguous_strand_df_wo_circID = unclaimed_non_ambiguous_strand_df.drop(['circID'],axis=1) - outdf=pd.concat([non_circExplorer_BWA_df,unclaimed_non_ambiguous_strand_df_wo_circID]) - else: - outdf=non_circExplorer_BWA_df - # print("5") - - # countlist = set(['circExplorer_read_count','ciri_read_count','findcirc_read_count','dcc_read_count','circrnafinder_read_count','mapsplice_read_count','nclscan_read_count']) - # newcountlist - hqcountlist = set(['circExplorer_read_count','circExplorer_bwa_read_count']) - nonhqcountlist = newcountlist-hqcountlist - - for index,row in outdf.iterrows(): - hqcount=0 - nonhqcount=0 - for c in hqcountlist: - if row[c] >= args.minreads: hqcount+=1 - for c in nonhqcountlist: - if row[c] >= args.minreads: nonhqcount+=1 - outdf.at[index,'hqcounts']=hqcount - outdf.at[index,'nonhqcounts']=nonhqcount - outdf.at[index,'ntools']=hqcount+nonhqcount - if hqcount==len(hqcountlist) and nonhqcount >= 1: - outdf.at[index,'HQ']="Y" - else: - outdf.at[index,'HQ']="N" intcols=['start','end','ntools'] for c in outdf.columns: @@ -155,7 +59,7 @@ def main() : strcols=list(set(outdf.columns)-set(intcols)) outdf = _df_setcol_as_int(outdf,intcols) outdf = _df_setcol_as_str(outdf,strcols) - outdf = outdf.sort_values(by=['chrom','start','end','strand']) + outdf = outdf.sort_values(by=['chrom','start','end','sample_name']) outdf.to_csv(args.outfile,sep="\t",header=True,index=False,compression='gzip') if __name__ == "__main__": diff --git a/workflow/scripts/_merge_circExplorer_found_counts.py b/workflow/scripts/_merge_circExplorer_found_counts.py index 3ed2da8..245c260 100644 --- a/workflow/scripts/_merge_circExplorer_found_counts.py +++ b/workflow/scripts/_merge_circExplorer_found_counts.py @@ -2,6 +2,16 @@ import sys import pandas +def _df_setcol_as_int(df,collist): + for c in collist: + df[[c]]=df[[c]].astype(int) + return df + +def _df_setcol_as_str(df,collist): + for c in collist: + df[[c]]=df[[c]].astype(str) + return df + def main(): # debug = True debug = False @@ -20,7 +30,11 @@ def main(): print(bcounts.head()) print(lcounts.head()) mcounts = bcounts.merge(lcounts,how='outer',on=["#chrom","start","end","strand"]) - + strcols = [ '#chrom', 'strand' ] + intcols = list ( set(mcounts.columns) - set(strcols) ) + mcounts.fillna(value=0,inplace=True) + mcounts = _df_setcol_as_str(mcounts,strcols) + mcounts = _df_setcol_as_int(mcounts,intcols) mcounts.to_csv(args.mergedcounts,index=False,doublequote=False,sep="\t") if __name__ == "__main__": diff --git a/workflow/scripts/_merge_per_sample_counts_table.py b/workflow/scripts/_merge_per_sample_counts_table.py index 7635f2b..5d44d38 100644 --- a/workflow/scripts/_merge_per_sample_counts_table.py +++ b/workflow/scripts/_merge_per_sample_counts_table.py @@ -19,39 +19,33 @@ def _df_setcol_as_float(df,collist): df[[c]]=df[[c]].astype(float) return df +def _rev_comp(seq): + seq = seq.upper() + seq = seq.replace("A", "t").replace("C", "g").replace("T", "a").replace("G", "c") + seq = seq.upper()[::-1] + return seq + class BSJ: - def __init__(self,chrom,start,end,strand): + def __init__(self,chrom,start,end,strand="+"): self.chrom=chrom self.start=int(start) self.end=int(end) self.strand=strand self.splice_site_flank_5="" #donor self.splice_site_flank_3="" #acceptor - - def add_flanks(self,sequences): - if self.strand == '+': - coord = int(self.end) - seq = sequences[self.chrom][coord:coord+2] - self.splice_site_flank_5 = seq.upper() - coord = int(self.start) - seq = sequences[self.chrom][coord-2:coord] - self.splice_site_flank_3 = seq.upper() - elif self.strand == '-': - coord = int(self.end) - seq = sequences[self.chrom][coord:coord+2] - seq = seq.upper() - seq = seq.replace("A", "t").replace("C", "g").replace("T", "a").replace("G", "c") - seq = seq.upper()[::-1] - self.splice_site_flank_3 = seq - coord = int(self.start) - seq = sequences[self.chrom][coord-2:coord] - seq = seq.upper() - seq = seq.replace("A", "t").replace("C", "g").replace("T", "a").replace("G", "c") - seq = seq.upper()[::-1] - self.splice_site_flank_5 = seq - def get_flanks(self): - return self.splice_site_flank_5+"##"+self.splice_site_flank_3 + def add_flanks(self,sequences): # adds flanking assuming + strand + coord = int(self.end) + seq = sequences[self.chrom][coord:coord+2] + self.splice_site_flank_5 = seq.upper() + coord = int(self.start) + seq = sequences[self.chrom][coord-2:coord] + self.splice_site_flank_3 = seq.upper() + + def get_flanks(self): # returns + and - strand flanks + plus_strand = self.splice_site_flank_5+"##"+self.splice_site_flank_3 + minus_strand = _rev_comp(self.splice_site_flank_5)+"##"+_rev_comp(self.splice_site_flank_3) + return plus_strand,minus_strand def main() : parser = argparse.ArgumentParser(description='Merge per sample Counts from different circRNA detection tools') @@ -94,6 +88,7 @@ def main() : # load circExplorer circE=pandas.read_csv(args.circE,sep="\t",header=0) + print(circE.columns) # columns are: # | # | Column | # | -- | ---------------------------------------- | @@ -104,35 +99,37 @@ def main() : # | 5 | known_novel | # | 6 | expected_BSJ_reads | # | 7 | found_BSJ_reads | - # | 8 | linear_same_strand | - # | 9 | spliced_same_strand | - # | 10 | linear_opposite_strand | - # | 11 | spliced_opposite_strand | - # | 12 | linear_unknown_strand | - # | 13 | spliced_unknown_strand | - circE['circRNA_id']=circE['#chrom'].astype(str)+"##"+circE['start'].astype(str)+"##"+circE['end'].astype(str)+"##"+circE['strand'].astype(str) - circE.rename({'known_novel' : 'circExplorer_annotation', - 'expected_BSJ_reads' : 'circExplorer_read_count', - 'found_BSJ_reads' : 'circExplorer_found_BSJcounts', - 'linear_same_strand' : 'circExplorer_found_linear_BSJ_same_strand_counts', - 'spliced_same_strand' : 'circExplorer_found_linear_spliced_BSJ_same_strand_counts', - 'linear_opposite_strand' : 'circExplorer_found_linear_BSJ_opposite_strand_counts', - 'spliced_opposite_strand' : 'circExplorer_found_linear_spliced_BSJ_opposite_strand_counts', - 'linear_unknown_strand' : 'circExplorer_found_linear_BSJ_unknown_strand_counts', - 'spliced_unknown_strand' : 'circExplorer_found_linear_spliced_BSJ_unknown_strand_counts'}, axis=1, inplace=True) - circE.drop(['#chrom','start', 'end','strand'], axis = 1,inplace=True) + # | 8 | linear_+ | + # | 9 | spliced_+ | + # | 10 | linear_- | + # | 11 | spliced_- | + # | 12 | linear_. | + # | 13 | spliced_. | + circE['circRNA_id']=circE['#chrom'].astype(str)+"##"+circE['start'].astype(str)+"##"+circE['end'].astype(str) + circE.rename({'strand' : 'circExplorer_strand', + 'known_novel' : 'circExplorer_annotation', + 'expected_BSJ_reads' : 'circExplorer_read_count', + 'found_BSJ_reads' : 'circExplorer_found_BSJcounts', + 'linear_+' : 'circExplorer_found_linear_BSJ_+_counts', + 'spliced_+' : 'circExplorer_found_linear_spliced_BSJ_+_counts', + 'linear_-' : 'circExplorer_found_linear_BSJ_-_counts', + 'spliced_-' : 'circExplorer_found_linear_spliced_BSJ_-_counts', + 'linear_.' : 'circExplorer_found_linear_BSJ_._counts', + 'spliced_.' : 'circExplorer_found_linear_spliced_BSJ_._counts'}, axis=1, inplace=True) + circE.drop(['#chrom','start', 'end'], axis = 1,inplace=True) circE.set_index(['circRNA_id'],inplace=True,drop=True) circE.fillna(value=-1,inplace=True) + print(circE.columns) intcols = [ 'circExplorer_read_count', 'circExplorer_found_BSJcounts', - 'circExplorer_found_linear_BSJ_same_strand_counts', - 'circExplorer_found_linear_spliced_BSJ_same_strand_counts', - 'circExplorer_found_linear_BSJ_opposite_strand_counts', - 'circExplorer_found_linear_spliced_BSJ_opposite_strand_counts', - 'circExplorer_found_linear_BSJ_unknown_strand_counts', - 'circExplorer_found_linear_spliced_BSJ_unknown_strand_counts' ] + 'circExplorer_found_linear_BSJ_+_counts', + 'circExplorer_found_linear_spliced_BSJ_+_counts', + 'circExplorer_found_linear_BSJ_-_counts', + 'circExplorer_found_linear_spliced_BSJ_-_counts', + 'circExplorer_found_linear_BSJ_._counts', + 'circExplorer_found_linear_spliced_BSJ_._counts' ] strcols = list ( set(circE.columns) - set(intcols) ) circE = _df_setcol_as_int(circE,intcols) circE = _df_setcol_as_str(circE,strcols) @@ -147,10 +144,11 @@ def main() : # circExplorer2 with BWA circEbwa=pandas.read_csv(args.circEbwa,sep="\t",header=0) - circEbwa['circRNA_id']=circEbwa['#chrom'].astype(str)+"##"+circEbwa['start'].astype(str)+"##"+circEbwa['end'].astype(str)+"##"+circEbwa['strand'].astype(str) - circEbwa.rename({'known_novel' : 'circExplorer_bwa_annotation', - 'read_count' : 'circExplorer_bwa_read_count'}, axis=1, inplace=True) - circEbwa.drop(['#chrom','start', 'end','strand'], axis = 1,inplace=True) + circEbwa['circRNA_id']=circEbwa['#chrom'].astype(str)+"##"+circEbwa['start'].astype(str)+"##"+circEbwa['end'].astype(str) + circEbwa.rename({'strand' : 'circExplorer_bwa_strand', + 'known_novel' : 'circExplorer_bwa_annotation', + 'read_count' : 'circExplorer_bwa_read_count'}, axis=1, inplace=True) + circEbwa.drop(['#chrom','start', 'end'], axis = 1,inplace=True) circEbwa.set_index(['circRNA_id'],inplace=True,drop=True) circEbwa.fillna(value=-1,inplace=True) @@ -185,11 +183,12 @@ def main() : # | 11 | strand | strand info of a predicted circRNAs (new in CIRI2) | # | 12 | junction_reads_ID | all of the circular junction read IDs (split by ",") ciri["circRNA_start"]=ciri["circRNA_start"].astype(int)-1 - ciri['circRNA_id']=ciri['chr'].astype(str)+"##"+ciri['circRNA_start'].astype(str)+"##"+ciri['circRNA_end'].astype(str)+"##"+ciri['strand'].astype(str) - ciri.rename({ '#junction_reads': 'ciri_read_count', - '#non_junction_reads' : 'ciri_linear_read_count', - 'circRNA_type': 'ciri_annotation'}, axis=1, inplace=True) - ciri.drop(['chr','circRNA_start', 'circRNA_end','strand'], axis = 1,inplace=True) + ciri['circRNA_id']=ciri['chr'].astype(str)+"##"+ciri['circRNA_start'].astype(str)+"##"+ciri['circRNA_end'].astype(str) + ciri.rename({ 'strand' : 'ciri_strand', + '#junction_reads' : 'ciri_read_count', + '#non_junction_reads' : 'ciri_linear_read_count', + 'circRNA_type' : 'ciri_annotation'}, axis=1, inplace=True) + ciri.drop(['chr','circRNA_start', 'circRNA_end'], axis = 1,inplace=True) ciri.set_index(['circRNA_id'],inplace=True,drop=True) ciri.fillna(value=-1,inplace=True) @@ -208,6 +207,7 @@ def main() : if args.findcirc: findcirc=pandas.read_csv(args.findcirc,sep="\t",header=0) + print(findcirc.columns) # add find_circ # | # | short_name | description # | -- | --------------- | ---------------------------------------------------------------------------------------------------------------- | @@ -229,9 +229,10 @@ def main() : # | 16 | signal | flanking dinucleotide splice signal (normally GT/AG) | # | 17 | strandmatch | 'MATCH', 'MISMATCH' or 'NA' for non-stranded analysis | # | 18 | category | list of keywords describing the junction. Useful for quick grep filtering | - findcirc['circRNA_id']=findcirc['chrom'].astype(str)+"##"+findcirc['start'].astype(str)+"##"+findcirc['end'].astype(str)+"##"+findcirc['strand'].astype(str) - findcirc = findcirc.loc[:, ['circRNA_id', 'n_reads']] - findcirc.rename({'n_reads': 'findcirc_read_count'}, axis=1, inplace=True) + findcirc['circRNA_id']=findcirc['chrom'].astype(str)+"##"+findcirc['start'].astype(str)+"##"+findcirc['end'].astype(str) + findcirc = findcirc.loc[:, ['circRNA_id', 'n_reads', 'strand']] + findcirc.rename({ 'strand' : 'findcirc_strand', + 'n_reads' : 'findcirc_read_count'}, axis=1, inplace=True) findcirc.set_index(['circRNA_id'],inplace=True,drop=True) findcirc.fillna(value=-1,inplace=True) @@ -240,7 +241,6 @@ def main() : strcols = list ( set(findcirc.columns) - set(intcols) ) findcirc = _df_setcol_as_int(findcirc,intcols) if len(strcols) > 0: findcirc = _df_setcol_as_str(findcirc,strcols) - dfs.append(findcirc) if "findcirc".lower() in hqcc: required_hqcols.append("findcirc_read_count") @@ -261,11 +261,12 @@ def main() : # | 6 | linear_read_count| # | 7 | dcc_annotation | --> this is gene##JunctionType##Start-End Region from CircCoordinates file dcc["start"]=dcc["start"].astype(int)-1 - dcc['circRNA_id']=dcc['chr'].astype(str)+"##"+dcc['start'].astype(str)+"##"+dcc['end'].astype(str)+"##"+dcc['strand'].astype(str) + dcc['circRNA_id']=dcc['chr'].astype(str)+"##"+dcc['start'].astype(str)+"##"+dcc['end'].astype(str) + dcc.rename({'strand': 'dcc_strand'}, axis=1, inplace=True) dcc.rename({'read_count': 'dcc_read_count'}, axis=1, inplace=True) dcc.rename({'linear_read_count': 'dcc_linear_read_count'}, axis=1, inplace=True) dcc[['dcc_gene', 'dcc_junction_type', 'dcc_annotation2']] = dcc['dcc_annotation'].apply(lambda x: pandas.Series(str(x).split("##"))) - dcc.drop(['chr','start', 'end','strand','dcc_annotation'], axis = 1,inplace=True) + dcc.drop(['chr','start', 'end','dcc_annotation'], axis = 1,inplace=True) dcc.rename({'dcc_annotation2': 'dcc_annotation'}, axis=1, inplace=True) dcc.set_index(['circRNA_id'],inplace=True,drop=True) @@ -297,10 +298,11 @@ def main() : # | 6 | mapsplice_annotation | normal##2.811419 | <--"fusion_type"##"entropy" # "fusion_type" is either "normal" or "overlapping" ... higher "entropy" values are better! mapsplice["start"]=mapsplice["start"].astype(int)-1 - mapsplice['circRNA_id']=mapsplice['chrom'].astype(str)+"##"+mapsplice['start'].astype(str)+"##"+mapsplice['end'].astype(str)+"##"+mapsplice['strand'].astype(str) + mapsplice['circRNA_id']=mapsplice['chrom'].astype(str)+"##"+mapsplice['start'].astype(str)+"##"+mapsplice['end'].astype(str) + mapsplice.rename({'strand': 'mapsplice_strand'}, axis=1, inplace=True) mapsplice.rename({'read_count': 'mapsplice_read_count'}, axis=1, inplace=True) mapsplice[['mapsplice_annotation2', 'mapsplice_entropy']] = mapsplice['mapsplice_annotation'].apply(lambda x: pandas.Series(str(x).split("##"))) - mapsplice.drop(['chrom','start', 'end','strand','mapsplice_annotation'], axis = 1,inplace=True) + mapsplice.drop(['chrom','start', 'end','mapsplice_annotation'], axis = 1,inplace=True) mapsplice.rename({'mapsplice_annotation2': 'mapsplice_annotation'}, axis=1, inplace=True) mapsplice.set_index(['circRNA_id'],inplace=True,drop=True) @@ -335,9 +337,10 @@ def main() : if nclscan.shape[0]==0: includenclscan=False if includenclscan: nclscan["start"]=nclscan["start"].astype(int)-1 - nclscan['circRNA_id']=nclscan['chrom'].astype(str)+"##"+nclscan['start'].astype(str)+"##"+nclscan['end'].astype(str)+"##"+nclscan['strand'].astype(str) + nclscan['circRNA_id']=nclscan['chrom'].astype(str)+"##"+nclscan['start'].astype(str)+"##"+nclscan['end'].astype(str) + nclscan.rename({'strand': 'nclscan_strand'}, axis=1, inplace=True) nclscan.rename({'read_count': 'nclscan_read_count'}, axis=1, inplace=True) - nclscan.drop(['chrom','start', 'end','strand'], axis = 1,inplace=True) + nclscan.drop(['chrom','start', 'end'], axis = 1,inplace=True) nclscan = _df_setcol_as_str(nclscan,['nclscan_annotation']) nclscan.loc[nclscan['nclscan_annotation']=="1", 'nclscan_annotation'] = "Intragenic" nclscan.loc[nclscan['nclscan_annotation']=="0", 'nclscan_annotation'] = "Intergenic" @@ -367,9 +370,10 @@ def main() : # | 3 | end | 1223968 | # | 4 | strand | - | # | 5 | read_count | 26 | - circrnafinder['circRNA_id']=circrnafinder['chr'].astype(str)+"##"+circrnafinder['start'].astype(str)+"##"+circrnafinder['end'].astype(str)+"##"+circrnafinder['strand'].astype(str) + circrnafinder['circRNA_id']=circrnafinder['chr'].astype(str)+"##"+circrnafinder['start'].astype(str)+"##"+circrnafinder['end'].astype(str) + circrnafinder.rename({'strand': 'circrnafinder_strand'}, axis=1, inplace=True) circrnafinder.rename({'read_count': 'circrnafinder_read_count'}, axis=1, inplace=True) - circrnafinder.drop(['chr','start', 'end','strand'], axis = 1,inplace=True) + circrnafinder.drop(['chr','start', 'end'], axis = 1,inplace=True) circrnafinder.set_index(['circRNA_id'],inplace=True,drop=True) circrnafinder.fillna(value=-1,inplace=True) @@ -418,6 +422,9 @@ def main() : df['circRNA_id']=df.index df.reset_index(inplace=True,drop=True) merged_counts=pandas.merge(merged_counts,df,how='outer',on=['circRNA_id']) + + print(merged_counts.columns) + # merged_counts.set_index(['circRNA_id'],inplace=True,drop=True) merged_counts.fillna(-1,inplace=True) @@ -428,14 +435,16 @@ def main() : annotation_cols=['circExplorer_annotation','ciri_annotation'] floatcols = [] + strand_cols = ['circExplorer_strand','circExplorer_bwa_strand','ciri_strand'] + intcols = [ 'circExplorer_read_count', 'circExplorer_found_BSJcounts', - 'circExplorer_found_linear_BSJ_same_strand_counts', - 'circExplorer_found_linear_spliced_BSJ_same_strand_counts', - 'circExplorer_found_linear_BSJ_opposite_strand_counts', - 'circExplorer_found_linear_spliced_BSJ_opposite_strand_counts', - 'circExplorer_found_linear_BSJ_unknown_strand_counts', - 'circExplorer_found_linear_spliced_BSJ_unknown_strand_counts' ] + 'circExplorer_found_linear_BSJ_+_counts', + 'circExplorer_found_linear_spliced_BSJ_+_counts', + 'circExplorer_found_linear_BSJ_-_counts', + 'circExplorer_found_linear_spliced_BSJ_-_counts', + 'circExplorer_found_linear_BSJ_._counts', + 'circExplorer_found_linear_spliced_BSJ_._counts' ] intcols.extend([ 'ciri_read_count', 'ciri_linear_read_count' ]) @@ -445,23 +454,28 @@ def main() : if args.findcirc: intcols.extend(['findcirc_read_count']) + strand_cols.append('findcirc_strand') if args.dcc: intcols.extend([ 'dcc_read_count', 'dcc_linear_read_count' ]) annotation_cols.extend(['dcc_gene','dcc_junction_type','dcc_annotation']) + strand_cols.append('dcc_strand') if args.mapsplice: intcols.extend([ 'mapsplice_read_count' ]) floatcols.extend([ 'mapsplice_entropy' ]) annotation_cols.extend(['mapsplice_annotation']) + strand_cols.append('mapsplice_strand') if args.nclscan and includenclscan: intcols.extend([ 'nclscan_read_count' ]) annotation_cols.extend(['nclscan_annotation']) + strand_cols.append('nclscan_strand') if args.circrnafinder: intcols.extend(['circrnafinder_read_count']) + strand_cols.append('circrnafinder_strand') intcols.extend(['ntools']) intcols.extend(['hqcounts','nonhqcounts']) @@ -491,35 +505,41 @@ def main() : if args.mapsplice: merged_counts.loc[merged_counts['mapsplice_read_count'] >= args.minreads, 'ntools'] += 1 if args.nclscan and includenclscan: merged_counts.loc[merged_counts['nclscan_read_count'] >= args.minreads, 'ntools'] += 1 if args.circrnafinder: merged_counts.loc[merged_counts['circrnafinder_read_count'] >= args.minreads, 'ntools'] += 1 - merged_counts[['chrom', 'start', 'end', 'strand']] = merged_counts['circRNA_id'].str.split('##', expand=True) + merged_counts[['chrom', 'start', 'end']] = merged_counts['circRNA_id'].str.split('##', expand=True) merged_counts=_df_setcol_as_int(merged_counts,['start','end','ntools']) - merged_counts=_df_setcol_as_str(merged_counts,['chrom','strand']) + merged_counts=_df_setcol_as_str(merged_counts,['chrom']) # adding flanking sites - merged_counts['flanking_sites']="-1" + merged_counts['flanking_sites_+']="-1" + merged_counts['flanking_sites_-']="-1" sequences = dict((s[1], s[0]) for s in HTSeq.FastaReader(args.reffa, raw_iterator=True)) for index, row in merged_counts.iterrows(): - bsj = BSJ(chrom=row['chrom'],start=row['start'],end=row['end'],strand=row['strand']) + bsj = BSJ(chrom=row['chrom'],start=row['start'],end=row['end']) bsj.add_flanks(sequences) - merged_counts.loc[index, 'flanking_sites'] = bsj.get_flanks() + plus_flank, minus_flank = bsj.get_flanks() + merged_counts.loc[index, 'flanking_sites_+'] = plus_flank + merged_counts.loc[index, 'flanking_sites_-'] = minus_flank # add samplename merged_counts['sample_name'] = args.samplename - merged_counts=_df_setcol_as_str(merged_counts,['sample_name','flanking_sites']) + merged_counts=_df_setcol_as_str(merged_counts,['sample_name','flanking_sites_+','flanking_sites_-']) + print(merged_counts.columns) # prepare output ... reorder columns - outcols=['chrom', 'start', 'end', 'strand', 'flanking_sites', 'sample_name', 'ntools', 'HQ'] + outcols=['chrom', 'start', 'end'] + outcols.extend(strand_cols) + outcols.extend(['flanking_sites_+','flanking_sites_-', 'sample_name', 'ntools', 'HQ']) # add circExplorer columns outcols.extend(['circExplorer_read_count', 'circExplorer_found_BSJcounts', - 'circExplorer_found_linear_BSJ_same_strand_counts', - 'circExplorer_found_linear_spliced_BSJ_same_strand_counts', - 'circExplorer_found_linear_BSJ_opposite_strand_counts', - 'circExplorer_found_linear_spliced_BSJ_opposite_strand_counts', - 'circExplorer_found_linear_BSJ_unknown_strand_counts', - 'circExplorer_found_linear_spliced_BSJ_unknown_strand_counts']) + 'circExplorer_found_linear_BSJ_+_counts', + 'circExplorer_found_linear_spliced_BSJ_+_counts', + 'circExplorer_found_linear_BSJ_-_counts', + 'circExplorer_found_linear_spliced_BSJ_-_counts', + 'circExplorer_found_linear_BSJ_._counts', + 'circExplorer_found_linear_spliced_BSJ_._counts']) # add ciri columns outcols.extend(['ciri_read_count', 'ciri_linear_read_count']) diff --git a/workflow/scripts/create_circExplorer_per_sample_counts_table.py b/workflow/scripts/create_circExplorer_per_sample_counts_table.py index 458d295..a60e321 100644 --- a/workflow/scripts/create_circExplorer_per_sample_counts_table.py +++ b/workflow/scripts/create_circExplorer_per_sample_counts_table.py @@ -15,6 +15,17 @@ # 10 linear_BSJ_reads_opposite_strand # 11 linear_spliced_BSJ_reads_opposite_strand + +def _df_setcol_as_int(df,collist): + for c in collist: + df[[c]]=df[[c]].astype(int) + return df + +def _df_setcol_as_str(df,collist): + for c in collist: + df[[c]]=df[[c]].astype(str) + return df + def main(): # debug = True debug = False @@ -33,7 +44,11 @@ def main(): # print(bcounts.head()) # print(lcounts.head()) mcounts = bcounts.merge(lcounts,how='outer',on=["#chrom","start","end","strand"]) - + mcounts.fillna(value=0,inplace=True) + strcols = [ '#chrom', 'strand', 'known_novel' ] + intcols = list ( set(mcounts.columns) - set(strcols) ) + mcounts = _df_setcol_as_str(mcounts,strcols) + mcounts = _df_setcol_as_int(mcounts,intcols) mcounts.drop(["read_count"],axis=1,inplace=True) mcounts.to_csv(args.mergedcounts,index=False,doublequote=False,sep="\t")