From 778913c06f865a96da758643d53bf3efda2a5673 Mon Sep 17 00:00:00 2001
From: kopardev <vishal.koparde@nih.gov>
Date: Mon, 14 Aug 2023 17:30:51 -0400
Subject: [PATCH] feature: circRNA id is independent of strand, counts for both
 + and - strand are reported together, strand from all callers are reported,
 both + and - flanking sites are reported, rev-comp function updated

---
 .../scripts/_bam_filter_BSJ_for_HQonly.py     |   5 +-
 .../_create_circExplorer_linear_bam.sh        |  91 ++++----
 ...filter_linear_spliced_readids_w_rid2jid.py |  38 ++--
 workflow/scripts/_make_master_counts_table.py | 104 +---------
 .../_merge_circExplorer_found_counts.py       |  16 +-
 .../scripts/_merge_per_sample_counts_table.py | 196 ++++++++++--------
 ...te_circExplorer_per_sample_counts_table.py |  17 +-
 7 files changed, 200 insertions(+), 267 deletions(-)

diff --git a/workflow/scripts/_bam_filter_BSJ_for_HQonly.py b/workflow/scripts/_bam_filter_BSJ_for_HQonly.py
index 2cb6f19..8315b0b 100644
--- a/workflow/scripts/_bam_filter_BSJ_for_HQonly.py
+++ b/workflow/scripts/_bam_filter_BSJ_for_HQonly.py
@@ -74,7 +74,7 @@ def main():
 
     RGlist = dict()
     for index,row in indf.iterrows():
-        jid = row['chrom']+"##"+str(row['start'])+"##"+str(row['end'])+"##"+row['strand']
+        jid = row['chrom']+"##"+str(row['start'])+"##"+str(row['end'])
         RGlist[jid]=1
     print("Number of RGs: ",len(RGlist))
 
@@ -114,6 +114,9 @@ def main():
 
     for read in samfile.fetch():
         rg = read.get_tag("RG")
+        rg = rg.split("##")
+        rg = rg[:len(rg)-1]
+        rg = "##".join(rg)
         if rg in RGlist:
             regionname=_get_regionname_from_seqname(regions,read.reference_name)
             if regionname in hosts:
diff --git a/workflow/scripts/_create_circExplorer_linear_bam.sh b/workflow/scripts/_create_circExplorer_linear_bam.sh
index 00ec996..1435396 100644
--- a/workflow/scripts/_create_circExplorer_linear_bam.sh
+++ b/workflow/scripts/_create_circExplorer_linear_bam.sh
@@ -1,15 +1,18 @@
 #!/usr/bin/env bash
-# module load parallel
-# module load python/3.7
-# module load bedtools
-# module load ucsc
-# module load samtools
+ module load parallel
+ module load python/3.7
+ module load bedtools
+ module load ucsc
+ module load samtools
 
 
 set -e -x -o pipefail
 
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-# SCRIPT_DIR="/vf/users/Ziegelbauer_lab/Pipelines/circRNA/activeDev/workflow/scripts"
+# only for debugging
+SCRIPT_DIR2="/vf/users/Ziegelbauer_lab/Pipelines/circRNA/activeDev/workflow/scripts"
+# add the following line when not debugging
+SCRIPT_DIR2="$SCRIPT_DIR"
 SCRIPT_NAME=$( basename -- "${BASH_SOURCE[0]}" )
 
 ARGPARSE_DESCRIPTION="create linear and splice BAMs (and BIGWIGs)"
@@ -60,29 +63,6 @@ start0=$(date +%s.%N)
     # bedSort from ucsc tools
     # ucsc
 
-#if [ "$#" != 18 ];then
-#    echo "$#"
-#    echo "18 arguments expected!"
-#    echo "#1 --> path to non-chimeric BAM file"
-#    echo "#2 --> sample name"
-#    echo "#3 --> PE or SE"
-#    echo "#4 --> known BSJs in bed.gz format"
-#    echo "#5 --> tmpdir"
-#    echo "#6 --> gzipped outputfilename eg.${sample_name}.rid2jid.tsv.gz"
-#    echo "#7 --> output filtered sample BAM"
-#	echo "#8 --> gzip-ed list of linear BSJ readids"
-#	echo "#9 --> gzip-ed list of linear spliced BSJ readids"
-#	echo "#10 --> jid counts (linear and linear-spliced) per jid or BSJ"
-#	echo "#11 --> linear BSJ readids in BAM"
-#	echo "#12 --> linear-spliced BSJ readids in BAM"
-#	echo "#13 --> .regions file eg. ref/ref.fa.regions"
-#	echo "#14 --> host list comma separated .. no spaces"
-#	echo "#15 --> additives list comma separated .. no spaces"
-#       echo "#16 --> viruses list comma separated .. no spaces"
-#	echo "#17 --> all linear readids in BAM"
-#	echo "#18 --> all spliced readids in BAM"
-#    exit
-#fi
 
 set -exo pipefail
 
@@ -139,27 +119,26 @@ start=$(date +%s.%N)
 
 if [ "$peorse" == "PE" ];then
 
-python3 ${SCRIPT_DIR}/filter_bam.py \
+python3 ${SCRIPT_DIR2}/filter_bam.py \
     --inbam $non_chimeric_star2p_bam \
     --outbam $filtered_bam \
     --pe
 
 else
 
-python3 ${SCRIPT_DIR}/filter_bam.py \
+python3 ${SCRIPT_DIR2}/filter_bam.py \
     --inbam $non_chimeric_star2p_bam \
     --outbam $filtered_bam \
 
 fi
 
-nreads=$(samtools view /gpfs/gsfs8/users/CBLCCBR/kopardevn_tmp/issue57_testing_2/results/GI1_T/circExplorer/GI1_T.bam|wc -l)
+nreads=$(samtools view $filtered_bam|wc -l)
 if [ "$nreads" == "0" ];then
     echo "SOMETHING WENT WRONG ...filtered BAM is empty!!"
     exit
 fi
 samtools index -@4 $filtered_bam
 
-
 ###################################################################################################
 # convert BAM to read ends BED format
 ###################################################################################################
@@ -169,7 +148,7 @@ start=$(date +%s.%N)
 
 bedtools bamtobed -split -i $filtered_bam > ${tmpdir}/${sample_name}.bed
 
-python ${SCRIPT_DIR}/_process_bamtobed.py \
+python ${SCRIPT_DIR2}/_process_bamtobed.py \
     --inbed ${tmpdir}/${sample_name}.bed \
     --outbed ${tmpdir}/${sample_name}.readends.bed \
     --linear ${tmpdir}/${sample_name}.linear.readids.gz \
@@ -188,7 +167,6 @@ zcat $bsjbedgz |awk -F"\t" -v OFS="\t" '{print $1, $2, $2, $1"##"$2"##"$3"##"$6,
 zcat $bsjbedgz |awk -F"\t" -v OFS="\t" '{print $1, $3, $3, $1"##"$2"##"$3"##"$6, $5, $6}' - >> ${tmpdir}/BSJ.ends.bed
 bedSort ${tmpdir}/BSJ.ends.bed ${tmpdir}/BSJ.ends.bed
 
-
 ###################################################################################################
 # finding max readlength
 ###################################################################################################
@@ -196,11 +174,12 @@ bedSort ${tmpdir}/BSJ.ends.bed ${tmpdir}/BSJ.ends.bed
 printtime $SCRIPT_NAME $start0 $start "finding max readlength"
 start=$(date +%s.%N)
 
-python3 ${SCRIPT_DIR}/bam_get_max_readlen.py -i $filtered_bam > ${tmpdir}/${sample_name}.maxrl
+python3 ${SCRIPT_DIR2}/bam_get_max_readlen.py -i $filtered_bam > ${tmpdir}/${sample_name}.maxrl
 maxrl=$(cat ${tmpdir}/${sample_name}.maxrl)
 two_maxrl=$((maxrl*2))
+echo $two_maxrl
 
-
+#two_maxrl="302"
 ###################################################################################################
 # find closest known BSJs with primary alignments and then keep alignments with some alignment within the "inclusion zone"
 # this is run in parallel
@@ -217,7 +196,7 @@ bedSort $f $f && \
 bedtools closest -nonamecheck \
     -a $f \
     -b ${tmpdir}/BSJ.ends.bed  -d | \
-    awk -F\"\\t\" -v OFS=\"\\t\" -v limit=$two_maxrl '{if (\$NF > 0 && \$NF <= limit) {print \$4,\$10}}' | \
+    awk -F\"\\t\" -v OFS=\"\\t\" -v limit=$two_maxrl '{if (\$NF > 0 && \$NF <= limit) {print \$4,\$10,\$NF}}' | \
     sort --buffer-size=2G --unique --parallel=2 > ${f}.rid2jid.closest.txt
 """
 done > ${tmpdir}/para.tmp
@@ -225,10 +204,12 @@ grep -v "^$" ${tmpdir}/para.tmp > ${tmpdir}/para
 parallel -j $threads < ${tmpdir}/para
 
 #cat ${tmpdir}/${sample_name}.readends.part.?????.rid2jid.closest.txt > ${rid2jidgzip%.*}.tmp
-for f in $(ls ${tmpdir}/${sample_name}.readends.part.?????.rid2jid.closest.txt);do
+for f in $(find $tmpdir -name "${sample_name}.readends.part.*.rid2jid.closest.txt");do
+#for f in $(ls ${tmpdir}/${sample_name}.readends.part.?????.rid2jid.closest.txt);do
        cat $f
 done > ${rid2jidgzip%.*}.tmp
-sort --buffer-size=20G --parallel=$threads --unique ${rid2jidgzip%.*}.tmp > ${rid2jidgzip%.*}
+sort -k1,1 -k3,3n --buffer-size=20G --parallel=$threads --unique ${rid2jidgzip%.*}.tmp | \
+	awk -F"\t" -v OFS="\t" '{a[$1]++;if (a[$1]==1) {print $1,$2}}' > ${rid2jidgzip%.*}
 
 pigz -p4 -f ${rid2jidgzip%.*} && rm -f ${rid2jidgzip%.*}.tmp
 
@@ -239,7 +220,7 @@ pigz -p4 -f ${rid2jidgzip%.*} && rm -f ${rid2jidgzip%.*}.tmp
 printtime $SCRIPT_NAME $start0 $start "filter linear and spliced readids to those near BSJs only; creating counts table"
 start=$(date +%s.%N)
 
-python3 ${SCRIPT_DIR}/_filter_linear_spliced_readids_w_rid2jid.py \
+python3 ${SCRIPT_DIR2}/_filter_linear_spliced_readids_w_rid2jid.py \
     --linearin ${tmpdir}/${sample_name}.linear.readids.gz \
     --splicedin ${tmpdir}/${sample_name}.spliced.readids.gz \
     --rid2jid ${rid2jidgzip} \
@@ -247,7 +228,6 @@ python3 ${SCRIPT_DIR}/_filter_linear_spliced_readids_w_rid2jid.py \
     --splicedout ${splicedrids} \
     --jidcounts ${jidcounts}
 
-
 # rm -rf ${tmpdir}/${sample_name}.readends.part.*
 # the above statement leads to /usr/bin/rm: Argument list too long
 # hence,
@@ -267,10 +247,10 @@ splicedbam_bn=$(basename $splicedbam)
 linearbam_all_bn=$(basename $linearbam_all)
 splicedbam_all_bn=$(basename $splicedbam_all)
 
-echo "python3 ${SCRIPT_DIR}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${linearbam_bn} --readids $linearrids" >> ${tmpdir}/para2
-echo "python3 ${SCRIPT_DIR}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${splicedbam_bn} --readids $splicedrids" >> ${tmpdir}/para2
-echo "python3 ${SCRIPT_DIR}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${linearbam_all_bn} --readids ${tmpdir}/${sample_name}.linear.readids.gz" >> ${tmpdir}/para2
-echo "python3 ${SCRIPT_DIR}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${splicedbam_all_bn} --readids ${tmpdir}/${sample_name}.spliced.readids.gz" >> ${tmpdir}/para2
+echo "python3 ${SCRIPT_DIR2}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${linearbam_bn} --readids $linearrids" >> ${tmpdir}/para2
+echo "python3 ${SCRIPT_DIR2}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${splicedbam_bn} --readids $splicedrids" >> ${tmpdir}/para2
+echo "python3 ${SCRIPT_DIR2}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${linearbam_all_bn} --readids ${tmpdir}/${sample_name}.linear.readids.gz" >> ${tmpdir}/para2
+echo "python3 ${SCRIPT_DIR2}/filter_bam_by_readids.py --inputBAM $filtered_bam --outputBAM ${tmpdir}/${splicedbam_all_bn} --readids ${tmpdir}/${sample_name}.spliced.readids.gz" >> ${tmpdir}/para2
 
 parallel -j 4 < ${tmpdir}/para2
 
@@ -296,10 +276,10 @@ samtools sort -l 9 -T ${tmpdir}/sorttmp --write-index -@${threads} -O BAM -o ${s
 
 if [ -f ${tmpdir}/para3 ];then rm -f ${tmpdir}/para3;fi
 
-echo "python3 ${SCRIPT_DIR}/bam_split_by_regions.py --inbam $linearbam --sample_name $sample_name --regions $regions --prefix linear_BSJ --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3
-echo "python3 ${SCRIPT_DIR}/bam_split_by_regions.py --inbam $splicedbam --sample_name $sample_name --regions $regions --prefix spliced_BSJ --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3
-echo "python3 ${SCRIPT_DIR}/bam_split_by_regions.py --inbam $linearbam_all --sample_name $sample_name --regions $regions --prefix linear --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3
-echo "python3 ${SCRIPT_DIR}/bam_split_by_regions.py --inbam $splicedbam_all --sample_name $sample_name --regions $regions --prefix spliced --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3
+echo "python3 ${SCRIPT_DIR2}/bam_split_by_regions.py --inbam $linearbam --sample_name $sample_name --regions $regions --prefix linear_BSJ --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3
+echo "python3 ${SCRIPT_DIR2}/bam_split_by_regions.py --inbam $splicedbam --sample_name $sample_name --regions $regions --prefix spliced_BSJ --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3
+echo "python3 ${SCRIPT_DIR2}/bam_split_by_regions.py --inbam $linearbam_all --sample_name $sample_name --regions $regions --prefix linear --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3
+echo "python3 ${SCRIPT_DIR2}/bam_split_by_regions.py --inbam $splicedbam_all --sample_name $sample_name --regions $regions --prefix spliced --outdir $outdir --host \"$host\" --additives \"$additives\" --viruses \"$viruses\"" >> ${tmpdir}/para3
 
 parallel -j 4 < ${tmpdir}/para3
 
@@ -317,16 +297,16 @@ for folder in $(ls ${tmpdir}/b2b*);do rm -rf $folder;done
 
 if [ -f ${tmpdir}/para3 ];then rm -f ${tmpdir}/para4;fi
 
-echo "mkdir -p ${tmpdir}/b2b_1 && bash ${SCRIPT_DIR}/bam_to_bigwig.sh $linearbam ${tmpdir}/b2b_1" >> ${tmpdir}/para4
-echo "mkdir -p ${tmpdir}/b2b_2 && bash ${SCRIPT_DIR}/bam_to_bigwig.sh $splicedbam ${tmpdir}/b2b_2" >> ${tmpdir}/para4
-echo "mkdir -p ${tmpdir}/b2b_3 && bash ${SCRIPT_DIR}/bam_to_bigwig.sh $linearbam_all ${tmpdir}/b2b_3" >> ${tmpdir}/para4
-echo "mkdir -p ${tmpdir}/b2b_4 && bash ${SCRIPT_DIR}/bam_to_bigwig.sh $splicedbam_all ${tmpdir}/b2b_4" >> ${tmpdir}/para4
+echo "mkdir -p ${tmpdir}/b2b_1 && bash ${SCRIPT_DIR2}/bam_to_bigwig.sh $linearbam ${tmpdir}/b2b_1" >> ${tmpdir}/para4
+echo "mkdir -p ${tmpdir}/b2b_2 && bash ${SCRIPT_DIR2}/bam_to_bigwig.sh $splicedbam ${tmpdir}/b2b_2" >> ${tmpdir}/para4
+echo "mkdir -p ${tmpdir}/b2b_3 && bash ${SCRIPT_DIR2}/bam_to_bigwig.sh $linearbam_all ${tmpdir}/b2b_3" >> ${tmpdir}/para4
+echo "mkdir -p ${tmpdir}/b2b_4 && bash ${SCRIPT_DIR2}/bam_to_bigwig.sh $splicedbam_all ${tmpdir}/b2b_4" >> ${tmpdir}/para4
 count=4
 for prefix in "linear_BSJ" "spliced_BSJ" "linear" "spliced";do
 for b in $(echo "$host $viruses"|tr ',' ' ');do
     count=$((count+1))
     bam="${outdir}/${sample_name}.${prefix}.${b}.bam"
-    echo "mkdir -p ${tmpdir}/b2b_${count} && bash ${SCRIPT_DIR}/bam_to_bigwig.sh $bam ${tmpdir}/b2b_${count}" >> ${tmpdir}/para4
+    echo "mkdir -p ${tmpdir}/b2b_${count} && bash ${SCRIPT_DIR2}/bam_to_bigwig.sh $bam ${tmpdir}/b2b_${count}" >> ${tmpdir}/para4
 done
 done
 
@@ -334,3 +314,4 @@ parallel -j 12 < ${tmpdir}/para4
 
 #rm -rf ${tmpdir}/*
 printtime $SCRIPT_NAME $start0 $start "Done!"
+
diff --git a/workflow/scripts/_filter_linear_spliced_readids_w_rid2jid.py b/workflow/scripts/_filter_linear_spliced_readids_w_rid2jid.py
index bc60a00..7bbace0 100644
--- a/workflow/scripts/_filter_linear_spliced_readids_w_rid2jid.py
+++ b/workflow/scripts/_filter_linear_spliced_readids_w_rid2jid.py
@@ -50,34 +50,29 @@ def main():
             jid=l[1]
             if jid==".":
                 print(">>>>>>>>jid is dot:",l)
-            if not jid in scount:
-                scount[jid]=dict()
-                lcount[jid]=dict()
-                scount[jid]["SS"]=0
-                scount[jid]["OS"]=0
-                scount[jid]["Unknown"]=0
-                lcount[jid]["SS"]=0
-                lcount[jid]["OS"]=0
-                lcount[jid]["Unknown"]=0
+            # jchr,jstart,jend,jstrand=jid.split("##")
+            # jid2="##".join([jchr,jstart,jend])
+            jid2=jid
+            if not jid2 in scount:
+                scount[jid2]=dict()
+                lcount[jid2]=dict()
+                scount[jid2]["+"]=0
+                scount[jid2]["-"]=0
+                scount[jid2]["."]=0
+                lcount[jid2]["+"]=0
+                lcount[jid2]["-"]=0
+                lcount[jid2]["."]=0
             if "##" in l[0]:
                 rid,rstrand=l[0].split("##")
             else:
                 rid=l[0]
                 rstrand="."
-            jstrand=jid.split("##")[-1]
-            if rstrand=="+" or rstrand=="-":
-                if rstrand==jstrand:
-                    strandinfo="OS"
-                else:
-                    strandinfo="SS"
-            else:
-                strandinfo="Unknown"
             if rid in linridlist:
                 linridlist[rid]+=1
-                lcount[jid][strandinfo]+=1
+                lcount[jid][rstrand]+=1
             if rid in sinridlist:
                 sinridlist[rid]+=1
-                scount[jid][strandinfo]+=1
+                scount[jid][rstrand]+=1
     rid2jid.close()
     with gzip.open(args.linearout,'wt') as outrl:
         for k,v in linridlist.items():
@@ -90,13 +85,14 @@ def main():
                 outrl.write("%s\n"%k)
     outrl.close()
     countout=open(args.jidcounts,'w')
-    countout.write("#chrom\tstart\tend\tstrand\tlinear_same_strand\tspliced_same_strand\tlinear_opposite_strand\tspliced_opposite_strand\tlinear_unknown_strand\tspliced_unknown_strand\n")
+    # countout.write("#chrom\tstart\tend\tlinear_+\tspliced_+\tlinear_-\tspliced_-\tlinear_.\tspliced_.\n")
+    countout.write("#chrom\tstart\tend\tstrand\tlinear_+\tspliced_+\tlinear_-\tspliced_-\tlinear_.\tspliced_.\n")
     for k in lcount.keys():
         v1=lcount[k]
         v2=scount[k]
         kstr=k.split("##")
         k="\t".join(kstr)
-        countout.write("%s\t%d\t%d\t%d\t%d\t%d\t%d\n"%(k,v1["SS"],v2["SS"],v1["OS"],v2["OS"],v1["Unknown"],v2["Unknown"]))
+        countout.write("%s\t%d\t%d\t%d\t%d\t%d\t%d\n"%(k,v1["+"],v2["+"],v1["-"],v2["-"],v1["."],v2["."]))
     countout.close()
 
 if __name__ == "__main__":
diff --git a/workflow/scripts/_make_master_counts_table.py b/workflow/scripts/_make_master_counts_table.py
index 19c00a8..1a81dbf 100644
--- a/workflow/scripts/_make_master_counts_table.py
+++ b/workflow/scripts/_make_master_counts_table.py
@@ -32,10 +32,10 @@ def main() :
         count += 1
         if count==1:
             outdf = pd.read_csv(f,sep="\t",header=0,compression='gzip')
-            outdf.set_index(['chrom', 'start', 'end', 'strand', 'flanking_sites', 'sample_name'])
+            outdf.set_index(['chrom', 'start', 'end', 'sample_name'])
         else:
             tmpdf = pd.read_csv(f,sep="\t",header=0,compression='gzip')
-            tmpdf.set_index(['chrom', 'start', 'end', 'strand', 'flanking_sites', 'sample_name'])
+            tmpdf.set_index(['chrom', 'start', 'end', 'sample_name'])
             outdf = pd.concat([outdf , tmpdf],axis=0,join="outer",sort=False)
     outdf.reset_index(drop=True,inplace=True)
     outdf.fillna(-1,inplace=True)
@@ -49,104 +49,8 @@ def main() :
     # print(strcols)
     outdf = _df_setcol_as_int(outdf,intcols)
     outdf = _df_setcol_as_str(outdf,strcols)
-    outdf = outdf.sort_values(by=['chrom','start','end','strand'])
+    outdf = outdf.sort_values(by=['chrom','start','end', 'sample_name'])
 
-    # Applying circExplorer_BWA fixes
-    indf=outdf
-    non_circExplorer_BWA_df=indf.loc[(indf['strand']!=".")]
-    ambiguous_strand_df=indf.loc[(indf['strand']==".") & (indf['circExplorer_bwa_read_count']>=args.minreads)]
-    # print(ambiguous_strand_df.head())
-
-    countlist = set(['circExplorer_read_count','ciri_read_count','findcirc_read_count','dcc_read_count','circrnafinder_read_count','mapsplice_read_count','nclscan_read_count'])
-    colnames = set(indf.columns)
-    excludelist = countlist - colnames
-    newcountlist = countlist - excludelist
-    # print("-2")
-    # non_circExplorer_BWA_df = non_circExplorer_BWA_df.reset_index()
-    lookup = dict()
-    for index,row in non_circExplorer_BWA_df.iterrows():
-        circID = row['chrom'] + "##" + str(row['start']) + "##" + str(row['end'])
-        if not circID in lookup:
-            lookup[circID] = dict()
-            lookup[circID]["+"] = dict()
-            lookup[circID]["+"]['sum'] = 0
-            lookup[circID]["-"] = dict()
-            lookup[circID]["-"]['sum'] = 0
-        for c in newcountlist:
-            if int(row[c]) != -1: 
-                lookup[circID][row['strand']]['sum'] += int(row[c])
-                lookup[circID][row['strand']]['flanking_sites'] = row['flanking_sites']
-    # print("-1")
-    # find winner
-    for k,v in lookup.items():
-        if v["+"]['sum'] >= v["-"]['sum']: 
-            lookup[k]['strand'] = "+"
-        else:
-            lookup[k]['strand'] = "-"
-    # print("0")
-    # ambiguous_strand_df = ambiguous_strand_df.reset_index()
-    for index,row in ambiguous_strand_df.iterrows():
-        circID = row['chrom'] + "##" + str(row['start']) + "##" + str(row['end'])
-        if not circID in lookup: continue
-        ambiguous_strand_df.at[index,'strand']=lookup[circID]['strand']
-        ambiguous_strand_df.at[index,'flanking_sites']=lookup[circID][lookup[circID]['strand']]['flanking_sites']
-
-    non_ambiguous_strand_df = ambiguous_strand_df.loc[ambiguous_strand_df['strand']!="."]
-    # print("1")
-    bwa_counts = dict()
-    circID2index = dict()
-    circIDlist=list()
-    for index2,row2 in non_ambiguous_strand_df.iterrows():
-        circID2 = row2['chrom'] + "##" + str(row2['start']) + "##" + str(row2['end']) + "##" + row2['sample_name']
-        circIDlist.append(circID2)
-        bwa_counts[circID2] = row2['circExplorer_bwa_read_count']
-        circID2index[circID2] = index2
-
-    non_ambiguous_strand_df.insert(0,'circID',circIDlist)
-
-    # print("2")
-    claimed=dict()
-    for index,row in non_circExplorer_BWA_df.iterrows():
-        circID = row['chrom'] + "##" + str(row['start']) + "##" + str(row['end']) + "##" + row['sample_name']
-        if circID in bwa_counts:
-            claimed[circID]=1
-            non_circExplorer_BWA_df.at[index,'circExplorer_bwa_read_count']=bwa_counts[circID]
-    # print("3")
-    set1=set(bwa_counts.keys())
-    set2=set(claimed.keys())
-    unclaimed=list(set1-set2)
-    # print("4")
-    # unclaimed_index=list()
-    # for c in unclaimed:
-    #     unclaimed_index.append(circID2index[c])
-    if len(unclaimed) > 0:
-        # unclaimed_non_ambiguous_strand_df = non_ambiguous_strand_df.iloc[unclaimed_index]
-        unclaimed_non_ambiguous_strand_df = non_ambiguous_strand_df.loc[non_ambiguous_strand_df['circID'].isin(unclaimed)]
-        unclaimed_non_ambiguous_strand_df_wo_circID = unclaimed_non_ambiguous_strand_df.drop(['circID'],axis=1)
-        outdf=pd.concat([non_circExplorer_BWA_df,unclaimed_non_ambiguous_strand_df_wo_circID])
-    else:
-        outdf=non_circExplorer_BWA_df
-    # print("5")
-
-    # countlist = set(['circExplorer_read_count','ciri_read_count','findcirc_read_count','dcc_read_count','circrnafinder_read_count','mapsplice_read_count','nclscan_read_count'])
-    # newcountlist
-    hqcountlist = set(['circExplorer_read_count','circExplorer_bwa_read_count'])
-    nonhqcountlist = newcountlist-hqcountlist
-
-    for index,row in outdf.iterrows():
-        hqcount=0
-        nonhqcount=0
-        for c in hqcountlist:
-            if row[c] >= args.minreads: hqcount+=1
-        for c in nonhqcountlist:
-            if row[c] >= args.minreads: nonhqcount+=1
-        outdf.at[index,'hqcounts']=hqcount
-        outdf.at[index,'nonhqcounts']=nonhqcount
-        outdf.at[index,'ntools']=hqcount+nonhqcount
-        if hqcount==len(hqcountlist) and nonhqcount >= 1: 
-            outdf.at[index,'HQ']="Y"
-        else:
-            outdf.at[index,'HQ']="N"
 
     intcols=['start','end','ntools']
     for c in outdf.columns:
@@ -155,7 +59,7 @@ def main() :
     strcols=list(set(outdf.columns)-set(intcols))
     outdf = _df_setcol_as_int(outdf,intcols)
     outdf = _df_setcol_as_str(outdf,strcols)
-    outdf = outdf.sort_values(by=['chrom','start','end','strand'])
+    outdf = outdf.sort_values(by=['chrom','start','end','sample_name'])
     outdf.to_csv(args.outfile,sep="\t",header=True,index=False,compression='gzip')
 
 if __name__ == "__main__":
diff --git a/workflow/scripts/_merge_circExplorer_found_counts.py b/workflow/scripts/_merge_circExplorer_found_counts.py
index 3ed2da8..245c260 100644
--- a/workflow/scripts/_merge_circExplorer_found_counts.py
+++ b/workflow/scripts/_merge_circExplorer_found_counts.py
@@ -2,6 +2,16 @@
 import sys
 import pandas
 
+def _df_setcol_as_int(df,collist):
+    for c in collist:
+        df[[c]]=df[[c]].astype(int)
+    return df
+
+def _df_setcol_as_str(df,collist):
+    for c in collist:
+        df[[c]]=df[[c]].astype(str)
+    return df
+
 def main():
     # debug = True
     debug = False
@@ -20,7 +30,11 @@ def main():
     print(bcounts.head())
     print(lcounts.head())
     mcounts = bcounts.merge(lcounts,how='outer',on=["#chrom","start","end","strand"])
-
+    strcols = [ '#chrom', 'strand' ]
+    intcols = list ( set(mcounts.columns) - set(strcols) )
+    mcounts.fillna(value=0,inplace=True)
+    mcounts = _df_setcol_as_str(mcounts,strcols)
+    mcounts = _df_setcol_as_int(mcounts,intcols)
     mcounts.to_csv(args.mergedcounts,index=False,doublequote=False,sep="\t")
 
 if __name__ == "__main__":
diff --git a/workflow/scripts/_merge_per_sample_counts_table.py b/workflow/scripts/_merge_per_sample_counts_table.py
index 7635f2b..5d44d38 100644
--- a/workflow/scripts/_merge_per_sample_counts_table.py
+++ b/workflow/scripts/_merge_per_sample_counts_table.py
@@ -19,39 +19,33 @@ def _df_setcol_as_float(df,collist):
         df[[c]]=df[[c]].astype(float)
     return df
 
+def _rev_comp(seq):
+    seq = seq.upper()
+    seq = seq.replace("A", "t").replace("C", "g").replace("T", "a").replace("G", "c")
+    seq = seq.upper()[::-1]
+    return seq
+
 class BSJ:
-    def __init__(self,chrom,start,end,strand):
+    def __init__(self,chrom,start,end,strand="+"):
         self.chrom=chrom
         self.start=int(start)
         self.end=int(end)
         self.strand=strand
         self.splice_site_flank_5="" #donor
         self.splice_site_flank_3="" #acceptor
-
-    def add_flanks(self,sequences):
-        if self.strand == '+':
-            coord = int(self.end)
-            seq = sequences[self.chrom][coord:coord+2]
-            self.splice_site_flank_5 = seq.upper()
-            coord = int(self.start)
-            seq = sequences[self.chrom][coord-2:coord]
-            self.splice_site_flank_3 = seq.upper()
-        elif self.strand == '-':
-            coord = int(self.end)
-            seq =  sequences[self.chrom][coord:coord+2]
-            seq = seq.upper()
-            seq = seq.replace("A", "t").replace("C", "g").replace("T", "a").replace("G", "c")
-            seq = seq.upper()[::-1]
-            self.splice_site_flank_3 = seq
-            coord = int(self.start)
-            seq = sequences[self.chrom][coord-2:coord]
-            seq = seq.upper()
-            seq = seq.replace("A", "t").replace("C", "g").replace("T", "a").replace("G", "c")
-            seq = seq.upper()[::-1]
-            self.splice_site_flank_5 = seq
     
-    def get_flanks(self):
-        return self.splice_site_flank_5+"##"+self.splice_site_flank_3
+    def add_flanks(self,sequences): # adds flanking assuming + strand
+        coord = int(self.end)
+        seq = sequences[self.chrom][coord:coord+2]
+        self.splice_site_flank_5 = seq.upper()
+        coord = int(self.start)
+        seq = sequences[self.chrom][coord-2:coord]
+        self.splice_site_flank_3 = seq.upper()
+    
+    def get_flanks(self): # returns + and - strand flanks
+        plus_strand = self.splice_site_flank_5+"##"+self.splice_site_flank_3
+        minus_strand = _rev_comp(self.splice_site_flank_5)+"##"+_rev_comp(self.splice_site_flank_3)
+        return plus_strand,minus_strand
 
 def main() :
     parser = argparse.ArgumentParser(description='Merge per sample Counts from different circRNA detection tools')
@@ -94,6 +88,7 @@ def main() :
 
     # load circExplorer
     circE=pandas.read_csv(args.circE,sep="\t",header=0)
+    print(circE.columns)
     # columns are:
     # | #  | Column                                   |
     # | -- | ---------------------------------------- |
@@ -104,35 +99,37 @@ def main() :
     # | 5  | known_novel                              |
     # | 6  | expected_BSJ_reads                       |
     # | 7  | found_BSJ_reads                          |
-    # | 8  | linear_same_strand                       |
-    # | 9  | spliced_same_strand                      |
-    # | 10 | linear_opposite_strand                   |
-    # | 11 | spliced_opposite_strand                  |
-    # | 12 | linear_unknown_strand                    |
-    # | 13 | spliced_unknown_strand                   |
-    circE['circRNA_id']=circE['#chrom'].astype(str)+"##"+circE['start'].astype(str)+"##"+circE['end'].astype(str)+"##"+circE['strand'].astype(str)
-    circE.rename({'known_novel' : 'circExplorer_annotation',
-                'expected_BSJ_reads' : 'circExplorer_read_count',
-                'found_BSJ_reads' : 'circExplorer_found_BSJcounts',
-                'linear_same_strand' : 'circExplorer_found_linear_BSJ_same_strand_counts',
-                'spliced_same_strand' : 'circExplorer_found_linear_spliced_BSJ_same_strand_counts',
-                'linear_opposite_strand' : 'circExplorer_found_linear_BSJ_opposite_strand_counts',
-                'spliced_opposite_strand' : 'circExplorer_found_linear_spliced_BSJ_opposite_strand_counts',
-                'linear_unknown_strand' : 'circExplorer_found_linear_BSJ_unknown_strand_counts',
-                'spliced_unknown_strand' : 'circExplorer_found_linear_spliced_BSJ_unknown_strand_counts'}, axis=1, inplace=True)
-    circE.drop(['#chrom','start', 'end','strand'], axis = 1,inplace=True)
+    # | 8  | linear_+                       |
+    # | 9  | spliced_+                      |
+    # | 10 | linear_-                   |
+    # | 11 | spliced_-                  |
+    # | 12 | linear_.                    |
+    # | 13 | spliced_.                   |
+    circE['circRNA_id']=circE['#chrom'].astype(str)+"##"+circE['start'].astype(str)+"##"+circE['end'].astype(str)
+    circE.rename({'strand'              : 'circExplorer_strand',
+                  'known_novel'         : 'circExplorer_annotation',
+                'expected_BSJ_reads'    : 'circExplorer_read_count',
+                'found_BSJ_reads'       : 'circExplorer_found_BSJcounts',
+                'linear_+'              : 'circExplorer_found_linear_BSJ_+_counts',
+                'spliced_+'             : 'circExplorer_found_linear_spliced_BSJ_+_counts',
+                'linear_-'              : 'circExplorer_found_linear_BSJ_-_counts',
+                'spliced_-'             : 'circExplorer_found_linear_spliced_BSJ_-_counts',
+                'linear_.'              : 'circExplorer_found_linear_BSJ_._counts',
+                'spliced_.'             : 'circExplorer_found_linear_spliced_BSJ_._counts'}, axis=1, inplace=True)
+    circE.drop(['#chrom','start', 'end'], axis = 1,inplace=True)
     circE.set_index(['circRNA_id'],inplace=True,drop=True)
     
     circE.fillna(value=-1,inplace=True)
+    print(circE.columns)
 
     intcols = [ 'circExplorer_read_count', 
                 'circExplorer_found_BSJcounts', 
-                'circExplorer_found_linear_BSJ_same_strand_counts', 
-                'circExplorer_found_linear_spliced_BSJ_same_strand_counts', 
-                'circExplorer_found_linear_BSJ_opposite_strand_counts', 
-                'circExplorer_found_linear_spliced_BSJ_opposite_strand_counts', 
-                'circExplorer_found_linear_BSJ_unknown_strand_counts', 
-                'circExplorer_found_linear_spliced_BSJ_unknown_strand_counts' ]
+                'circExplorer_found_linear_BSJ_+_counts', 
+                'circExplorer_found_linear_spliced_BSJ_+_counts', 
+                'circExplorer_found_linear_BSJ_-_counts', 
+                'circExplorer_found_linear_spliced_BSJ_-_counts', 
+                'circExplorer_found_linear_BSJ_._counts', 
+                'circExplorer_found_linear_spliced_BSJ_._counts' ]
     strcols = list ( set(circE.columns) - set(intcols) )
     circE = _df_setcol_as_int(circE,intcols)
     circE = _df_setcol_as_str(circE,strcols)
@@ -147,10 +144,11 @@ def main() :
     # circExplorer2 with BWA
 
     circEbwa=pandas.read_csv(args.circEbwa,sep="\t",header=0)
-    circEbwa['circRNA_id']=circEbwa['#chrom'].astype(str)+"##"+circEbwa['start'].astype(str)+"##"+circEbwa['end'].astype(str)+"##"+circEbwa['strand'].astype(str)
-    circEbwa.rename({'known_novel' : 'circExplorer_bwa_annotation',
-                    'read_count' : 'circExplorer_bwa_read_count'}, axis=1, inplace=True)
-    circEbwa.drop(['#chrom','start', 'end','strand'], axis = 1,inplace=True)
+    circEbwa['circRNA_id']=circEbwa['#chrom'].astype(str)+"##"+circEbwa['start'].astype(str)+"##"+circEbwa['end'].astype(str)
+    circEbwa.rename({'strand'       : 'circExplorer_bwa_strand',
+                     'known_novel'  : 'circExplorer_bwa_annotation',
+                     'read_count'   : 'circExplorer_bwa_read_count'}, axis=1, inplace=True)
+    circEbwa.drop(['#chrom','start', 'end'], axis = 1,inplace=True)
     circEbwa.set_index(['circRNA_id'],inplace=True,drop=True)
     
     circEbwa.fillna(value=-1,inplace=True)
@@ -185,11 +183,12 @@ def main() :
     # | 11 | strand               | strand info of a predicted circRNAs (new in CIRI2)                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
     # | 12 | junction_reads_ID    | all of the circular junction read IDs (split by ",")   
     ciri["circRNA_start"]=ciri["circRNA_start"].astype(int)-1
-    ciri['circRNA_id']=ciri['chr'].astype(str)+"##"+ciri['circRNA_start'].astype(str)+"##"+ciri['circRNA_end'].astype(str)+"##"+ciri['strand'].astype(str)
-    ciri.rename({   '#junction_reads': 'ciri_read_count', 
-                    '#non_junction_reads' : 'ciri_linear_read_count', 
-                    'circRNA_type': 'ciri_annotation'}, axis=1, inplace=True)
-    ciri.drop(['chr','circRNA_start', 'circRNA_end','strand'], axis = 1,inplace=True)
+    ciri['circRNA_id']=ciri['chr'].astype(str)+"##"+ciri['circRNA_start'].astype(str)+"##"+ciri['circRNA_end'].astype(str)
+    ciri.rename({   'strand'                : 'ciri_strand',
+                    '#junction_reads'       : 'ciri_read_count', 
+                    '#non_junction_reads'   : 'ciri_linear_read_count', 
+                    'circRNA_type'          : 'ciri_annotation'}, axis=1, inplace=True)
+    ciri.drop(['chr','circRNA_start', 'circRNA_end'], axis = 1,inplace=True)
     ciri.set_index(['circRNA_id'],inplace=True,drop=True)
 
     ciri.fillna(value=-1,inplace=True)
@@ -208,6 +207,7 @@ def main() :
 
     if args.findcirc:
         findcirc=pandas.read_csv(args.findcirc,sep="\t",header=0)
+        print(findcirc.columns)
 # add find_circ
 # | #  | short_name      | description
 # | -- | --------------- | ---------------------------------------------------------------------------------------------------------------- |
@@ -229,9 +229,10 @@ def main() :
 # | 16 | signal          | flanking dinucleotide splice signal (normally GT/AG)                                                             |
 # | 17 | strandmatch     | 'MATCH', 'MISMATCH' or 'NA' for non-stranded analysis                                                            |
 # | 18 | category        | list of keywords describing the junction. Useful for quick grep filtering                                        |
-        findcirc['circRNA_id']=findcirc['chrom'].astype(str)+"##"+findcirc['start'].astype(str)+"##"+findcirc['end'].astype(str)+"##"+findcirc['strand'].astype(str)
-        findcirc = findcirc.loc[:, ['circRNA_id', 'n_reads']]
-        findcirc.rename({'n_reads': 'findcirc_read_count'}, axis=1, inplace=True)
+        findcirc['circRNA_id']=findcirc['chrom'].astype(str)+"##"+findcirc['start'].astype(str)+"##"+findcirc['end'].astype(str)
+        findcirc = findcirc.loc[:, ['circRNA_id', 'n_reads', 'strand']]
+        findcirc.rename({ 'strand'      : 'findcirc_strand',
+                          'n_reads'     : 'findcirc_read_count'}, axis=1, inplace=True)
         findcirc.set_index(['circRNA_id'],inplace=True,drop=True)
 
         findcirc.fillna(value=-1,inplace=True)
@@ -240,7 +241,6 @@ def main() :
         strcols = list ( set(findcirc.columns) - set(intcols) )
         findcirc = _df_setcol_as_int(findcirc,intcols)
         if len(strcols) > 0: findcirc = _df_setcol_as_str(findcirc,strcols)    
-
         dfs.append(findcirc)
         if "findcirc".lower() in hqcc: 
             required_hqcols.append("findcirc_read_count")
@@ -261,11 +261,12 @@ def main() :
         # | 6 | linear_read_count|
         # | 7 | dcc_annotation | --> this is gene##JunctionType##Start-End Region from CircCoordinates file
         dcc["start"]=dcc["start"].astype(int)-1
-        dcc['circRNA_id']=dcc['chr'].astype(str)+"##"+dcc['start'].astype(str)+"##"+dcc['end'].astype(str)+"##"+dcc['strand'].astype(str)
+        dcc['circRNA_id']=dcc['chr'].astype(str)+"##"+dcc['start'].astype(str)+"##"+dcc['end'].astype(str)
+        dcc.rename({'strand': 'dcc_strand'}, axis=1, inplace=True)
         dcc.rename({'read_count': 'dcc_read_count'}, axis=1, inplace=True)
         dcc.rename({'linear_read_count': 'dcc_linear_read_count'}, axis=1, inplace=True)
         dcc[['dcc_gene', 'dcc_junction_type', 'dcc_annotation2']] = dcc['dcc_annotation'].apply(lambda x: pandas.Series(str(x).split("##")))
-        dcc.drop(['chr','start', 'end','strand','dcc_annotation'], axis = 1,inplace=True)
+        dcc.drop(['chr','start', 'end','dcc_annotation'], axis = 1,inplace=True)
         dcc.rename({'dcc_annotation2': 'dcc_annotation'}, axis=1, inplace=True)
         dcc.set_index(['circRNA_id'],inplace=True,drop=True)
 
@@ -297,10 +298,11 @@ def main() :
         # | 6 | mapsplice_annotation | normal##2.811419 | <--"fusion_type"##"entropy" 
         # "fusion_type" is either "normal" or "overlapping" ... higher "entropy" values are better!
         mapsplice["start"]=mapsplice["start"].astype(int)-1
-        mapsplice['circRNA_id']=mapsplice['chrom'].astype(str)+"##"+mapsplice['start'].astype(str)+"##"+mapsplice['end'].astype(str)+"##"+mapsplice['strand'].astype(str)
+        mapsplice['circRNA_id']=mapsplice['chrom'].astype(str)+"##"+mapsplice['start'].astype(str)+"##"+mapsplice['end'].astype(str)
+        mapsplice.rename({'strand': 'mapsplice_strand'}, axis=1, inplace=True)
         mapsplice.rename({'read_count': 'mapsplice_read_count'}, axis=1, inplace=True)
         mapsplice[['mapsplice_annotation2', 'mapsplice_entropy']] = mapsplice['mapsplice_annotation'].apply(lambda x: pandas.Series(str(x).split("##")))
-        mapsplice.drop(['chrom','start', 'end','strand','mapsplice_annotation'], axis = 1,inplace=True)
+        mapsplice.drop(['chrom','start', 'end','mapsplice_annotation'], axis = 1,inplace=True)
         mapsplice.rename({'mapsplice_annotation2': 'mapsplice_annotation'}, axis=1, inplace=True)
         mapsplice.set_index(['circRNA_id'],inplace=True,drop=True)
 
@@ -335,9 +337,10 @@ def main() :
         if nclscan.shape[0]==0: includenclscan=False
         if includenclscan:
             nclscan["start"]=nclscan["start"].astype(int)-1
-            nclscan['circRNA_id']=nclscan['chrom'].astype(str)+"##"+nclscan['start'].astype(str)+"##"+nclscan['end'].astype(str)+"##"+nclscan['strand'].astype(str)
+            nclscan['circRNA_id']=nclscan['chrom'].astype(str)+"##"+nclscan['start'].astype(str)+"##"+nclscan['end'].astype(str)
+            nclscan.rename({'strand': 'nclscan_strand'}, axis=1, inplace=True)
             nclscan.rename({'read_count': 'nclscan_read_count'}, axis=1, inplace=True)
-            nclscan.drop(['chrom','start', 'end','strand'], axis = 1,inplace=True)
+            nclscan.drop(['chrom','start', 'end'], axis = 1,inplace=True)
             nclscan = _df_setcol_as_str(nclscan,['nclscan_annotation'])
             nclscan.loc[nclscan['nclscan_annotation']=="1", 'nclscan_annotation'] = "Intragenic"
             nclscan.loc[nclscan['nclscan_annotation']=="0", 'nclscan_annotation'] = "Intergenic"
@@ -367,9 +370,10 @@ def main() :
         # | 3 | end                  | 1223968          |
         # | 4 | strand               | -                |
         # | 5 | read_count           | 26               |
-        circrnafinder['circRNA_id']=circrnafinder['chr'].astype(str)+"##"+circrnafinder['start'].astype(str)+"##"+circrnafinder['end'].astype(str)+"##"+circrnafinder['strand'].astype(str)
+        circrnafinder['circRNA_id']=circrnafinder['chr'].astype(str)+"##"+circrnafinder['start'].astype(str)+"##"+circrnafinder['end'].astype(str)
+        circrnafinder.rename({'strand': 'circrnafinder_strand'}, axis=1, inplace=True)
         circrnafinder.rename({'read_count': 'circrnafinder_read_count'}, axis=1, inplace=True)
-        circrnafinder.drop(['chr','start', 'end','strand'], axis = 1,inplace=True)
+        circrnafinder.drop(['chr','start', 'end'], axis = 1,inplace=True)
         circrnafinder.set_index(['circRNA_id'],inplace=True,drop=True)
 
         circrnafinder.fillna(value=-1,inplace=True)
@@ -418,6 +422,9 @@ def main() :
             df['circRNA_id']=df.index
             df.reset_index(inplace=True,drop=True)
             merged_counts=pandas.merge(merged_counts,df,how='outer',on=['circRNA_id'])
+    
+    print(merged_counts.columns)
+    
     # merged_counts.set_index(['circRNA_id'],inplace=True,drop=True)
 
     merged_counts.fillna(-1,inplace=True)
@@ -428,14 +435,16 @@ def main() :
 
     annotation_cols=['circExplorer_annotation','ciri_annotation']
     floatcols = []
+    strand_cols = ['circExplorer_strand','circExplorer_bwa_strand','ciri_strand']
+
     intcols = [ 'circExplorer_read_count', 
                 'circExplorer_found_BSJcounts', 
-                'circExplorer_found_linear_BSJ_same_strand_counts', 
-                'circExplorer_found_linear_spliced_BSJ_same_strand_counts', 
-                'circExplorer_found_linear_BSJ_opposite_strand_counts', 
-                'circExplorer_found_linear_spliced_BSJ_opposite_strand_counts', 
-                'circExplorer_found_linear_BSJ_unknown_strand_counts', 
-                'circExplorer_found_linear_spliced_BSJ_unknown_strand_counts' ]
+                'circExplorer_found_linear_BSJ_+_counts', 
+                'circExplorer_found_linear_spliced_BSJ_+_counts', 
+                'circExplorer_found_linear_BSJ_-_counts', 
+                'circExplorer_found_linear_spliced_BSJ_-_counts', 
+                'circExplorer_found_linear_BSJ_._counts', 
+                'circExplorer_found_linear_spliced_BSJ_._counts' ]
 
     intcols.extend([ 'ciri_read_count',
                 'ciri_linear_read_count' ])
@@ -445,23 +454,28 @@ def main() :
 
     if args.findcirc:
         intcols.extend(['findcirc_read_count'])
+        strand_cols.append('findcirc_strand')
     
     if args.dcc:
         intcols.extend([ 'dcc_read_count',
                     'dcc_linear_read_count' ])
         annotation_cols.extend(['dcc_gene','dcc_junction_type','dcc_annotation'])
+        strand_cols.append('dcc_strand')
     
     if args.mapsplice:
         intcols.extend([ 'mapsplice_read_count' ])
         floatcols.extend([ 'mapsplice_entropy' ])
         annotation_cols.extend(['mapsplice_annotation'])
+        strand_cols.append('mapsplice_strand')
 
     if args.nclscan and includenclscan:
         intcols.extend([ 'nclscan_read_count' ])
         annotation_cols.extend(['nclscan_annotation'])
+        strand_cols.append('nclscan_strand')
     
     if args.circrnafinder:
         intcols.extend(['circrnafinder_read_count'])
+        strand_cols.append('circrnafinder_strand')
 
     intcols.extend(['ntools'])
     intcols.extend(['hqcounts','nonhqcounts'])
@@ -491,35 +505,41 @@ def main() :
     if args.mapsplice: merged_counts.loc[merged_counts['mapsplice_read_count'] >= args.minreads, 'ntools'] += 1
     if args.nclscan and includenclscan: merged_counts.loc[merged_counts['nclscan_read_count'] >= args.minreads, 'ntools'] += 1
     if args.circrnafinder: merged_counts.loc[merged_counts['circrnafinder_read_count'] >= args.minreads, 'ntools'] += 1
-    merged_counts[['chrom', 'start', 'end', 'strand']] = merged_counts['circRNA_id'].str.split('##', expand=True)
+    merged_counts[['chrom', 'start', 'end']] = merged_counts['circRNA_id'].str.split('##', expand=True)
  
     merged_counts=_df_setcol_as_int(merged_counts,['start','end','ntools'])
-    merged_counts=_df_setcol_as_str(merged_counts,['chrom','strand'])
+    merged_counts=_df_setcol_as_str(merged_counts,['chrom'])
 
     # adding flanking sites
-    merged_counts['flanking_sites']="-1"
+    merged_counts['flanking_sites_+']="-1"
+    merged_counts['flanking_sites_-']="-1"
 
     sequences = dict((s[1], s[0]) for s in HTSeq.FastaReader(args.reffa, raw_iterator=True))
     for index, row in merged_counts.iterrows():
-        bsj = BSJ(chrom=row['chrom'],start=row['start'],end=row['end'],strand=row['strand'])
+        bsj = BSJ(chrom=row['chrom'],start=row['start'],end=row['end'])
         bsj.add_flanks(sequences)
-        merged_counts.loc[index, 'flanking_sites'] = bsj.get_flanks()
+        plus_flank, minus_flank = bsj.get_flanks()
+        merged_counts.loc[index, 'flanking_sites_+'] = plus_flank
+        merged_counts.loc[index, 'flanking_sites_-'] = minus_flank
 
     # add samplename
     merged_counts['sample_name'] = args.samplename
-    merged_counts=_df_setcol_as_str(merged_counts,['sample_name','flanking_sites'])
+    merged_counts=_df_setcol_as_str(merged_counts,['sample_name','flanking_sites_+','flanking_sites_-'])
+    print(merged_counts.columns)
 
     # prepare output ... reorder columns
-    outcols=['chrom', 'start', 'end', 'strand', 'flanking_sites', 'sample_name', 'ntools', 'HQ']
+    outcols=['chrom', 'start', 'end']
+    outcols.extend(strand_cols)
+    outcols.extend(['flanking_sites_+','flanking_sites_-', 'sample_name', 'ntools', 'HQ'])
     # add circExplorer columns
     outcols.extend(['circExplorer_read_count',
                     'circExplorer_found_BSJcounts', 
-                    'circExplorer_found_linear_BSJ_same_strand_counts', 
-                    'circExplorer_found_linear_spliced_BSJ_same_strand_counts', 
-                    'circExplorer_found_linear_BSJ_opposite_strand_counts', 
-                    'circExplorer_found_linear_spliced_BSJ_opposite_strand_counts', 
-                    'circExplorer_found_linear_BSJ_unknown_strand_counts', 
-                    'circExplorer_found_linear_spliced_BSJ_unknown_strand_counts'])
+                    'circExplorer_found_linear_BSJ_+_counts', 
+                    'circExplorer_found_linear_spliced_BSJ_+_counts', 
+                    'circExplorer_found_linear_BSJ_-_counts', 
+                    'circExplorer_found_linear_spliced_BSJ_-_counts', 
+                    'circExplorer_found_linear_BSJ_._counts', 
+                    'circExplorer_found_linear_spliced_BSJ_._counts'])
     # add ciri columns
     outcols.extend(['ciri_read_count',
                     'ciri_linear_read_count'])
diff --git a/workflow/scripts/create_circExplorer_per_sample_counts_table.py b/workflow/scripts/create_circExplorer_per_sample_counts_table.py
index 458d295..a60e321 100644
--- a/workflow/scripts/create_circExplorer_per_sample_counts_table.py
+++ b/workflow/scripts/create_circExplorer_per_sample_counts_table.py
@@ -15,6 +15,17 @@
 # 10      linear_BSJ_reads_opposite_strand
 # 11      linear_spliced_BSJ_reads_opposite_strand
 
+
+def _df_setcol_as_int(df,collist):
+    for c in collist:
+        df[[c]]=df[[c]].astype(int)
+    return df
+
+def _df_setcol_as_str(df,collist):
+    for c in collist:
+        df[[c]]=df[[c]].astype(str)
+    return df
+
 def main():
     # debug = True
     debug = False
@@ -33,7 +44,11 @@ def main():
     # print(bcounts.head())
     # print(lcounts.head())
     mcounts = bcounts.merge(lcounts,how='outer',on=["#chrom","start","end","strand"])
-
+    mcounts.fillna(value=0,inplace=True)
+    strcols = [ '#chrom', 'strand', 'known_novel' ]
+    intcols = list ( set(mcounts.columns) - set(strcols) )
+    mcounts = _df_setcol_as_str(mcounts,strcols)
+    mcounts = _df_setcol_as_int(mcounts,intcols)
     mcounts.drop(["read_count"],axis=1,inplace=True)
     mcounts.to_csv(args.mergedcounts,index=False,doublequote=False,sep="\t")