Merge pull request #40 from genxnetwork/fix/shared_genome_proportion

Fixed shared genome proportions
genxnetwork · Dec 24, 2021 · df2353e · df2353e
2 parents 0ab4c18 + 9d14737
commit df2353e
Show file tree

Hide file tree

Showing 6 changed files with 118 additions and 67 deletions.
diff --git a/readme.md b/readme.md
@@ -175,7 +175,7 @@ g1-b1-i1 g3-b1-i1     2           2                 0.2369          0.1163
  * `shared_ancestors` is the most likeliest number of shared ancestors, if it is 0, then one relative is a direct descendant of the other, 
  if 1 then they probably have one common ancestor, i.e. half siblings, if 2 then they have common mother and father, for example.  
  * `final_degree` is simply `king_degree` for close relatives up to 3rd degree and `ersa_degree` for distant relatives.
- * `total_seg_len` is the total length of all IBD segments, for the first 3 degrees it is calculated using KING IBD data, 
+ * `total_seg_len` is the total length of all IBD1 segments, for the first 3 degrees it is calculated using KING IBD data, 
    for the 4th+ degrees it is calculated using IBID or Germline IBD data.
  * `seg_count` is the total number of all IBD segments found by KING for the first 3 degrees and found by IBIS\Germline for the 4th+ degrees.
 

diff --git a/rules/relatives_ibis.smk b/rules/relatives_ibis.smk
@@ -64,10 +64,18 @@ rule ersa:
         
         FILES="{input.ibd}"
         TEMPFILE=ersa/temp_relatives.tsv
-
+        rm -f $TEMPFILE
+        rm -f {output}
+        
         for input_file in $FILES; do
+
             ersa --avuncular-adj -ci --alpha {params.alpha} --dmax 14 -t $ERSA_T -l $ERSA_L -th $ERSA_TH $input_file -o $TEMPFILE  |& tee {log}
-            cat $TEMPFILE >> {output}
+
+            if [[ "$input_file" == "${{FILES[0]}}" ]]; then
+                cat $TEMPFILE >> {output}
+            else
+                sed 1d $TEMPFILE >> {output}
+            fi
         done
         """
 

diff --git a/rules/relatives_ibis_king.smk b/rules/relatives_ibis_king.smk
@@ -104,10 +104,18 @@ rule ersa:
         
         FILES="{input.ibd}"
         TEMPFILE=ersa/temp_relatives.tsv
-
+        rm -f $TEMPFILE
+        rm -f {output}
+        
         for input_file in $FILES; do
+
             ersa --avuncular-adj -ci --alpha {params.alpha} --dmax 14 -t $ERSA_T -l $ERSA_L -th $ERSA_TH $input_file -o $TEMPFILE  |& tee {log}
-            cat $TEMPFILE >> {output}
+
+            if [[ "$input_file" == "${{FILES[0]}}" ]]; then
+                cat $TEMPFILE >> {output}
+            else
+                sed 1d $TEMPFILE >> {output}
+            fi
         done
         """
 
@@ -126,7 +134,6 @@ rule merge_king_ersa:
     input:
         king=rules.run_king.output['king'],
         king_segments=rules.run_king.output['segments'],
-        bucket_dir=directory('ibd'),
         ersa=rules.ersa.output[0],
         kinship=rules.run_king.output['kinship'],
         kinship0=rules.run_king.output['kinship0'],

diff --git a/scripts/merge_king_ersa.py b/scripts/merge_king_ersa.py
@@ -95,8 +95,8 @@ def read_king(king_path):
         data.loc[:, 'king_degree'] = map_king_degree(data.InfType)
         data.loc[:, 'king_relation'] = map_king_relation(data.InfType)
         data.loc[:, 'king_degree'] = data.king_degree.astype(float).astype(pandas.Int32Dtype())
-        data.rename({'PropIBD': 'shared_genome_proportion'}, axis='columns', inplace=True)
-        indexed = data.loc[:, ['id1', 'id2', 'king_degree', 'king_relation', 'shared_genome_proportion']].\
+        data.rename({'PropIBD': 'shared_genome_proportion', 'IBD1Seg': 'king_ibd1_prop', 'IBD2Seg': 'king_ibd2_prop'}, axis='columns', inplace=True)
+        indexed = data.loc[:, ['id1', 'id2', 'king_degree', 'king_relation', 'king_ibd1_prop', 'king_ibd2_prop', 'shared_genome_proportion']].\
             set_index(['id1', 'id2'])
         return indexed
 
@@ -106,6 +106,8 @@ def read_king(king_path):
                                          'id2',
                                          'king_degree',
                                          'king_relation',
+                                         'king_ibd1_prop',
+                                         'king_ibd2_prop',
                                          'shared_genome_proportion']).set_index(['id1', 'id2'])
 
 
@@ -228,8 +230,8 @@ def read_ersa(ersa_path):
     # Indv_1     Indv_2      Rel_est1      Rel_est2      d_est     lower_d  upper_d     N_seg     Tot_cM
     data = pandas.read_table(ersa_path, header=0,
                              names=['id1', 'id2', 'rel_est1', 'rel_est2',
-                                    'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'seg_count', 'total_seg_len'],
-                             dtype={'ersa_degree': str})
+                                    'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'seg_count_ersa', 'total_seg_len_ersa'],
+                             dtype={'ersa_degree': str, 'seg_count_ersa': str, 'total_seg_len_ersa': str})
 
     data = data.loc[(data.rel_est1 != 'NA') | (data.rel_est2 != 'NA'), :]
     data.loc[:, 'id1'] = data.id1.str.strip()
@@ -241,18 +243,20 @@ def read_ersa(ersa_path):
         pandas.Int32Dtype())
     data.loc[:, 'ersa_upper_bound'] = pandas.to_numeric(data.ersa_upper_bound.str.strip(), errors='coerce').astype(
         pandas.Int32Dtype())
+    print('dtypes are: ', data.dtypes)
+    data.loc[:, 'seg_count_ersa'] = pandas.to_numeric(data.seg_count_ersa.str.strip(), errors='coerce').astype(
+        pandas.Int32Dtype())
+    data.loc[:, 'total_seg_len_ersa'] = pandas.to_numeric(data.total_seg_len_ersa.str.strip(), errors='coerce')
 
     data.loc[data.ersa_lower_bound != 1, 'ersa_lower_bound'] = data.ersa_lower_bound - 1
     data.loc[data.ersa_upper_bound != 1, 'ersa_upper_bound'] = data.ersa_upper_bound - 1
 
-    print(f'Bad degrees are {(data.ersa_lower_bound > data.ersa_degree).sum()}')
-    print(Counter(data.ersa_degree[data.ersa_lower_bound > data.ersa_degree]))
     logging.info(f'read {data.shape[0]} pairs from ersa output')
     logging.info(f'ersa after reading has {pandas.notna(data.ersa_degree).sum()}')
     data.loc[:, 'is_niece_aunt'] = [True if 'Niece' in d else False for d in data.rel_est1]
     logging.info(f'we have total of {data.is_niece_aunt.sum()} possible Niece/Aunt relationships')
-    print(f'we have total of {data.is_niece_aunt.sum()} possible Niece/Aunt relationships')
-    return data.loc[data.id1 != data.id2, ['id1', 'id2', 'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'is_niece_aunt']].\
+
+    return data.loc[data.id1 != data.id2, ['id1', 'id2', 'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'is_niece_aunt', 'seg_count_ersa', 'total_seg_len_ersa']].\
         set_index(['id1', 'id2'])
 
 
@@ -294,8 +298,7 @@ def read_ersa2(ersa_path):
         test_dir = '/media_ssd/pipeline_data/TF-CEU-TRIBES-ibis-king-2'
         Snakemake = namedtuple('Snakemake', ['input', 'output', 'params', 'log'])
         snakemake = Snakemake(
-            input={'bucket_dir': f'{test_dir}/ibd',
-                   'king': f'{test_dir}/king/data.seg',
+            input={'king': f'{test_dir}/king/data.seg',
                    'king_segments': f'{test_dir}/king/data.segments.gz',
                    'kinship': f'{test_dir}/king/data.kin',
                    'kinship0': f'{test_dir}/king/data.kin0',
@@ -308,7 +311,6 @@ def read_ersa2(ersa_path):
 
     logging.basicConfig(filename=snakemake.log[0], level=logging.DEBUG, format='%(levelname)s:%(asctime)s %(message)s')
 
-    ibd_path = snakemake.input['bucket_dir']
     king_path = snakemake.input['king']
     king_segments_path = snakemake.input['king_segments']
     # within families
@@ -319,38 +321,33 @@ def read_ersa2(ersa_path):
     map_dir = snakemake.params['cm_dir']
     output_path = snakemake.output[0]
 
-    ibd = read_bucket_dir(ibd_path)
     king = read_king(king_path)
     kinship = read_kinship(kinship_path, kinship0_path)
     king_segments = read_king_segments_chunked(king_segments_path, map_dir)
     ersa = read_ersa(ersa_path)
 
-    logging.info(f'ibd shape: {ibd.shape[0]}, ersa shape: {ersa.shape[0]}')
-    # print('ibd test:',  ibd[('GRC12118091', 'GRC12118096')])
-    # print('ibd test2:', ibd[('GRC12118096', 'GRC12118091')])
-
-    relatives = ibd.merge(king, how='outer', left_index=True, right_index=True).\
-                    merge(kinship, how='outer', left_index=True, right_index=True).\
-                    merge(ersa, how='outer', left_index=True, right_index=True).\
-                    merge(king_segments, how='outer', left_index=True, right_index=True)
+    relatives = king.merge(kinship, how='outer', left_index=True, right_index=True).\
+                     merge(ersa, how='outer', left_index=True, right_index=True).\
+                     merge(king_segments, how='outer', left_index=True, right_index=True)
 
     prefer_ersa_mask = pandas.isnull(relatives.king_degree) | (relatives.king_degree > 3)
     relatives.loc[:, 'final_degree'] = relatives.king_degree
     # if king is unsure or king degree > 3 then we use ersa distant relatives estimation
     relatives.loc[prefer_ersa_mask, 'final_degree'] = relatives.ersa_degree
     logging.info(f'king is null or more than 3: {prefer_ersa_mask.sum()}')
     logging.info(f'ersa is not null: {pandas.notna(relatives.ersa_degree).sum()}')
-
+    print(f'relatives columns are {relatives.columns}')
     if 'total_seg_len_king' in relatives.columns:
-        relatives.loc[:, 'total_seg_len'] = relatives.total_seg_len_king
+        relatives.loc[:, 'total_seg_len'] = relatives.total_seg_len_king*relatives.king_ibd1_prop
+        relatives.loc[:, 'total_seg_len_ibd2'] = relatives.total_seg_len_king*relatives.king_ibd2_prop
         relatives.loc[:, 'seg_count'] = relatives.seg_count_king
 
-    relatives.loc[prefer_ersa_mask, 'total_seg_len'] = relatives.total_seg_len_germline
-    relatives.loc[prefer_ersa_mask, 'seg_count'] = relatives.seg_count_germline
-
+    relatives.loc[prefer_ersa_mask, 'total_seg_len'] = relatives.total_seg_len_ersa
+    relatives.loc[prefer_ersa_mask, 'seg_count'] = relatives.seg_count_ersa
+    print('is na: ', pandas.isna(relatives.loc[prefer_ersa_mask, 'total_seg_len_ersa']).sum())
     # approximate calculations, IBD share is really small in this case
-    relatives.loc[prefer_ersa_mask, 'shared_genome_proportion'] = 0.5*relatives.loc[prefer_ersa_mask, 'total_seg_len'] / 3540
-    relatives.drop(['total_seg_len_king', 'seg_count_king', 'total_seg_len_germline', 'seg_count_germline'],
+    relatives.loc[prefer_ersa_mask, 'shared_genome_proportion'] = 0.5*relatives.loc[prefer_ersa_mask, 'total_seg_len'].values / 3440
+    relatives.drop(['total_seg_len_king', 'seg_count_king', 'total_seg_len_ersa', 'seg_count_ersa', 'king_ibd1_prop', 'king_ibd2_prop'],
                    axis='columns', inplace=True)
 
     logging.info(f'final degree not null: {pandas.notna(relatives.final_degree).sum()}')

diff --git a/scripts/postprocess_ersa.py b/scripts/postprocess_ersa.py
@@ -95,27 +95,38 @@ def read_ersa(ersa_path):
     relatives = ibd.merge(ersa, how='outer', left_index=True, right_index=True)
 
     # It is impossible to have more than 50% of ibd2 segments unless you are monozygotic twins or duplicates.
-    dup_mask = relatives.total_seg_len_ibd2 > 1800
-    fs_mask = (relatives.total_seg_len > 2100) & (relatives.total_seg_len_ibd2 > 450) & (~dup_mask)
-    po_mask = (relatives.total_seg_len / 3540 > 0.8) & (~fs_mask) & (~dup_mask)
+    GENOME_CM_LEN = 3440
+    DUPLICATES_THRESHOLD = 1750
+    FS_TOTAL_THRESHOLD = 2100
+    FS_IBD2_THRESHOLD = 450
+    dup_mask = relatives.total_seg_len_ibd2 > DUPLICATES_THRESHOLD
+    fs_mask = (relatives.total_seg_len > FS_TOTAL_THRESHOLD) & (relatives.total_seg_len_ibd2 > FS_IBD2_THRESHOLD) & (~dup_mask)
+    po_mask = (relatives.total_seg_len / GENOME_CM_LEN > 0.8) & (~fs_mask) & (~dup_mask)
     print(f'We have {sum(dup_mask)} duplicates, {sum(fs_mask)} full siblings and {sum(po_mask)} parent-offspring relationships')
 
     relatives.loc[:, 'final_degree'] = relatives.ersa_degree
     relatives.loc[dup_mask, 'final_degree'] = 0
     relatives.loc[po_mask, 'final_degree'] = 1
 
-    relatives.loc[(~po_mask) & (relatives.final_degree == 1)] = 2  # to eliminate some ersa false positives
+    relatives.loc[(~po_mask) & (relatives.final_degree == 1), 'final_degree'] = 2  # to eliminate some ersa false positives
     relatives.loc[fs_mask, 'final_degree'] = 2
     relatives.loc[:, 'relation'] = relatives.ersa_degree.astype(str)
     relatives.loc[po_mask, 'relation'] = 'PO'
     relatives.loc[fs_mask, 'relation'] = 'FS'
     relatives.loc[dup_mask, 'relation'] = 'MZ/Dup'
-    # if king is unsure or king degree > 3 then we use ersa distant relatives estimation
 
     # approximate calculations, IBD share is really small in this case
     relatives.loc[pandas.isna(relatives.total_seg_len_ibd2), 'total_seg_len_ibd2'] = 0
     relatives.loc[pandas.isna(relatives.seg_count_ibd2), 'seg_count_ibd2'] = 0
-    relatives.loc[:, 'shared_genome_proportion'] = 0.5*relatives.total_seg_len / 3540 + relatives.total_seg_len_ibd2 / 3540
+    '''
+        IBIS and ERSA do not distinguish IBD1 and IBD2 segments
+        shared_genome_proportion is a proportion of identical alleles
+        i.e. shared_genome_proportion of [(0/0), (0/1)] and [(0/0), (1/1)] should be 3/4
+        GENOME_SEG_LEN is length of centimorgans of the haplotype
+        ibd2 segments span two haplotypes, therefore their length should be doubled
+        total_seg_len includes both ibd1 and ibd2 segments length
+    '''
+    relatives.loc[:, 'shared_genome_proportion'] = (relatives.total_seg_len - relatives.total_seg_len_ibd2) / (2*GENOME_CM_LEN) + relatives.total_seg_len_ibd2*2 / (2*GENOME_CM_LEN)
     relatives.loc[relatives.shared_genome_proportion > 1.0, 'shared_genome_proportion'] = 1.0
     logging.info(f'final degree not null: {pandas.notna(relatives.final_degree).sum()}')
     relatives.loc[pandas.notna(relatives.final_degree), :].to_csv(output_path, sep='\t')
diff --git a/workflows/bundle/Snakefile b/workflows/bundle/Snakefile
@@ -24,31 +24,59 @@ BUNDLE_md5         = config["reference"]["bundle" if full else "bundle_min"]["md
 
 CHROMOSOMES = [str(i) for i in range(1, 23)]
 
-rule all:
-    output:
-        GRCh37_fasta = GRCH37_FASTA,
-        GRCh37_fasta_dict = expand("{ref}.dict",ref=GRCH37_FASTA),
-        GENETIC_MAP = GENETIC_MAP,
-        genetic_map_GRCh37 = expand(GENETIC_MAP_GRCH37, chrom=CHROMOSOMES),
-        cmmap = expand(CMMAP, chrom=CHROMOSOMES),
-        lift_chain = LIFT_CHAIN,
-        SITE_1000GENOME = SITE_1000GENOME,
-        pedsim_map = PEDSIM_MAP,
-        affymetrix_chip = AFFYMETRIX_CHIP,
-        vcfRef = expand(REF_VCF, chrom=CHROMOSOMES if full else []),
-        refHaps = expand(REF_HAPS, chrom=CHROMOSOMES if full else [])
-    conda:
-        "../../envs/download.yaml"
-    log:
-        "logs/ref/download_bundle.log"
-    shell:
-        """
-            wget "{BUNDLE_url}{AZURE_KEY}" -O {REF_DIR}/{BUNDLE_basename} --tries 50 |& tee -a {log}
-            sum=$(md5sum {REF_DIR}/{BUNDLE_basename} | cut -d " " -f1)
-            if [ $sum != {BUNDLE_md5} ]; then
-                echo "md5 sum of BUDNLE is invalid" |& tee -a {log}
-                exit 1
-            fi  
-            tar -xzvf {REF_DIR}/{BUNDLE_basename} -C {REF_DIR} |& tee -a {log}
-            rm -f {REF_DIR}/{BUNDLE_basename} |& tee -a {log}
-        """
+if full:
+    rule all:
+        output:
+            GRCh37_fasta = GRCH37_FASTA,
+            GRCh37_fasta_dict = expand("{ref}.dict",ref=GRCH37_FASTA),
+            GENETIC_MAP = GENETIC_MAP,
+            genetic_map_GRCh37 = expand(GENETIC_MAP_GRCH37, chrom=CHROMOSOMES),
+            cmmap = expand(CMMAP, chrom=CHROMOSOMES),
+            lift_chain = LIFT_CHAIN,
+            SITE_1000GENOME = SITE_1000GENOME,
+            pedsim_map = PEDSIM_MAP,
+            affymetrix_chip = AFFYMETRIX_CHIP,
+            vcfRef = expand(REF_VCF, chrom=CHROMOSOMES),
+            refHaps = expand(REF_HAPS, chrom=CHROMOSOMES)
+        conda:
+            "../../envs/download.yaml"
+        log:
+            "logs/ref/download_bundle.log"
+        shell:
+            """
+                wget "{BUNDLE_url}{AZURE_KEY}" -O {REF_DIR}/{BUNDLE_basename} --tries 50 |& tee -a {log}
+                sum=$(md5sum {REF_DIR}/{BUNDLE_basename} | cut -d " " -f1)
+                if [ $sum != {BUNDLE_md5} ]; then
+                    echo "md5 sum of BUDNLE is invalid" |& tee -a {log}
+                    exit 1
+                fi  
+                tar -xzvf {REF_DIR}/{BUNDLE_basename} -C {REF_DIR} |& tee -a {log}
+                rm -f {REF_DIR}/{BUNDLE_basename} |& tee -a {log}
+            """
+else:
+    rule all:
+        output:
+            GRCh37_fasta = GRCH37_FASTA,
+            GRCh37_fasta_dict = expand("{ref}.dict",ref=GRCH37_FASTA),
+            GENETIC_MAP = GENETIC_MAP,
+            genetic_map_GRCh37 = expand(GENETIC_MAP_GRCH37, chrom=CHROMOSOMES),
+            cmmap = expand(CMMAP, chrom=CHROMOSOMES),
+            lift_chain = LIFT_CHAIN,
+            SITE_1000GENOME = SITE_1000GENOME,
+            pedsim_map = PEDSIM_MAP,
+            affymetrix_chip = AFFYMETRIX_CHIP
+        conda:
+            "../../envs/download.yaml"
+        log:
+            "logs/ref/download_bundle.log"
+        shell:
+            """
+                wget "{BUNDLE_url}{AZURE_KEY}" -O {REF_DIR}/{BUNDLE_basename} --tries 50 |& tee -a {log}
+                sum=$(md5sum {REF_DIR}/{BUNDLE_basename} | cut -d " " -f1)
+                if [ $sum != {BUNDLE_md5} ]; then
+                    echo "md5 sum of BUDNLE is invalid" |& tee -a {log}
+                    exit 1
+                fi  
+                tar -xzvf {REF_DIR}/{BUNDLE_basename} -C {REF_DIR} |& tee -a {log}
+                rm -f {REF_DIR}/{BUNDLE_basename} |& tee -a {log}
+            """