Skip to content

Commit

Permalink
Merge pull request #40 from genxnetwork/fix/shared_genome_proportion
Browse files Browse the repository at this point in the history
Fixed shared genome proportions
  • Loading branch information
alex-medvedev-msc committed Dec 24, 2021
2 parents 0ab4c18 + 9d14737 commit df2353e
Show file tree
Hide file tree
Showing 6 changed files with 118 additions and 67 deletions.
2 changes: 1 addition & 1 deletion readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ g1-b1-i1 g3-b1-i1 2 2 0.2369 0.1163
* `shared_ancestors` is the most likeliest number of shared ancestors, if it is 0, then one relative is a direct descendant of the other,
if 1 then they probably have one common ancestor, i.e. half siblings, if 2 then they have common mother and father, for example.
* `final_degree` is simply `king_degree` for close relatives up to 3rd degree and `ersa_degree` for distant relatives.
* `total_seg_len` is the total length of all IBD segments, for the first 3 degrees it is calculated using KING IBD data,
* `total_seg_len` is the total length of all IBD1 segments, for the first 3 degrees it is calculated using KING IBD data,
for the 4th+ degrees it is calculated using IBID or Germline IBD data.
* `seg_count` is the total number of all IBD segments found by KING for the first 3 degrees and found by IBIS\Germline for the 4th+ degrees.

Expand Down
12 changes: 10 additions & 2 deletions rules/relatives_ibis.smk
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,18 @@ rule ersa:
FILES="{input.ibd}"
TEMPFILE=ersa/temp_relatives.tsv
rm -f $TEMPFILE
rm -f {output}
for input_file in $FILES; do
ersa --avuncular-adj -ci --alpha {params.alpha} --dmax 14 -t $ERSA_T -l $ERSA_L -th $ERSA_TH $input_file -o $TEMPFILE |& tee {log}
cat $TEMPFILE >> {output}
if [[ "$input_file" == "${{FILES[0]}}" ]]; then
cat $TEMPFILE >> {output}
else
sed 1d $TEMPFILE >> {output}
fi
done
"""

Expand Down
13 changes: 10 additions & 3 deletions rules/relatives_ibis_king.smk
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,18 @@ rule ersa:
FILES="{input.ibd}"
TEMPFILE=ersa/temp_relatives.tsv
rm -f $TEMPFILE
rm -f {output}
for input_file in $FILES; do
ersa --avuncular-adj -ci --alpha {params.alpha} --dmax 14 -t $ERSA_T -l $ERSA_L -th $ERSA_TH $input_file -o $TEMPFILE |& tee {log}
cat $TEMPFILE >> {output}
if [[ "$input_file" == "${{FILES[0]}}" ]]; then
cat $TEMPFILE >> {output}
else
sed 1d $TEMPFILE >> {output}
fi
done
"""

Expand All @@ -126,7 +134,6 @@ rule merge_king_ersa:
input:
king=rules.run_king.output['king'],
king_segments=rules.run_king.output['segments'],
bucket_dir=directory('ibd'),
ersa=rules.ersa.output[0],
kinship=rules.run_king.output['kinship'],
kinship0=rules.run_king.output['kinship0'],
Expand Down
51 changes: 24 additions & 27 deletions scripts/merge_king_ersa.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,8 @@ def read_king(king_path):
data.loc[:, 'king_degree'] = map_king_degree(data.InfType)
data.loc[:, 'king_relation'] = map_king_relation(data.InfType)
data.loc[:, 'king_degree'] = data.king_degree.astype(float).astype(pandas.Int32Dtype())
data.rename({'PropIBD': 'shared_genome_proportion'}, axis='columns', inplace=True)
indexed = data.loc[:, ['id1', 'id2', 'king_degree', 'king_relation', 'shared_genome_proportion']].\
data.rename({'PropIBD': 'shared_genome_proportion', 'IBD1Seg': 'king_ibd1_prop', 'IBD2Seg': 'king_ibd2_prop'}, axis='columns', inplace=True)
indexed = data.loc[:, ['id1', 'id2', 'king_degree', 'king_relation', 'king_ibd1_prop', 'king_ibd2_prop', 'shared_genome_proportion']].\
set_index(['id1', 'id2'])
return indexed

Expand All @@ -106,6 +106,8 @@ def read_king(king_path):
'id2',
'king_degree',
'king_relation',
'king_ibd1_prop',
'king_ibd2_prop',
'shared_genome_proportion']).set_index(['id1', 'id2'])


Expand Down Expand Up @@ -228,8 +230,8 @@ def read_ersa(ersa_path):
# Indv_1 Indv_2 Rel_est1 Rel_est2 d_est lower_d upper_d N_seg Tot_cM
data = pandas.read_table(ersa_path, header=0,
names=['id1', 'id2', 'rel_est1', 'rel_est2',
'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'seg_count', 'total_seg_len'],
dtype={'ersa_degree': str})
'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'seg_count_ersa', 'total_seg_len_ersa'],
dtype={'ersa_degree': str, 'seg_count_ersa': str, 'total_seg_len_ersa': str})

data = data.loc[(data.rel_est1 != 'NA') | (data.rel_est2 != 'NA'), :]
data.loc[:, 'id1'] = data.id1.str.strip()
Expand All @@ -241,18 +243,20 @@ def read_ersa(ersa_path):
pandas.Int32Dtype())
data.loc[:, 'ersa_upper_bound'] = pandas.to_numeric(data.ersa_upper_bound.str.strip(), errors='coerce').astype(
pandas.Int32Dtype())
print('dtypes are: ', data.dtypes)
data.loc[:, 'seg_count_ersa'] = pandas.to_numeric(data.seg_count_ersa.str.strip(), errors='coerce').astype(
pandas.Int32Dtype())
data.loc[:, 'total_seg_len_ersa'] = pandas.to_numeric(data.total_seg_len_ersa.str.strip(), errors='coerce')

data.loc[data.ersa_lower_bound != 1, 'ersa_lower_bound'] = data.ersa_lower_bound - 1
data.loc[data.ersa_upper_bound != 1, 'ersa_upper_bound'] = data.ersa_upper_bound - 1

print(f'Bad degrees are {(data.ersa_lower_bound > data.ersa_degree).sum()}')
print(Counter(data.ersa_degree[data.ersa_lower_bound > data.ersa_degree]))
logging.info(f'read {data.shape[0]} pairs from ersa output')
logging.info(f'ersa after reading has {pandas.notna(data.ersa_degree).sum()}')
data.loc[:, 'is_niece_aunt'] = [True if 'Niece' in d else False for d in data.rel_est1]
logging.info(f'we have total of {data.is_niece_aunt.sum()} possible Niece/Aunt relationships')
print(f'we have total of {data.is_niece_aunt.sum()} possible Niece/Aunt relationships')
return data.loc[data.id1 != data.id2, ['id1', 'id2', 'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'is_niece_aunt']].\

return data.loc[data.id1 != data.id2, ['id1', 'id2', 'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'is_niece_aunt', 'seg_count_ersa', 'total_seg_len_ersa']].\
set_index(['id1', 'id2'])


Expand Down Expand Up @@ -294,8 +298,7 @@ def read_ersa2(ersa_path):
test_dir = '/media_ssd/pipeline_data/TF-CEU-TRIBES-ibis-king-2'
Snakemake = namedtuple('Snakemake', ['input', 'output', 'params', 'log'])
snakemake = Snakemake(
input={'bucket_dir': f'{test_dir}/ibd',
'king': f'{test_dir}/king/data.seg',
input={'king': f'{test_dir}/king/data.seg',
'king_segments': f'{test_dir}/king/data.segments.gz',
'kinship': f'{test_dir}/king/data.kin',
'kinship0': f'{test_dir}/king/data.kin0',
Expand All @@ -308,7 +311,6 @@ def read_ersa2(ersa_path):

logging.basicConfig(filename=snakemake.log[0], level=logging.DEBUG, format='%(levelname)s:%(asctime)s %(message)s')

ibd_path = snakemake.input['bucket_dir']
king_path = snakemake.input['king']
king_segments_path = snakemake.input['king_segments']
# within families
Expand All @@ -319,38 +321,33 @@ def read_ersa2(ersa_path):
map_dir = snakemake.params['cm_dir']
output_path = snakemake.output[0]

ibd = read_bucket_dir(ibd_path)
king = read_king(king_path)
kinship = read_kinship(kinship_path, kinship0_path)
king_segments = read_king_segments_chunked(king_segments_path, map_dir)
ersa = read_ersa(ersa_path)

logging.info(f'ibd shape: {ibd.shape[0]}, ersa shape: {ersa.shape[0]}')
# print('ibd test:', ibd[('GRC12118091', 'GRC12118096')])
# print('ibd test2:', ibd[('GRC12118096', 'GRC12118091')])

relatives = ibd.merge(king, how='outer', left_index=True, right_index=True).\
merge(kinship, how='outer', left_index=True, right_index=True).\
merge(ersa, how='outer', left_index=True, right_index=True).\
merge(king_segments, how='outer', left_index=True, right_index=True)
relatives = king.merge(kinship, how='outer', left_index=True, right_index=True).\
merge(ersa, how='outer', left_index=True, right_index=True).\
merge(king_segments, how='outer', left_index=True, right_index=True)

prefer_ersa_mask = pandas.isnull(relatives.king_degree) | (relatives.king_degree > 3)
relatives.loc[:, 'final_degree'] = relatives.king_degree
# if king is unsure or king degree > 3 then we use ersa distant relatives estimation
relatives.loc[prefer_ersa_mask, 'final_degree'] = relatives.ersa_degree
logging.info(f'king is null or more than 3: {prefer_ersa_mask.sum()}')
logging.info(f'ersa is not null: {pandas.notna(relatives.ersa_degree).sum()}')

print(f'relatives columns are {relatives.columns}')
if 'total_seg_len_king' in relatives.columns:
relatives.loc[:, 'total_seg_len'] = relatives.total_seg_len_king
relatives.loc[:, 'total_seg_len'] = relatives.total_seg_len_king*relatives.king_ibd1_prop
relatives.loc[:, 'total_seg_len_ibd2'] = relatives.total_seg_len_king*relatives.king_ibd2_prop
relatives.loc[:, 'seg_count'] = relatives.seg_count_king

relatives.loc[prefer_ersa_mask, 'total_seg_len'] = relatives.total_seg_len_germline
relatives.loc[prefer_ersa_mask, 'seg_count'] = relatives.seg_count_germline

relatives.loc[prefer_ersa_mask, 'total_seg_len'] = relatives.total_seg_len_ersa
relatives.loc[prefer_ersa_mask, 'seg_count'] = relatives.seg_count_ersa
print('is na: ', pandas.isna(relatives.loc[prefer_ersa_mask, 'total_seg_len_ersa']).sum())
# approximate calculations, IBD share is really small in this case
relatives.loc[prefer_ersa_mask, 'shared_genome_proportion'] = 0.5*relatives.loc[prefer_ersa_mask, 'total_seg_len'] / 3540
relatives.drop(['total_seg_len_king', 'seg_count_king', 'total_seg_len_germline', 'seg_count_germline'],
relatives.loc[prefer_ersa_mask, 'shared_genome_proportion'] = 0.5*relatives.loc[prefer_ersa_mask, 'total_seg_len'].values / 3440
relatives.drop(['total_seg_len_king', 'seg_count_king', 'total_seg_len_ersa', 'seg_count_ersa', 'king_ibd1_prop', 'king_ibd2_prop'],
axis='columns', inplace=True)

logging.info(f'final degree not null: {pandas.notna(relatives.final_degree).sum()}')
Expand Down
23 changes: 17 additions & 6 deletions scripts/postprocess_ersa.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,27 +95,38 @@ def read_ersa(ersa_path):
relatives = ibd.merge(ersa, how='outer', left_index=True, right_index=True)

# It is impossible to have more than 50% of ibd2 segments unless you are monozygotic twins or duplicates.
dup_mask = relatives.total_seg_len_ibd2 > 1800
fs_mask = (relatives.total_seg_len > 2100) & (relatives.total_seg_len_ibd2 > 450) & (~dup_mask)
po_mask = (relatives.total_seg_len / 3540 > 0.8) & (~fs_mask) & (~dup_mask)
GENOME_CM_LEN = 3440
DUPLICATES_THRESHOLD = 1750
FS_TOTAL_THRESHOLD = 2100
FS_IBD2_THRESHOLD = 450
dup_mask = relatives.total_seg_len_ibd2 > DUPLICATES_THRESHOLD
fs_mask = (relatives.total_seg_len > FS_TOTAL_THRESHOLD) & (relatives.total_seg_len_ibd2 > FS_IBD2_THRESHOLD) & (~dup_mask)
po_mask = (relatives.total_seg_len / GENOME_CM_LEN > 0.8) & (~fs_mask) & (~dup_mask)
print(f'We have {sum(dup_mask)} duplicates, {sum(fs_mask)} full siblings and {sum(po_mask)} parent-offspring relationships')

relatives.loc[:, 'final_degree'] = relatives.ersa_degree
relatives.loc[dup_mask, 'final_degree'] = 0
relatives.loc[po_mask, 'final_degree'] = 1

relatives.loc[(~po_mask) & (relatives.final_degree == 1)] = 2 # to eliminate some ersa false positives
relatives.loc[(~po_mask) & (relatives.final_degree == 1), 'final_degree'] = 2 # to eliminate some ersa false positives
relatives.loc[fs_mask, 'final_degree'] = 2
relatives.loc[:, 'relation'] = relatives.ersa_degree.astype(str)
relatives.loc[po_mask, 'relation'] = 'PO'
relatives.loc[fs_mask, 'relation'] = 'FS'
relatives.loc[dup_mask, 'relation'] = 'MZ/Dup'
# if king is unsure or king degree > 3 then we use ersa distant relatives estimation

# approximate calculations, IBD share is really small in this case
relatives.loc[pandas.isna(relatives.total_seg_len_ibd2), 'total_seg_len_ibd2'] = 0
relatives.loc[pandas.isna(relatives.seg_count_ibd2), 'seg_count_ibd2'] = 0
relatives.loc[:, 'shared_genome_proportion'] = 0.5*relatives.total_seg_len / 3540 + relatives.total_seg_len_ibd2 / 3540
'''
IBIS and ERSA do not distinguish IBD1 and IBD2 segments
shared_genome_proportion is a proportion of identical alleles
i.e. shared_genome_proportion of [(0/0), (0/1)] and [(0/0), (1/1)] should be 3/4
GENOME_SEG_LEN is length of centimorgans of the haplotype
ibd2 segments span two haplotypes, therefore their length should be doubled
total_seg_len includes both ibd1 and ibd2 segments length
'''
relatives.loc[:, 'shared_genome_proportion'] = (relatives.total_seg_len - relatives.total_seg_len_ibd2) / (2*GENOME_CM_LEN) + relatives.total_seg_len_ibd2*2 / (2*GENOME_CM_LEN)
relatives.loc[relatives.shared_genome_proportion > 1.0, 'shared_genome_proportion'] = 1.0
logging.info(f'final degree not null: {pandas.notna(relatives.final_degree).sum()}')
relatives.loc[pandas.notna(relatives.final_degree), :].to_csv(output_path, sep='\t')
84 changes: 56 additions & 28 deletions workflows/bundle/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,31 +24,59 @@ BUNDLE_md5 = config["reference"]["bundle" if full else "bundle_min"]["md

CHROMOSOMES = [str(i) for i in range(1, 23)]

rule all:
output:
GRCh37_fasta = GRCH37_FASTA,
GRCh37_fasta_dict = expand("{ref}.dict",ref=GRCH37_FASTA),
GENETIC_MAP = GENETIC_MAP,
genetic_map_GRCh37 = expand(GENETIC_MAP_GRCH37, chrom=CHROMOSOMES),
cmmap = expand(CMMAP, chrom=CHROMOSOMES),
lift_chain = LIFT_CHAIN,
SITE_1000GENOME = SITE_1000GENOME,
pedsim_map = PEDSIM_MAP,
affymetrix_chip = AFFYMETRIX_CHIP,
vcfRef = expand(REF_VCF, chrom=CHROMOSOMES if full else []),
refHaps = expand(REF_HAPS, chrom=CHROMOSOMES if full else [])
conda:
"../../envs/download.yaml"
log:
"logs/ref/download_bundle.log"
shell:
"""
wget "{BUNDLE_url}{AZURE_KEY}" -O {REF_DIR}/{BUNDLE_basename} --tries 50 |& tee -a {log}
sum=$(md5sum {REF_DIR}/{BUNDLE_basename} | cut -d " " -f1)
if [ $sum != {BUNDLE_md5} ]; then
echo "md5 sum of BUDNLE is invalid" |& tee -a {log}
exit 1
fi
tar -xzvf {REF_DIR}/{BUNDLE_basename} -C {REF_DIR} |& tee -a {log}
rm -f {REF_DIR}/{BUNDLE_basename} |& tee -a {log}
"""
if full:
rule all:
output:
GRCh37_fasta = GRCH37_FASTA,
GRCh37_fasta_dict = expand("{ref}.dict",ref=GRCH37_FASTA),
GENETIC_MAP = GENETIC_MAP,
genetic_map_GRCh37 = expand(GENETIC_MAP_GRCH37, chrom=CHROMOSOMES),
cmmap = expand(CMMAP, chrom=CHROMOSOMES),
lift_chain = LIFT_CHAIN,
SITE_1000GENOME = SITE_1000GENOME,
pedsim_map = PEDSIM_MAP,
affymetrix_chip = AFFYMETRIX_CHIP,
vcfRef = expand(REF_VCF, chrom=CHROMOSOMES),
refHaps = expand(REF_HAPS, chrom=CHROMOSOMES)
conda:
"../../envs/download.yaml"
log:
"logs/ref/download_bundle.log"
shell:
"""
wget "{BUNDLE_url}{AZURE_KEY}" -O {REF_DIR}/{BUNDLE_basename} --tries 50 |& tee -a {log}
sum=$(md5sum {REF_DIR}/{BUNDLE_basename} | cut -d " " -f1)
if [ $sum != {BUNDLE_md5} ]; then
echo "md5 sum of BUDNLE is invalid" |& tee -a {log}
exit 1
fi
tar -xzvf {REF_DIR}/{BUNDLE_basename} -C {REF_DIR} |& tee -a {log}
rm -f {REF_DIR}/{BUNDLE_basename} |& tee -a {log}
"""
else:
rule all:
output:
GRCh37_fasta = GRCH37_FASTA,
GRCh37_fasta_dict = expand("{ref}.dict",ref=GRCH37_FASTA),
GENETIC_MAP = GENETIC_MAP,
genetic_map_GRCh37 = expand(GENETIC_MAP_GRCH37, chrom=CHROMOSOMES),
cmmap = expand(CMMAP, chrom=CHROMOSOMES),
lift_chain = LIFT_CHAIN,
SITE_1000GENOME = SITE_1000GENOME,
pedsim_map = PEDSIM_MAP,
affymetrix_chip = AFFYMETRIX_CHIP
conda:
"../../envs/download.yaml"
log:
"logs/ref/download_bundle.log"
shell:
"""
wget "{BUNDLE_url}{AZURE_KEY}" -O {REF_DIR}/{BUNDLE_basename} --tries 50 |& tee -a {log}
sum=$(md5sum {REF_DIR}/{BUNDLE_basename} | cut -d " " -f1)
if [ $sum != {BUNDLE_md5} ]; then
echo "md5 sum of BUDNLE is invalid" |& tee -a {log}
exit 1
fi
tar -xzvf {REF_DIR}/{BUNDLE_basename} -C {REF_DIR} |& tee -a {log}
rm -f {REF_DIR}/{BUNDLE_basename} |& tee -a {log}
"""

0 comments on commit df2353e

Please sign in to comment.