-
Notifications
You must be signed in to change notification settings - Fork 592
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix to Enable SelectVariants to drop sites with * allele as only ALT #5129
Changes from 5 commits
8aee9d3
d9fb0d5
fde9671
cc14c23
9dd443a
8c78964
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -343,6 +343,54 @@ public void testNoGTs() throws IOException { | |
spec.executeTest("testNoGTs--" + testFile, this); | ||
} | ||
|
||
@Test | ||
public void testRemoveMonomorphAfterSNSelect() throws IOException { | ||
final String testFile = getToolTestDataDir() + "spanning_deletion.vcf"; | ||
final String samplesFile = "NA1"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Really this is a sample name rather than a samplesFile so can you change the variable name? |
||
|
||
final IntegrationTestSpec spec = new IntegrationTestSpec( | ||
baseTestString(" -sn " + samplesFile + " --remove-unused-alternates --exclude-non-variants", testFile), | ||
Collections.singletonList(getToolTestDataDir() + "expected/" + "testSelectVariants_RemoveMonomorphAfterSNSelect.vcf") | ||
); | ||
spec.executeTest("test will make sure line is removed when only monomorphic allele exists" + testFile, this); | ||
} | ||
|
||
@Test | ||
public void testRemoveSingleSpanDelAlleleNoSpanDel() throws IOException { | ||
final String testFile = getToolTestDataDir() + "spanning_deletion.vcf"; | ||
final String samplesFile = "NA1"; | ||
|
||
final IntegrationTestSpec spec = new IntegrationTestSpec( | ||
baseTestString(" -sn " + samplesFile + " --remove-unused-alternates --exclude-non-variants", testFile), | ||
Collections.singletonList(getToolTestDataDir() + "expected/" + "testSelectVariants_RemoveSingleSpanDelAlleleNoSpanDel.vcf") | ||
); | ||
spec.executeTest("test will not remove variant line where '*' is only ALT allele because none exist" + testFile, this); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't see how this is different from the above test. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Did you want to use a different input file where the * is in a heterozygous genotype? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My mistake. They are the same. Turns out the test technically tests two things at once, I deleted the second instance. |
||
} | ||
|
||
@Test | ||
public void testRemoveSingleSpanDelAlleleExNonVar() throws IOException { | ||
final String testFile = getToolTestDataDir() + "spanning_deletion.vcf"; | ||
final String samplesFile = "NA2"; | ||
|
||
final IntegrationTestSpec spec = new IntegrationTestSpec( | ||
baseTestString(" -sn " + samplesFile + " --remove-unused-alternates", testFile), | ||
Collections.singletonList(getToolTestDataDir() + "expected/" + "testSelectVariants_RemoveSingleSpanDelAlleleExNoVar.vcf") | ||
); | ||
spec.executeTest("test will not remove variant line where '*' is only ALT allele because --exclude-non-variants not called --" + testFile, this); | ||
} | ||
|
||
@Test | ||
public void testRemoveSingleSpanDelAllele() throws IOException { | ||
final String testFile = getToolTestDataDir() + "spanning_deletion.vcf"; | ||
final String samplesFile = "NA2"; | ||
|
||
final IntegrationTestSpec spec = new IntegrationTestSpec( | ||
baseTestString(" -sn " + samplesFile + " --exclude-non-variants --remove-unused-alternates", testFile), | ||
Collections.singletonList(getToolTestDataDir() + "expected/" + "testSelectVariants_RemoveSingleSpanDelAllele.vcf") | ||
); | ||
spec.executeTest("test removes variant line where '*' is only ALT allele --" + testFile, this); | ||
} | ||
|
||
@Test | ||
public void testSelectFromMultiAllelic() throws IOException { | ||
final String testFile = getToolTestDataDir() + "multi-allelic.bi-allelicInGIH.vcf"; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
##fileformat=VCFv4.2 | ||
##ALT=<ID=*:DEL,Description="Represents any possible spanning deletion allele at this location"> | ||
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location"> | ||
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed"> | ||
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)"> | ||
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> | ||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | ||
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block"> | ||
##FORMAT=<ID=MIN_GQ,Number=1,Type=Integer,Description="Minimum GQ observed within the GVCF block"> | ||
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"> | ||
##FORMAT=<ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)"> | ||
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias."> | ||
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes"> | ||
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities"> | ||
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered"> | ||
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval"> | ||
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity"> | ||
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias"> | ||
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes"> | ||
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation"> | ||
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality"> | ||
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities"> | ||
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth"> | ||
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias"> | ||
##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias"> | ||
##contig=<ID=20,length=63025520> | ||
##contig=<ID=21,length=48129895> | ||
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA1 | ||
20 20 . A T 4490.20 . AC=1;AF=0.500;AN=2;DP=80 GT:AD:DP:GQ:PL 0/1:1,0:80:3:0,3,32 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
##fileformat=VCFv4.2 | ||
##ALT=<ID=*:DEL,Description="Represents any possible spanning deletion allele at this location"> | ||
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location"> | ||
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed"> | ||
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)"> | ||
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> | ||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | ||
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block"> | ||
##FORMAT=<ID=MIN_GQ,Number=1,Type=Integer,Description="Minimum GQ observed within the GVCF block"> | ||
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"> | ||
##FORMAT=<ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)"> | ||
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias."> | ||
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes"> | ||
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities"> | ||
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered"> | ||
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval"> | ||
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity"> | ||
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias"> | ||
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes"> | ||
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation"> | ||
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality"> | ||
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities"> | ||
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth"> | ||
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias"> | ||
##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias"> | ||
##contig=<ID=20,length=63025520> | ||
##contig=<ID=21,length=48129895> | ||
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA2 | ||
20 14 . GCCCCCACCC G 4490.20 . AC=2;AF=1.00;AN=2;DP=80 GT:AD:DP:GQ:PL 1/1:1,79,0:80:99:2233,233,0,2244,234,2333 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
##fileformat=VCFv4.2 | ||
##ALT=<ID=*:DEL,Description="Represents any possible spanning deletion allele at this location"> | ||
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location"> | ||
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed"> | ||
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)"> | ||
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> | ||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | ||
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block"> | ||
##FORMAT=<ID=MIN_GQ,Number=1,Type=Integer,Description="Minimum GQ observed within the GVCF block"> | ||
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"> | ||
##FORMAT=<ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)"> | ||
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias."> | ||
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes"> | ||
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities"> | ||
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered"> | ||
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval"> | ||
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity"> | ||
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias"> | ||
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes"> | ||
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation"> | ||
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality"> | ||
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities"> | ||
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth"> | ||
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias"> | ||
##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias"> | ||
##contig=<ID=20,length=63025520> | ||
##contig=<ID=21,length=48129895> | ||
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA2 | ||
20 14 . GCCCCCACCC G 4490.20 . AC=2;AF=1.00;AN=2;DP=80 GT:AD:DP:GQ:PL 1/1:1,79,0:80:99:2233,233,0,2244,234,2333 | ||
20 20 . A * 4490.20 . AC=2;AF=1.00;AN=2;DP=80 GT:AD:DP:GQ:PL 1/1:1,0:80:11:0,11,100 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
##fileformat=VCFv4.2 | ||
##ALT=<ID=*:DEL,Description="Represents any possible spanning deletion allele at this location"> | ||
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location"> | ||
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed"> | ||
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)"> | ||
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> | ||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | ||
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block"> | ||
##FORMAT=<ID=MIN_GQ,Number=1,Type=Integer,Description="Minimum GQ observed within the GVCF block"> | ||
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"> | ||
##FORMAT=<ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)"> | ||
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias."> | ||
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes"> | ||
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities"> | ||
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered"> | ||
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval"> | ||
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity"> | ||
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias"> | ||
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes"> | ||
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation"> | ||
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality"> | ||
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities"> | ||
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth"> | ||
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias"> | ||
##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias"> | ||
##contig=<ID=20,length=63025520> | ||
##contig=<ID=21,length=48129895> | ||
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA1 | ||
20 20 . A T 4490.20 . AC=1;AF=0.500;AN=2;DP=80 GT:AD:DP:GQ:PL 0/1:1,0:80:3:0,3,32 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is still really hard to read. Maybe give examples of things that will pass here?