From 58bee081bda9506b26f224aad484ad8ad4c38a8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Thu, 18 Jul 2024 12:38:37 +0100 Subject: [PATCH] tools: Normalize sv for non-symbolic variants. #TASK-6558 --- .../tools/variant/VariantNormalizer.java | 160 +++++++++++------- .../tools/variant/VariantNormalizerTest.java | 26 +++ .../variant/merge/VariantMergerTest.java | 3 +- 3 files changed, 126 insertions(+), 63 deletions(-) diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java index e902ce99f..3e16977f5 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java @@ -301,19 +301,16 @@ public List normalize(List batch, boolean reuse) throws NonSta normalizedVariants.add(variant); continue; } - String reference = variant.getReference(); //Save original values, as they can be changed + //Save original values, as they can be changed + String reference = variant.getReference(); String alternate = variant.getAlternate(); Integer start = variant.getStart(); Integer end = variant.getEnd(); String chromosome = variant.getChromosome(); if (variant.getStudies() == null || variant.getStudies().isEmpty()) { - List keyFieldsList; - if (isSymbolic(variant)) { - keyFieldsList = normalizeSymbolic(start, end, reference, alternate, variant.getSv()); - } else { - keyFieldsList = normalize(chromosome, start, reference, alternate); - } + List keyFieldsList = normalizeAlleles(variant); + // Iterate keyFields sorting by position, so the generated variants are ordered. Do not modify original order! 
for (VariantKeyFields keyFields : sortByPosition(keyFieldsList)) { OriginalCall call = new OriginalCall(variant.toString(), keyFields.getNumAllele()); @@ -331,25 +328,16 @@ public List normalize(List batch, boolean reuse) throws NonSta normalizedVariants.add(normalizedVariant); } } else { - for (StudyEntry entry : variant.getStudies()) { - List originalAlternates = new ArrayList<>(1 + entry.getSecondaryAlternates().size()); - List alternates = new ArrayList<>(1 + entry.getSecondaryAlternates().size()); - alternates.add(alternate); - originalAlternates.add(alternate); - for (String secondaryAlternatesAllele : entry.getSecondaryAlternatesAlleles()) { - alternates.add(secondaryAlternatesAllele); - originalAlternates.add(secondaryAlternatesAllele); - } + if (variant.getStudies().size() != 1) { + throw new IllegalStateException("Only one study per variant is supported when normalizing variants. Found " + + variant.getStudies().size() + " studies. Variant: " + variant); + } else { + StudyEntry entry = variant.getStudies().get(0); + List alternates = getAllAlternates(variant); // FIXME: assumes there wont be multinucleotide positions with CNVs and short variants mixed - List keyFieldsList; - List originalKeyFieldsList; - if (isSymbolic(variant)) { - keyFieldsList = normalizeSymbolic(start, end, reference, alternates, variant.getSv()); - } else { - keyFieldsList = normalize(chromosome, start, reference, alternates); - } - originalKeyFieldsList = keyFieldsList + List keyFieldsList = normalizeAlleles(variant); + List originalKeyFieldsList = keyFieldsList .stream() .filter(k -> !k.isReferenceBlock()) .map(k -> k.originalKeyFields) @@ -372,8 +360,8 @@ public List normalize(List batch, boolean reuse) throws NonSta originalCall = entry.getFiles().get(0).getCall().getVariantId(); } else { StringBuilder sb = new StringBuilder(variant.toString()); - for (int i = 1; i < originalAlternates.size(); i++) { - sb.append(",").append(originalAlternates.get(i)); + for (int i = 1; i < 
alternates.size(); i++) { + sb.append(",").append(alternates.get(i)); } originalCall = sb.toString(); } @@ -600,17 +588,54 @@ private Collection sortByPosition(List keyFi // } // } + protected List normalizeAlleles(Variant variant) { + List alternates = getAllAlternates(variant); + + List keyFieldsList; + if (isSymbolic(variant)) { + keyFieldsList = normalizeSymbolic(variant.getStart(), variant.getEnd(), variant.getReference(), alternates, variant.getSv()); + } else { + keyFieldsList = normalize(variant.getChromosome(), variant.getStart(), variant.getReference(), alternates, variant.getSv()); + } + return keyFieldsList; + } + + private static List getAllAlternates(Variant variant) { + List alternates; + if (variant.getStudies() != null && !variant.getStudies().isEmpty()) { + StudyEntry entry = variant.getStudies().get(0); + String alternate = variant.getAlternate(); + alternates = new ArrayList<>(1 + entry.getSecondaryAlternates().size()); + alternates.add(alternate); + for (AlternateCoordinate secondaryAlternate : entry.getSecondaryAlternates()) { + if (secondaryAlternate.getStart() != null && !secondaryAlternate.getStart().equals(variant.getStart())) { + throw new IllegalStateException("Unable to normalize variant where secondary alternates do not start at the same position. " + + "Variant: " + variant + " , secondaryAlternate: " + secondaryAlternate); + } + if (secondaryAlternate.getEnd() != null && !secondaryAlternate.getEnd().equals(variant.getEnd())) { + throw new IllegalStateException("Unable to normalize variant where secondary alternates do not end at the same position. 
" + + "Variant: " + variant + " (end=" + variant.getEnd() + ") , secondaryAlternate: " + secondaryAlternate); + } + alternates.add(secondaryAlternate.getAlternate()); + } + } else { + alternates = Collections.singletonList(variant.getAlternate()); + } + return Collections.unmodifiableList(alternates); + } + + @Deprecated // Test purposes only public List normalizeSymbolic(Integer start, Integer end, String reference, String alternate, StructuralVariation sv) { return normalizeSymbolic(start, end, reference, Collections.singletonList(alternate), sv); } - @Deprecated + @Deprecated // Test purposes only public List normalizeSymbolic(final Integer start, final Integer end, final String reference, final List alternates) { return normalizeSymbolic(start, end, reference, alternates, null); } - public List normalizeSymbolic(final Integer start, final Integer end, final String reference, + protected List normalizeSymbolic(final Integer start, final Integer end, final String reference, final List alternates, StructuralVariation sv) { List list = new ArrayList<>(alternates.size()); @@ -634,37 +659,7 @@ public List normalizeSymbolic(final Integer start, final Integ keyFields.getSv().setType(StructuralVariantType.TANDEM_DUPLICATION); } - if (sv != null) { - StructuralVariation normalizedSv = keyFields.getSv(); - if (normalizedSv == null) { - normalizedSv = new StructuralVariation(); - } - // CI positions may change during the normalization. Update them. - normalizedSv.setCiStartLeft(sv.getCiStartLeft()); - normalizedSv.setCiStartRight(sv.getCiStartRight()); - - // Structural variants that affect a single point (INSERTIONS or Breakends) should not have CIEND. - // At this point, we're removing the CIEND from the normalized variant. - // Do not remove the value from the INFO field (if any). 
- // The END is the same as the start (which, in base-1 means that "end == start -1" , so "end < start") - if (keyFields.getEnd() < keyFields.getStart()) { - normalizedSv.setCiEndLeft(null); - normalizedSv.setCiEndRight(null); - } else { - normalizedSv.setCiEndLeft(sv.getCiEndLeft()); - normalizedSv.setCiEndRight(sv.getCiEndRight()); - } - normalizedSv.setLeftSvInsSeq(sv.getLeftSvInsSeq()); - normalizedSv.setRightSvInsSeq(sv.getRightSvInsSeq()); - - if (keyFields.getSv() == null) { - if (normalizedSv.getCiStartLeft() != null || normalizedSv.getCiStartRight() != null - || normalizedSv.getCiEndLeft() != null || normalizedSv.getCiEndRight() != null - || normalizedSv.getLeftSvInsSeq() != null || normalizedSv.getRightSvInsSeq() != null) { - keyFields.setSv(normalizedSv); - } - } - } + normalizeSvField(sv, keyFields); list.add(keyFields); } @@ -672,6 +667,40 @@ public List normalizeSymbolic(final Integer start, final Integ return list; } + private static void normalizeSvField(StructuralVariation sv, VariantKeyFields keyFields) { + if (sv != null && keyFields != null) { // keyFields may be null: normalize(...) only checks it after calling this method + StructuralVariation normalizedSv = keyFields.getSv(); + if (normalizedSv == null) { + normalizedSv = new StructuralVariation(); + } + // CI positions may change during the normalization. Update them. + normalizedSv.setCiStartLeft(sv.getCiStartLeft()); + normalizedSv.setCiStartRight(sv.getCiStartRight()); + + // Structural variants that affect a single point (INSERTIONS or Breakends) should not have CIEND. + // At this point, we're removing the CIEND from the normalized variant. + // Do not remove the value from the INFO field (if any). 
+ // The END is the same as the start (which, in base-1 means that "end == start -1" , so "end < start") + if (keyFields.getEnd() < keyFields.getStart()) { + normalizedSv.setCiEndLeft(null); + normalizedSv.setCiEndRight(null); + } else { + normalizedSv.setCiEndLeft(sv.getCiEndLeft()); + normalizedSv.setCiEndRight(sv.getCiEndRight()); + } + normalizedSv.setLeftSvInsSeq(sv.getLeftSvInsSeq()); + normalizedSv.setRightSvInsSeq(sv.getRightSvInsSeq()); + + if (keyFields.getSv() == null) { + if (normalizedSv.getCiStartLeft() != null || normalizedSv.getCiStartRight() != null + || normalizedSv.getCiEndLeft() != null || normalizedSv.getCiEndRight() != null + || normalizedSv.getLeftSvInsSeq() != null || normalizedSv.getRightSvInsSeq() != null) { + keyFields.setSv(normalizedSv); + } + } + } + } + private boolean isNonRef(String alternate) { return alternate.equals(Allele.NO_CALL_STRING) || alternate.equals(VariantBuilder.NON_REF_ALT) @@ -780,12 +809,17 @@ private VariantKeyFields normalizeSymbolic( } + @Deprecated // Test purposes only public List normalize(String chromosome, int position, String reference, String alternate) { - return normalize(chromosome, position, reference, Collections.singletonList(alternate)); + return normalize(chromosome, position, reference, Collections.singletonList(alternate), null); } - public List normalize(String chromosome, int position, String reference, List alternates) - { + @Deprecated // Test purposes only + public List normalize(String chromosome, int position, String reference, List alternates) { + return normalize(chromosome, position, reference, alternates, null); + } + + protected List normalize(String chromosome, int position, String reference, List alternates, StructuralVariation sv) { List list = new ArrayList<>(alternates.size()); int numAllelesIdx = 0; // This index is necessary for getting the samples where the mutated allele is present @@ -829,6 +863,8 @@ public List normalize(String chromosome, int position, String } } + 
normalizeSvField(sv, keyFields); + if (keyFields != null) { // To deal with cases such as A>GT diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java index 4253d9405..a1faf4869 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java @@ -684,6 +684,32 @@ public void testINSsNormalizationWithCIEND() throws Exception { }); } + @Test + public void testNormalizeNonSymbolicInsertion() throws Exception { + Variant variant = newVariantBuilder(100, null, "C", Collections.singletonList("CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), "2") + .addFileData("CIPOS", "-14,50") + .addFileData("CIEND", "-50,11") + .addSample("HG00096", "0|0") + .build(); + + normalizeOne(variant, normalizedVariant -> { + assertEquals(new StructuralVariation(86, 150, null, null, null, null, null, null, null), normalizedVariant.getSv()); + }); + } + + @Test + public void testNormalizeNonSymbolicDeletion() throws Exception { + Variant variant = newVariantBuilder(100, null, "CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", "C", "2") + .addFileData("CIPOS", "-14,50") + .addFileData("CIEND", "-1,1") + .addSample("HG00096", "0|1") + .build(); + + normalizeOne(variant, normalizedVariant -> { + assertEquals(new StructuralVariation(86, 150, 179, 181, null, null, null, null, null), normalizedVariant.getSv()); + }); + } + @Test public void testDUPTANDEMNormalization() throws Exception { Variant variant = newVariantBuilder(100, 200, "C", Collections.singletonList(""), "2") diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/merge/VariantMergerTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/merge/VariantMergerTest.java index 
46ab5800e..07533ab5b 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/merge/VariantMergerTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/merge/VariantMergerTest.java @@ -498,7 +498,8 @@ public void testMergeIndelCase1() throws NonStandardCompliantSampleField { Variant v1 = VariantTestUtils.generateVariantWithFormat("1:328:CTT:C", VCFConstants.GENOTYPE_KEY + "," + VCFConstants.GENOTYPE_FILTER_KEY, "S1", "1/2","PASS"); - v1.getStudies().get(0).getSecondaryAlternates().add(new AlternateCoordinate(null,null,331,"CTT", "CTTTC", VariantType.INDEL)); + + v1.getStudies().get(0).getSecondaryAlternates().add(new AlternateCoordinate(null, null, 330, "CTT", "CTTTC", VariantType.INDEL)); Variant v2 = VariantTestUtils.generateVariantWithFormat("1:331:T:TCT", VCFConstants.GENOTYPE_KEY + "," + VCFConstants.GENOTYPE_FILTER_KEY,