diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexQueryParser.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexQueryParser.java index 6137c5bd6c1..803fa75fea5 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexQueryParser.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexQueryParser.java @@ -1216,18 +1216,7 @@ protected SampleAnnotationIndexQuery parseAnnotationIndexQuery(SampleIndexSchema CtBtFtCombinationIndexSchema.Filter ctBtTfFilter = schema.getCtBtTfIndex().getField().noOpFilter(); IndexFilter clinicalFilter = schema.getClinicalIndexSchema().noOpFilter(); - Boolean intergenic = null; - - ParsedVariantQuery.VariantQueryXref variantQueryXref = VariantQueryParser.parseXrefs(query); - if (!isValidParam(query, REGION)) { - if (!variantQueryXref.getGenes().isEmpty() - && variantQueryXref.getIds().isEmpty() - && variantQueryXref.getOtherXrefs().isEmpty() - && variantQueryXref.getVariants().isEmpty()) { - // If only filtering by genes, is not intergenic. - intergenic = false; - } - } + final Boolean intergenic = isIntergenicQuery(query); // BiotypeConsquenceTypeFlagCombination combination = BiotypeConsquenceTypeFlagCombination // .fromQuery(query, Arrays.asList(schema.getTranscriptFlagIndexSchema().getField().getConfiguration().getValues())); @@ -1237,18 +1226,10 @@ protected SampleAnnotationIndexQuery parseAnnotationIndexQuery(SampleIndexSchema boolean tfCovered = false; if (isValidParam(query, ANNOT_CONSEQUENCE_TYPE)) { - List soNames = query.getAsStringList(VariantQueryParam.ANNOT_CONSEQUENCE_TYPE.key()); - soNames = soNames.stream() + List soNames = query.getAsStringList(VariantQueryParam.ANNOT_CONSEQUENCE_TYPE.key()) + .stream() .map(ct -> ConsequenceTypeMappings.accessionToTerm.get(VariantQueryUtils.parseConsequenceType(ct))) .collect(Collectors.toList()); - if (!soNames.contains(VariantAnnotationConstants.INTERGENIC_VARIANT) - && !soNames.contains(VariantAnnotationConstants.REGULATORY_REGION_VARIANT) - && !soNames.contains(VariantAnnotationConstants.TF_BINDING_SITE_VARIANT)) { - // All ct values but "intergenic_variant" and "regulatory_region_variant" are in genes (i.e. non-intergenic) - intergenic = false; - } else if (soNames.size() == 1 && soNames.contains(VariantAnnotationConstants.INTERGENIC_VARIANT)) { - intergenic = true; - } // else, leave undefined : intergenic = null boolean ctFilterCoveredBySummary = false; boolean ctBtCombinationCoveredBySummary = false; if (SampleIndexSchema.CUSTOM_LOF.containsAll(soNames)) { @@ -1295,14 +1276,17 @@ protected SampleAnnotationIndexQuery parseAnnotationIndexQuery(SampleIndexSchema } } - // Do not use ctIndex if the CT filter is covered by the summary - // Use the ctIndex if: + // Do not use ctIndex for intergenic queries (intergenic == true) + // or queries that might return intergenic variants (intergenic == null) + // + // Use the ctIndex if any of: // - The CtFilter is not covered by the summary // - The query has the combination CT+BT , and it is not covered by the summary // - The query has the combination CT+TF - boolean useCtIndexFilter = !ctFilterCoveredBySummary - || (!ctBtCombinationCoveredBySummary && combination.isBiotype()) - || combination.isFlag(); + boolean useCtIndexFilter = + intergenic == Boolean.FALSE && (!ctFilterCoveredBySummary + || (!ctBtCombinationCoveredBySummary && combination.isBiotype()) + || combination.isFlag()); if (useCtIndexFilter) { ctCovered = completeIndex; consequenceTypeFilter = schema.getCtIndex().getField().buildFilter(new OpValue<>("=", soNames)); @@ -1317,8 +1301,6 @@ protected SampleAnnotationIndexQuery parseAnnotationIndexQuery(SampleIndexSchema } if (isValidParam(query, ANNOT_BIOTYPE)) { - // All biotype values are in genes (i.e. non-intergenic) - intergenic = false; boolean biotypeFilterCoveredBySummary = false; List biotypes = query.getAsStringList(VariantQueryParam.ANNOT_BIOTYPE.key()); if (BIOTYPE_SET.containsAll(biotypes)) { @@ -1350,8 +1332,6 @@ protected SampleAnnotationIndexQuery parseAnnotationIndexQuery(SampleIndexSchema List transcriptFlags = query.getAsStringList(ANNOT_TRANSCRIPT_FLAG.key()); tfFilter = schema.getTranscriptFlagIndexSchema().getField().buildFilter(new OpValue<>("=", transcriptFlags)); tfCovered = completeIndex & tfFilter.isExactFilter(); - // Transcript flags are in transcripts/genes. (i.e. non-intergenic) - intergenic = false; // TranscriptFlag filter is covered by index if (tfCovered) { if (!isValidParam(query, GENE) && simpleCombination(combination)) { @@ -1538,12 +1518,60 @@ protected SampleAnnotationIndexQuery parseAnnotationIndexQuery(SampleIndexSchema // If intergenic is undefined, or true, CT and BT filters can not be used. biotypeFilter = schema.getBiotypeIndex().getField().noOpFilter(); consequenceTypeFilter = schema.getCtIndex().getField().noOpFilter(); + if (!biotypeFilter.isNoOp()) { + throw new IllegalStateException("Unexpected BT filter for intergenic=" + intergenic); + } + if (!consequenceTypeFilter.isNoOp()) { + throw new IllegalStateException("Unexpected CT filter for intergenic=" + intergenic); + } } return new SampleAnnotationIndexQuery(new byte[]{annotationIndexMask, annotationIndex}, consequenceTypeFilter, biotypeFilter, tfFilter, ctBtTfFilter, clinicalFilter, populationFrequencyFilter); } + private Boolean isIntergenicQuery(Query query) { + ParsedVariantQuery.VariantQueryXref variantQueryXref = VariantQueryParser.parseXrefs(query); + if (!isValidParam(query, REGION)) { + if (!variantQueryXref.getGenes().isEmpty() + && variantQueryXref.getIds().isEmpty() + && variantQueryXref.getOtherXrefs().isEmpty() + && variantQueryXref.getVariants().isEmpty()) { + // If only filtering by genes, is not intergenic. + return false; + } + } + + if (isValidParam(query, ANNOT_BIOTYPE)) { + // All biotype values are in genes (i.e. non-intergenic) + return false; + } + if (isValidParam(query, ANNOT_BIOTYPE)) { + // All biotype values are in genes (i.e. non-intergenic) + return false; + } + if (isValidParam(query, ANNOT_TRANSCRIPT_FLAG)) { + // Transcript flags are in transcripts/genes. (i.e. non-intergenic) + return false; + } + if (isValidParam(query, ANNOT_CONSEQUENCE_TYPE)) { + List soNames = query.getAsStringList(VariantQueryParam.ANNOT_CONSEQUENCE_TYPE.key()); + soNames = soNames.stream() + .map(ct -> ConsequenceTypeMappings.accessionToTerm.get(VariantQueryUtils.parseConsequenceType(ct))) + .collect(Collectors.toList()); + if (!soNames.contains(VariantAnnotationConstants.INTERGENIC_VARIANT) + && !soNames.contains(VariantAnnotationConstants.REGULATORY_REGION_VARIANT) + && !soNames.contains(VariantAnnotationConstants.TF_BINDING_SITE_VARIANT)) { + // All ct values but "intergenic_variant" and "regulatory_region_variant" are in genes (i.e. non-intergenic) + return false; + } else if (soNames.size() == 1 && soNames.contains(VariantAnnotationConstants.INTERGENIC_VARIANT)) { + return true; + } // else, leave undefined : intergenic = null + } + // Unable to determine if the query is intergenic or not. Return null for uncertain. + return null; + } + private boolean simpleCombination(BiotypeConsquenceTypeFlagCombination combination) { return combination.numParams() == 1; } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexQueryParserTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexQueryParserTest.java index 7ae4e36f3ec..212b0521b26 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexQueryParserTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexQueryParserTest.java @@ -1384,7 +1384,10 @@ public void parseIntergenicTest() { checkIntergenic(true, new Query(ANNOT_CONSEQUENCE_TYPE.key(), "intergenic_variant")); checkIntergenic(null, new Query(ANNOT_CONSEQUENCE_TYPE.key(), "missense_variant,intergenic_variant")); checkIntergenic(null, new Query(ANNOT_CONSEQUENCE_TYPE.key(), "intergenic_variant,missense_variant")); - + checkIntergenic(null, new Query(ANNOT_CONSEQUENCE_TYPE.key(), VariantAnnotationConstants.REGULATORY_REGION_VARIANT)); + checkIntergenic(false, new Query(ANNOT_CONSEQUENCE_TYPE.key(), VariantAnnotationConstants.REGULATORY_REGION_VARIANT) + .append(ANNOT_BIOTYPE.key(), "protein_coding")); + // Nonsense combination checkIntergenic(false, new Query(ANNOT_CONSEQUENCE_TYPE.key(), "intergenic_variant").append(ANNOT_BIOTYPE.key(), "protein_coding")); } @@ -1570,6 +1573,19 @@ public void testCoveredQuery_ct() { parseAnnotationIndexQuery(query, true); assertTrue(query.isEmpty()); + query = new Query().append(ANNOT_CONSEQUENCE_TYPE.key(), String.join(OR, VariantAnnotationConstants.REGULATORY_REGION_VARIANT)); + parseAnnotationIndexQuery(query, true); + indexQuery = parseAnnotationIndexQuery(query, true); + assertTrue(indexQuery.getConsequenceTypeFilter().isNoOp()); + assertFalse(query.isEmpty()); // regulatory_region_variant can't be used for CT filter alone + + query = new Query().append(ANNOT_CONSEQUENCE_TYPE.key(), String.join(OR, VariantAnnotationConstants.REGULATORY_REGION_VARIANT)) + .append(ANNOT_BIOTYPE.key(), "protein_coding"); + indexQuery = parseAnnotationIndexQuery(query, true); + assertFalse(indexQuery.getConsequenceTypeFilter().isNoOp()); + assertFalse(indexQuery.getBiotypeFilter().isNoOp()); + assertTrue(query.isEmpty()); // regulatory_region_variant can be used together with biotype + query = new Query().append(ANNOT_CONSEQUENCE_TYPE.key(), String.join(OR, VariantAnnotationConstants.STOP_LOST)); parseAnnotationIndexQuery(query, false); indexQuery = parseAnnotationIndexQuery(query, false); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexTest.java index e15cb6ae4bb..9ee362f8724 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexTest.java @@ -897,29 +897,55 @@ public void testAggregationCorrectnessCt() throws Exception { @Test public void testAggregationCorrectnessTFBS() throws Exception { - testAggregationCorrectness(TF_BINDING_SITE_VARIANT, true); + // Special scenario. This CT might include intergenic values, so can't be used alone + testAggregationCorrectness(new Query(ANNOT_BIOTYPE.key(), "protein_coding"), TF_BINDING_SITE_VARIANT); } @Test public void testAggregationCorrectnessRegulatoryRegionVariant() throws Exception { - testAggregationCorrectness(REGULATORY_REGION_VARIANT); + // Special scenario. This CT might include intergenic values, so can't be used alone + testAggregationCorrectness(new Query(ANNOT_BIOTYPE.key(), "protein_coding"), + REGULATORY_REGION_VARIANT); + } + + @Test + public void testAggregationByIntergenicQuery() throws Exception { + SampleIndexVariantAggregationExecutor executor = new SampleIndexVariantAggregationExecutor(metadataManager, sampleIndexDBAdaptor); + + Query baseQuery = new Query(STUDY.key(), STUDY_NAME_3) + .append(SAMPLE.key(), "NA12877"); + + assertFalse(executor.canUseThisExecutor(new Query(baseQuery) + .append(ANNOT_CONSEQUENCE_TYPE.key(), REGULATORY_REGION_VARIANT), new QueryOptions(QueryOptions.FACET, "consequenceType"))); + assertFalse(executor.canUseThisExecutor(new Query(baseQuery) + .append(ANNOT_CONSEQUENCE_TYPE.key(), TF_BINDING_SITE_VARIANT), new QueryOptions(QueryOptions.FACET, "consequenceType"))); + + assertTrue(executor.canUseThisExecutor(new Query(baseQuery) + .append(ANNOT_CONSEQUENCE_TYPE.key(), REGULATORY_REGION_VARIANT) + .append(ANNOT_BIOTYPE.key(), "protein_coding"), + new QueryOptions(QueryOptions.FACET, "consequenceType"))); + assertTrue(executor.canUseThisExecutor(new Query(baseQuery) + .append(ANNOT_CONSEQUENCE_TYPE.key(), TF_BINDING_SITE_VARIANT) + .append(ANNOT_BIOTYPE.key(), "protein_coding"), + new QueryOptions(QueryOptions.FACET, "consequenceType"))); } private void testAggregationCorrectness(String ct) throws Exception { - testAggregationCorrectness(ct, false); + testAggregationCorrectness(new Query(), ct); } - private void testAggregationCorrectness(String ct, boolean sampleIndexMightBeMoreAccurate) throws Exception { + private void testAggregationCorrectness(Query baseQuery, String ct) throws Exception { SampleIndexVariantAggregationExecutor executor = new SampleIndexVariantAggregationExecutor(metadataManager, sampleIndexDBAdaptor); - Query query = new Query(STUDY.key(), STUDY_NAME_3) + Query query = new Query(baseQuery) + .append(STUDY.key(), STUDY_NAME_3) .append(SAMPLE.key(), "NA12877") .append(ANNOT_CONSEQUENCE_TYPE.key(), ct); assertTrue(executor.canUseThisExecutor(query, new QueryOptions(QueryOptions.FACET, "consequenceType"))); AtomicInteger count = new AtomicInteger(0); sampleIndexDBAdaptor.iterator(new Query(query), new QueryOptions()).forEachRemaining(v -> count.incrementAndGet()); - FacetField facet = executor.aggregation(query, new QueryOptions(QueryOptions.FACET, "consequenceType")).first(); + FacetField facet = executor.aggregation(new Query(query), new QueryOptions(QueryOptions.FACET, "consequenceType")).first(); assertEquals(count.get(), facet.getCount()); FacetField.Bucket bucket = facet.getBuckets().stream().filter(b -> b.getValue().equals(ct)).findFirst().orElse(null); @@ -934,11 +960,7 @@ private void testAggregationCorrectness(String ct, boolean sampleIndexMightBeMor } } else { assertNotNull(msg, bucket); - if (sampleIndexMightBeMoreAccurate) { - assertThat(msg, count.get(), gte(bucket.getCount())); - } else { - assertEquals(msg, count.get(), bucket.getCount()); - } + assertEquals(msg, count.get(), bucket.getCount()); } }