Skip to content

Commit

Permalink
storage: Improve testing SampleIndexAggregation of intergenic queries…
Browse files Browse the repository at this point in the history
…. #TASK-6136
  • Loading branch information
j-coll committed May 31, 2024
1 parent 9f46a7c commit 1c7db62
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1216,18 +1216,7 @@ protected SampleAnnotationIndexQuery parseAnnotationIndexQuery(SampleIndexSchema
CtBtFtCombinationIndexSchema.Filter ctBtTfFilter = schema.getCtBtTfIndex().getField().noOpFilter();
IndexFilter clinicalFilter = schema.getClinicalIndexSchema().noOpFilter();

Boolean intergenic = null;

ParsedVariantQuery.VariantQueryXref variantQueryXref = VariantQueryParser.parseXrefs(query);
if (!isValidParam(query, REGION)) {
if (!variantQueryXref.getGenes().isEmpty()
&& variantQueryXref.getIds().isEmpty()
&& variantQueryXref.getOtherXrefs().isEmpty()
&& variantQueryXref.getVariants().isEmpty()) {
// If only filtering by genes, is not intergenic.
intergenic = false;
}
}
final Boolean intergenic = isIntergenicQuery(query);

// BiotypeConsquenceTypeFlagCombination combination = BiotypeConsquenceTypeFlagCombination
// .fromQuery(query, Arrays.asList(schema.getTranscriptFlagIndexSchema().getField().getConfiguration().getValues()));
Expand All @@ -1237,18 +1226,10 @@ protected SampleAnnotationIndexQuery parseAnnotationIndexQuery(SampleIndexSchema
boolean tfCovered = false;

if (isValidParam(query, ANNOT_CONSEQUENCE_TYPE)) {
List<String> soNames = query.getAsStringList(VariantQueryParam.ANNOT_CONSEQUENCE_TYPE.key());
soNames = soNames.stream()
List<String> soNames = query.getAsStringList(VariantQueryParam.ANNOT_CONSEQUENCE_TYPE.key())
.stream()
.map(ct -> ConsequenceTypeMappings.accessionToTerm.get(VariantQueryUtils.parseConsequenceType(ct)))
.collect(Collectors.toList());
if (!soNames.contains(VariantAnnotationConstants.INTERGENIC_VARIANT)
&& !soNames.contains(VariantAnnotationConstants.REGULATORY_REGION_VARIANT)
&& !soNames.contains(VariantAnnotationConstants.TF_BINDING_SITE_VARIANT)) {
// All ct values but "intergenic_variant" and "regulatory_region_variant" are in genes (i.e. non-intergenic)
intergenic = false;
} else if (soNames.size() == 1 && soNames.contains(VariantAnnotationConstants.INTERGENIC_VARIANT)) {
intergenic = true;
} // else, leave undefined : intergenic = null
boolean ctFilterCoveredBySummary = false;
boolean ctBtCombinationCoveredBySummary = false;
if (SampleIndexSchema.CUSTOM_LOF.containsAll(soNames)) {
Expand Down Expand Up @@ -1295,14 +1276,17 @@ protected SampleAnnotationIndexQuery parseAnnotationIndexQuery(SampleIndexSchema
}
}

// Do not use ctIndex if the CT filter is covered by the summary
// Use the ctIndex if:
// Do not use ctIndex for intergenic queries (intergenic == true)
// or queries that might return intergenic variants (intergenic == null)
//
// Use the ctIndex if any of:
// - The CtFilter is not covered by the summary
// - The query has the combination CT+BT , and it is not covered by the summary
// - The query has the combination CT+TF
boolean useCtIndexFilter = !ctFilterCoveredBySummary
|| (!ctBtCombinationCoveredBySummary && combination.isBiotype())
|| combination.isFlag();
boolean useCtIndexFilter =
intergenic == Boolean.FALSE && (!ctFilterCoveredBySummary
|| (!ctBtCombinationCoveredBySummary && combination.isBiotype())
|| combination.isFlag());
if (useCtIndexFilter) {
ctCovered = completeIndex;
consequenceTypeFilter = schema.getCtIndex().getField().buildFilter(new OpValue<>("=", soNames));
Expand All @@ -1317,8 +1301,6 @@ protected SampleAnnotationIndexQuery parseAnnotationIndexQuery(SampleIndexSchema
}

if (isValidParam(query, ANNOT_BIOTYPE)) {
// All biotype values are in genes (i.e. non-intergenic)
intergenic = false;
boolean biotypeFilterCoveredBySummary = false;
List<String> biotypes = query.getAsStringList(VariantQueryParam.ANNOT_BIOTYPE.key());
if (BIOTYPE_SET.containsAll(biotypes)) {
Expand Down Expand Up @@ -1350,8 +1332,6 @@ protected SampleAnnotationIndexQuery parseAnnotationIndexQuery(SampleIndexSchema
List<String> transcriptFlags = query.getAsStringList(ANNOT_TRANSCRIPT_FLAG.key());
tfFilter = schema.getTranscriptFlagIndexSchema().getField().buildFilter(new OpValue<>("=", transcriptFlags));
tfCovered = completeIndex & tfFilter.isExactFilter();
// Transcript flags are in transcripts/genes. (i.e. non-intergenic)
intergenic = false;
// TranscriptFlag filter is covered by index
if (tfCovered) {
if (!isValidParam(query, GENE) && simpleCombination(combination)) {
Expand Down Expand Up @@ -1538,12 +1518,60 @@ protected SampleAnnotationIndexQuery parseAnnotationIndexQuery(SampleIndexSchema
// If intergenic is undefined, or true, CT and BT filters can not be used.
biotypeFilter = schema.getBiotypeIndex().getField().noOpFilter();
consequenceTypeFilter = schema.getCtIndex().getField().noOpFilter();
if (!biotypeFilter.isNoOp()) {
throw new IllegalStateException("Unexpected BT filter for intergenic=" + intergenic);
}
if (!consequenceTypeFilter.isNoOp()) {
throw new IllegalStateException("Unexpected CT filter for intergenic=" + intergenic);
}
}

return new SampleAnnotationIndexQuery(new byte[]{annotationIndexMask, annotationIndex},
consequenceTypeFilter, biotypeFilter, tfFilter, ctBtTfFilter, clinicalFilter, populationFrequencyFilter);
}

private Boolean isIntergenicQuery(Query query) {
ParsedVariantQuery.VariantQueryXref variantQueryXref = VariantQueryParser.parseXrefs(query);
if (!isValidParam(query, REGION)) {
if (!variantQueryXref.getGenes().isEmpty()
&& variantQueryXref.getIds().isEmpty()
&& variantQueryXref.getOtherXrefs().isEmpty()
&& variantQueryXref.getVariants().isEmpty()) {
// If only filtering by genes, is not intergenic.
return false;
}
}

if (isValidParam(query, ANNOT_BIOTYPE)) {
// All biotype values are in genes (i.e. non-intergenic)
return false;
}
if (isValidParam(query, ANNOT_BIOTYPE)) {
// All biotype values are in genes (i.e. non-intergenic)
return false;
}
if (isValidParam(query, ANNOT_TRANSCRIPT_FLAG)) {
// Transcript flags are in transcripts/genes. (i.e. non-intergenic)
return false;
}
if (isValidParam(query, ANNOT_CONSEQUENCE_TYPE)) {
List<String> soNames = query.getAsStringList(VariantQueryParam.ANNOT_CONSEQUENCE_TYPE.key());
soNames = soNames.stream()
.map(ct -> ConsequenceTypeMappings.accessionToTerm.get(VariantQueryUtils.parseConsequenceType(ct)))
.collect(Collectors.toList());
if (!soNames.contains(VariantAnnotationConstants.INTERGENIC_VARIANT)
&& !soNames.contains(VariantAnnotationConstants.REGULATORY_REGION_VARIANT)
&& !soNames.contains(VariantAnnotationConstants.TF_BINDING_SITE_VARIANT)) {
// All ct values but "intergenic_variant" and "regulatory_region_variant" are in genes (i.e. non-intergenic)
return false;
} else if (soNames.size() == 1 && soNames.contains(VariantAnnotationConstants.INTERGENIC_VARIANT)) {
return true;
} // else, leave undefined : intergenic = null
}
// Unable to determine if the query is intergenic or not. Return null for uncertain.
return null;
}

private boolean simpleCombination(BiotypeConsquenceTypeFlagCombination combination) {
return combination.numParams() == 1;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1384,7 +1384,10 @@ public void parseIntergenicTest() {
checkIntergenic(true, new Query(ANNOT_CONSEQUENCE_TYPE.key(), "intergenic_variant"));
checkIntergenic(null, new Query(ANNOT_CONSEQUENCE_TYPE.key(), "missense_variant,intergenic_variant"));
checkIntergenic(null, new Query(ANNOT_CONSEQUENCE_TYPE.key(), "intergenic_variant,missense_variant"));

checkIntergenic(null, new Query(ANNOT_CONSEQUENCE_TYPE.key(), VariantAnnotationConstants.REGULATORY_REGION_VARIANT));
checkIntergenic(false, new Query(ANNOT_CONSEQUENCE_TYPE.key(), VariantAnnotationConstants.REGULATORY_REGION_VARIANT)
.append(ANNOT_BIOTYPE.key(), "protein_coding"));

// Nonsense combination
checkIntergenic(false, new Query(ANNOT_CONSEQUENCE_TYPE.key(), "intergenic_variant").append(ANNOT_BIOTYPE.key(), "protein_coding"));
}
Expand Down Expand Up @@ -1570,6 +1573,19 @@ public void testCoveredQuery_ct() {
parseAnnotationIndexQuery(query, true);
assertTrue(query.isEmpty());

query = new Query().append(ANNOT_CONSEQUENCE_TYPE.key(), String.join(OR, VariantAnnotationConstants.REGULATORY_REGION_VARIANT));
parseAnnotationIndexQuery(query, true);
indexQuery = parseAnnotationIndexQuery(query, true);
assertTrue(indexQuery.getConsequenceTypeFilter().isNoOp());
assertFalse(query.isEmpty()); // regulatory_region_variant can't be used for CT filter alone

query = new Query().append(ANNOT_CONSEQUENCE_TYPE.key(), String.join(OR, VariantAnnotationConstants.REGULATORY_REGION_VARIANT))
.append(ANNOT_BIOTYPE.key(), "protein_coding");
indexQuery = parseAnnotationIndexQuery(query, true);
assertFalse(indexQuery.getConsequenceTypeFilter().isNoOp());
assertFalse(indexQuery.getBiotypeFilter().isNoOp());
assertTrue(query.isEmpty()); // regulatory_region_variant can be used together with biotype

query = new Query().append(ANNOT_CONSEQUENCE_TYPE.key(), String.join(OR, VariantAnnotationConstants.STOP_LOST));
parseAnnotationIndexQuery(query, false);
indexQuery = parseAnnotationIndexQuery(query, false);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -897,29 +897,55 @@ public void testAggregationCorrectnessCt() throws Exception {

@Test
public void testAggregationCorrectnessTFBS() throws Exception {
testAggregationCorrectness(TF_BINDING_SITE_VARIANT, true);
// Special scenario. This CT might include intergenic values, so can't be used alone
testAggregationCorrectness(new Query(ANNOT_BIOTYPE.key(), "protein_coding"), TF_BINDING_SITE_VARIANT);
}

@Test
public void testAggregationCorrectnessRegulatoryRegionVariant() throws Exception {
testAggregationCorrectness(REGULATORY_REGION_VARIANT);
// Special scenario. This CT might include intergenic values, so can't be used alone
testAggregationCorrectness(new Query(ANNOT_BIOTYPE.key(), "protein_coding"),
REGULATORY_REGION_VARIANT);
}

@Test
public void testAggregationByIntergenicQuery() throws Exception {
SampleIndexVariantAggregationExecutor executor = new SampleIndexVariantAggregationExecutor(metadataManager, sampleIndexDBAdaptor);

Query baseQuery = new Query(STUDY.key(), STUDY_NAME_3)
.append(SAMPLE.key(), "NA12877");

assertFalse(executor.canUseThisExecutor(new Query(baseQuery)
.append(ANNOT_CONSEQUENCE_TYPE.key(), REGULATORY_REGION_VARIANT), new QueryOptions(QueryOptions.FACET, "consequenceType")));
assertFalse(executor.canUseThisExecutor(new Query(baseQuery)
.append(ANNOT_CONSEQUENCE_TYPE.key(), TF_BINDING_SITE_VARIANT), new QueryOptions(QueryOptions.FACET, "consequenceType")));

assertTrue(executor.canUseThisExecutor(new Query(baseQuery)
.append(ANNOT_CONSEQUENCE_TYPE.key(), REGULATORY_REGION_VARIANT)
.append(ANNOT_BIOTYPE.key(), "protein_coding"),
new QueryOptions(QueryOptions.FACET, "consequenceType")));
assertTrue(executor.canUseThisExecutor(new Query(baseQuery)
.append(ANNOT_CONSEQUENCE_TYPE.key(), TF_BINDING_SITE_VARIANT)
.append(ANNOT_BIOTYPE.key(), "protein_coding"),
new QueryOptions(QueryOptions.FACET, "consequenceType")));
}

private void testAggregationCorrectness(String ct) throws Exception {
testAggregationCorrectness(ct, false);
testAggregationCorrectness(new Query(), ct);
}

private void testAggregationCorrectness(String ct, boolean sampleIndexMightBeMoreAccurate) throws Exception {
private void testAggregationCorrectness(Query baseQuery, String ct) throws Exception {
SampleIndexVariantAggregationExecutor executor = new SampleIndexVariantAggregationExecutor(metadataManager, sampleIndexDBAdaptor);

Query query = new Query(STUDY.key(), STUDY_NAME_3)
Query query = new Query(baseQuery)
.append(STUDY.key(), STUDY_NAME_3)
.append(SAMPLE.key(), "NA12877")
.append(ANNOT_CONSEQUENCE_TYPE.key(), ct);
assertTrue(executor.canUseThisExecutor(query, new QueryOptions(QueryOptions.FACET, "consequenceType")));

AtomicInteger count = new AtomicInteger(0);
sampleIndexDBAdaptor.iterator(new Query(query), new QueryOptions()).forEachRemaining(v -> count.incrementAndGet());
FacetField facet = executor.aggregation(query, new QueryOptions(QueryOptions.FACET, "consequenceType")).first();
FacetField facet = executor.aggregation(new Query(query), new QueryOptions(QueryOptions.FACET, "consequenceType")).first();

assertEquals(count.get(), facet.getCount());
FacetField.Bucket bucket = facet.getBuckets().stream().filter(b -> b.getValue().equals(ct)).findFirst().orElse(null);
Expand All @@ -934,11 +960,7 @@ private void testAggregationCorrectness(String ct, boolean sampleIndexMightBeMor
}
} else {
assertNotNull(msg, bucket);
if (sampleIndexMightBeMoreAccurate) {
assertThat(msg, count.get(), gte(bucket.getCount()));
} else {
assertEquals(msg, count.get(), bucket.getCount());
}
assertEquals(msg, count.get(), bucket.getCount());
}
}

Expand Down

0 comments on commit 1c7db62

Please sign in to comment.