Skip to content

Commit

Permalink
skip clustering when both are skani
Browse files Browse the repository at this point in the history
  • Loading branch information
AroneyS committed Nov 24, 2023
1 parent 07c013b commit aa50e77
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 11 deletions.
4 changes: 2 additions & 2 deletions src/cluster_argument_parsing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -901,7 +901,7 @@ pub fn generate_galah_clusterer<'a>(
) -> std::result::Result<GalahClusterer<'a>, String> {
crate::external_command_checker::check_for_fastani();

let repeat_clusterer = {
let skip_clusterer = {
clap_matches
.get_one::<String>(&argument_definition.dereplication_precluster_method_argument)
.unwrap()
Expand Down Expand Up @@ -982,7 +982,7 @@ pub fn generate_galah_clusterer<'a>(
}),
"skani" => Preclusterer::Skani(SkaniPreclusterer {
threshold: {
if repeat_clusterer {
if skip_clusterer {
parse_percentage(
clap_matches,
&argument_definition.dereplication_ani_argument,
Expand Down
51 changes: 42 additions & 9 deletions src/clusterer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ pub fn cluster<P: PreclusterDistanceFinder, C: ClusterDistanceFinder + std::mark
preclusterer_name, clusterer_name
);

let mut skip_clusterer = false;
if clusterer_name == preclusterer_name {
info!("Preclustering and clustering methods are the same, so skipping preclustering");
skip_clusterer = true;
}

// Dashing all the genomes together
let dashing_cache = preclusterer.distances(genomes);

Expand Down Expand Up @@ -83,6 +89,7 @@ pub fn cluster<P: PreclusterDistanceFinder, C: ClusterDistanceFinder + std::mark
clusterer,
&precluster_dashing_cache,
precluster_genomes.as_slice(),
skip_clusterer,
);
debug!(
"In precluster {}, found {} genome representatives",
Expand Down Expand Up @@ -149,6 +156,7 @@ fn find_dashing_fastani_representatives(
clusterer: &(impl ClusterDistanceFinder + std::marker::Sync),
dashing_cache: &SortedPairGenomeDistanceCache,
genomes: &[&str],
skip_clusterer: bool,
) -> (BTreeSet<usize>, SortedPairGenomeDistanceCache) {
let mut clusters_to_return: BTreeSet<usize> = BTreeSet::new();
let mut fastani_cache = SortedPairGenomeDistanceCache::new();
Expand All @@ -169,15 +177,23 @@ fn find_dashing_fastani_representatives(
.collect();

// FastANI all potential reps against the current genome
let fastanis = calculate_fastani_many_to_one_pairwise_stop_early(
clusterer,
potential_refs
.iter()
.map(|ref_index| genomes[*ref_index])
.collect::<Vec<_>>()
.as_slice(),
genomes[i],
);
let fastanis = if skip_clusterer {
compute_ani_from_preclusterer(
dashing_cache,
potential_refs.iter().collect::<Vec<_>>().as_slice(),
&i,
)
} else {
calculate_fastani_many_to_one_pairwise_stop_early(
clusterer,
potential_refs
.iter()
.map(|ref_index| genomes[*ref_index])
.collect::<Vec<_>>()
.as_slice(),
genomes[i],
)
};
let mut is_rep = true;
for (potential_ref, fastani) in potential_refs.into_iter().zip(fastanis.iter()) {
debug!(
Expand Down Expand Up @@ -245,6 +261,23 @@ fn calculate_fastani_many_to_one_pairwise_stop_early(
to_return.into_inner().unwrap()
}

/// Look up ANI values in the preclusterer's distance cache rather than
/// computing them again with the clusterer.
///
/// Returns exactly one entry per element of `query_genome_indexes`, in the same
/// order. Each entry is the cached value for the pair
/// `(query, ref_genome_index)`, or `None` when the cache holds no entry for
/// that pair or the stored entry is itself `None`.
fn compute_ani_from_preclusterer(
    dashing_cache: &SortedPairGenomeDistanceCache,
    query_genome_indexes: &[&usize],
    ref_genome_index: &usize,
) -> Vec<Option<f32>> {
    query_genome_indexes
        .iter()
        .map(|query_genome| {
            // Flatten per element (`Option::flatten`), not across the iterator:
            // a cache miss must still occupy a slot, because the caller pairs
            // this vector positionally with its list of candidate genomes.
            // Dropping misses would shift later values onto the wrong genome.
            dashing_cache
                .get(&(**query_genome, *ref_genome_index))
                .copied()
                .flatten()
        })
        .collect()
}

// /// For each genome (sketch) assign it to the closest representative genome:
// fn find_minhash_memberships(
// representatives: &BTreeSet<usize>,
Expand Down

0 comments on commit aa50e77

Please sign in to comment.