Skip to content

Commit

Permalink
MRG: Fix tax metagenome to work on gather output created with `--es…
Browse files Browse the repository at this point in the history
…timate-ani-ci` (#2952)

The `tax metagenome` code errors out with a confusing error message when
it is given gather results that contain confidence intervals for the ANI
calculations. This PR adds a test and fixes the problem.

## Details

The error message is:
```
ERROR: '/var/folders/6s/_f373w1d6hdfjc2kjstq97s80000gp/T/sourmashtest_rs5l3b23/gather.csv' is missing columns needed for taxonomic summarization. Please run gather with sourmash >= 4.4.
```

and it is caused by `GatherRow` encountering the extra columns that
`--estimate-ani-ci` adds to the gather output, such as
`query_containment_ani_low`.

The fix is to add these columns in as optional/unused columns in the
`GatherRow` dataclass.
  • Loading branch information
ctb committed Jan 30, 2024
1 parent 857c0a5 commit 9033d6d
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 0 deletions.
4 changes: 4 additions & 0 deletions src/sourmash/tax/tax_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1540,6 +1540,10 @@ class GatherRow:
n_unique_weighted_found: int = None
sum_weighted_found: int = None
total_weighted_hashes: int = None
query_containment_ani_low: float = None
query_containment_ani_high: float = None
match_containment_ani_low: float = None
match_containment_ani_high: float = None


@dataclass
Expand Down
43 changes: 43 additions & 0 deletions tests/test_tax.py
Original file line number Diff line number Diff line change
Expand Up @@ -2201,6 +2201,49 @@ def test_genome_ani_lemonade_classify(runtmp):
assert 'MAG3_1,d__Bacteria,p__Bacteroidota,c__Chlorobia,o__Chlorobiales,f__Chlorobiaceae,g__Prosthecochloris,s__Prosthecochloris vibrioformis' in output


def test_genome_ani_lemonade_classify_estimate_ani_ci(runtmp):
    """Classify a MAG from gather output that includes ANI confidence intervals.

    Runs `gather` with `--estimate-ani-ci` on the lemonade MAG from
    STAMPS 2022 (real data!), then feeds the resulting CSV to `tax genome`
    with both the `human` and `lineage_csv` output formats and checks the
    reported classification.
    """
    c = runtmp

    ## first run gather
    genome = utils.get_test_data('tax/lemonade-MAG3.sig.gz')
    matches = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.zip')

    # Spell the flag out in full: the abbreviated '--estimate-ani' only
    # works via argparse prefix matching and would become ambiguous if
    # another '--estimate-ani*' option were ever added.
    c.run_sourmash('gather', genome, matches,
                   '--threshold-bp=5000', '-o', 'gather.csv',
                   '--estimate-ani-ci')

    print(c.last_result.status)
    print(c.last_result.out)
    print(c.last_result.err)

    assert c.last_result.status == 0

    this_gather_file = c.output('gather.csv')
    this_gather = Path(this_gather_file).read_text().splitlines()

    assert len(this_gather) == 4
    # Make sure the confidence-interval columns are really in the CSV header;
    # otherwise this test would not exercise the extra-column handling at all.
    assert 'query_containment_ani_low' in this_gather[0]

    ## now run 'tax genome' with human output
    taxonomy_file = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.tax.csv')
    c.run_sourmash('tax', 'genome', '-g', this_gather_file, '-t', taxonomy_file,
                   '--ani', '0.8', '-F', 'human')

    output = c.last_result.out
    assert 'MAG3_1 match 5.3% 91.0% d__Bacteria;p__Bacteroidota;c__Chlorobia;o__Chlorobiales;f__Chlorobiaceae;g__Prosthecochloris;s__Prosthecochloris vibrioformis' in output

    # aaand classify to lineage_csv
    c.run_sourmash('tax', 'genome', '-g', this_gather_file, '-t', taxonomy_file,
                   '--ani', '0.8', '-F', 'lineage_csv')

    print(c.last_result.status)
    print(c.last_result.out)
    print(c.last_result.err)
    output = c.last_result.out
    assert 'ident,superkingdom,phylum,class,order,family,genus,species' in output
    assert 'MAG3_1,d__Bacteria,p__Bacteroidota,c__Chlorobia,o__Chlorobiales,f__Chlorobiaceae,g__Prosthecochloris,s__Prosthecochloris vibrioformis' in output


def test_metagenome_no_gather_csv(runtmp):
# test tax metagenome with no -g
taxonomy_file = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.tax.csv')
Expand Down

0 comments on commit 9033d6d

Please sign in to comment.