diff --git a/.coveragerc b/.coveragerc
index d284e1014c..09b80d8275 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -6,6 +6,5 @@ omit =
doc/conf.py
setup.py
tests/*
- third-party/smhasher/MurmurHash3.cc
.tox/*
benchmarks/*
diff --git a/.gitignore b/.gitignore
index 0aa0e7a6f1..0b797c7bf5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,14 +13,9 @@ dist
build
sourmash.egg-info
.ipynb_checkpoints
-_minhash.so
.cache
*.so
.coverage
-sourmash_lib/_minhash.cpp
-sourmash/_minhash.cpp
-.asv/
-.eggs/
.pytest_cache
.python-version
sourmash/version.py
@@ -30,6 +25,9 @@ sourmash/_lowlevel*.py
.env
Pipfile
Pipfile.lock
-ocf/target/
target/
Cargo.lock
+.eggs
+.asv
+pkg/
+wasm-pack.log
diff --git a/.travis.yml b/.travis.yml
index 937b4b6c0b..7483db938e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,16 +7,18 @@ cache:
- "$HOME/.cache/pip"
- "$HOME/.cargo"
- "target"
- - ".tox"
branches:
only:
- master
- "/^v.*$/"
-script: tox
+script: tox -vv
-install: pip install tox-travis
+install:
+ - source .travis/install_cargo.sh
+
+before_script: pip install tox-travis
jobs:
allow_failures:
@@ -28,12 +30,14 @@ jobs:
- &test
stage: test
- python: 2.7
+ python: 3.6
- <<: *test
os: osx
+ osx_image: xcode10.1
+ python: 3.7
language: generic
env:
- - TOXENV=py36
+ - TOXENV=py37
- <<: *test
python: 3.7
name: integration (ipfs/redis)
@@ -45,7 +49,7 @@ jobs:
- redis-server
- docker
- <<: *test
- python: 3.6
+ python: 2.7
- <<: *test
python: 3.5
@@ -55,12 +59,16 @@ jobs:
services:
- docker
env:
- - PIP=pip
+ - CIBW_BUILD='cp37-*'
- CIBW_SKIP='*-manylinux_i686'
- install: skip
+ - CIBW_BEFORE_BUILD='source .travis/install_cargo.sh'
+ - CIBW_ENVIRONMENT='PATH="$HOME/.cargo/bin:$PATH"'
+ - CIBW_ENVIRONMENT_MACOS='MACOSX_DEPLOYMENT_TARGET=10.11'
+ before_script: skip
script:
- - sudo $PIP install cibuildwheel==1.0.0
- - cibuildwheel --output-dir wheelhouse
+ - python -m pip install -U pip setuptools
+ - python -m pip install cibuildwheel==1.1.0
+ - python -m cibuildwheel --output-dir wheelhouse
deploy:
provider: releases
api_key:
@@ -73,12 +81,7 @@ jobs:
- <<: *wheel
os: osx
osx_image: xcode10.1
- language: generic
- before_script:
- - sudo $PIP install -U pip setuptools
- env:
- - PIP=pip2
- - CIBW_ENVIRONMENT_MACOS='MACOSX_DEPLOYMENT_TARGET=10.11'
+ language: shell
stages:
- check
diff --git a/MANIFEST.in b/MANIFEST.in
index 69df9b2280..1d206dc896 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,10 +1,16 @@
include LICENSE Makefile Dockerfile LICENSE Makefile README.md requirements.txt
include index.ipynb
+include sourmash VERSION
recursive-include sourmash_lib *
recursive-include sourmash *
-recursive-include third-party *.cc *.h
-exclude tests/*
+recursive-include src *.rs
+recursive-include benches *.rs
+include Cargo.toml
+include include/sourmash.h
+prune .eggs
+global-exclude *.rlib
global-exclude *.orig
global-exclude *.pyc
global-exclude *.so
prune tests/test-data/
+global-exclude *.git/
diff --git a/Makefile b/Makefile
index 8bb87d29dd..559ad81cbc 100644
--- a/Makefile
+++ b/Makefile
@@ -1,10 +1,13 @@
PYTHON ?= python
-all:
- $(PYTHON) setup.py build_ext -i
+all: build
.PHONY:
+build: .PHONY  # always rebuild; an existing build/ directory must not mark this target up to date
+	$(PYTHON) setup.py build_ext -i
+	cargo build
+
clean:
$(PYTHON) setup.py clean --all
rm -f sourmash/*.so
@@ -19,6 +22,7 @@ dist: FORCE
test: all
pip install -e '.[test]'
$(PYTHON) -m pytest
+ cargo test
doc: .PHONY
cd doc && make html
@@ -29,12 +33,12 @@ include/sourmash.h: src/lib.rs src/ffi/minhash.rs src/ffi/signature.rs src/ffi/n
rustup override set stable
coverage: all
- $(PYTHON) setup.py clean --all
- SOURMASH_COVERAGE=1 $(PYTHON) setup.py build_ext -i
+ $(PYTHON) setup.py build_ext -i
$(PYTHON) -m pytest --cov=. --cov-report term-missing
benchmark:
asv continuous master `git rev-parse HEAD`
+ cargo bench
check:
cargo build
diff --git a/README.md b/README.md
index e2d6619cb3..b1297dade1 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,19 @@
+
+
# sourmash
[![Documentation](https://readthedocs.org/projects/sourmash/badge/?version=latest)](http://sourmash.readthedocs.io/en/latest/)
[![Build Status](https://travis-ci.com/dib-lab/sourmash.svg?branch=master)](https://travis-ci.com/dib-lab/sourmash)
+
[![codecov](https://codecov.io/gh/dib-lab/sourmash/branch/master/graph/badge.svg)](https://codecov.io/gh/dib-lab/sourmash)
[![DOI](http://joss.theoj.org/papers/10.21105/joss.00027/status.svg)](http://joss.theoj.org/papers/10.21105/joss.00027)
+
+
+🦀
+[![](http://meritbadge.herokuapp.com/sourmash)](https://crates.io/crates/sourmash)
+[![Rust API Documentation on docs.rs](https://docs.rs/sourmash/badge.svg)](https://docs.rs/sourmash)
+
+---
Compute MinHash signatures for nucleotide (DNA/RNA) and protein sequences.
@@ -13,7 +23,7 @@ Usage:
sourmash compare *.sig -o distances
sourmash plot distances
-Sourmash 1.0 is [published on JOSS](https://doi.org/10.21105/joss.00027); please cite that paper if you use sourmash (`doi: 10.21105/joss.00027`):.
+sourmash 1.0 is [published on JOSS](https://doi.org/10.21105/joss.00027); please cite that paper if you use sourmash (`doi: 10.21105/joss.00027`).
----
@@ -48,9 +58,10 @@ A quickstart tutorial [is available](https://sourmash.readthedocs.io/en/latest/t
### Requirements
sourmash runs under both Python 2.7.x and Python 3.5+. The base
-requirements are screed and ijson, together with a C++ development
-environment and the CPython development headers and libraries (for the
-C++ extension).
+requirements are screed and ijson, together with a Rust environment (for the
+extension code). We suggest using `rustup` to install the Rust environment:
+
+ curl https://sh.rustup.rs -sSf | sh
The comparison code (`sourmash compare`) uses numpy, and the plotting
code uses matplotlib and scipy, but most of the code is usable without
diff --git a/doc/developer.md b/doc/developer.md
index cd4ab13847..79f2411e9f 100644
--- a/doc/developer.md
+++ b/doc/developer.md
@@ -7,7 +7,13 @@ You can get the latest development master branch with:
```
git clone https://github.com/dib-lab/sourmash.git
```
-To install all of the necessary dependencies, do:
+sourmash runs under both Python 2.7.x and Python 3.5+. The base
+requirements are screed and ijson, together with a Rust environment (for the
+extension code). We suggest using `rustup` to install the Rust environment:
+
+ curl https://sh.rustup.rs -sSf | sh
+
+To install all of the necessary Python dependencies, do:
```
pip install -r requirements.txt
```
@@ -25,13 +31,6 @@ pip install -e .
We use [Travis][0] for continuous integration.
-Code coverage calculation is enabled (on Linux only) by running
-`make coverage`. This recompiles the C++ extension without
-optimization and with coverage configured. See `setup.py` for
-more information on this; the environment variable
-`SOURMASH_COVERAGE` controls whether the C++ extension is
-compiled with code coverage analysis enabled.
-
Code coverage can be viewed interactively at [codecov.io][1].
[0]:https://travis-ci.org/dib-lab/sourmash
diff --git a/netlify.toml b/netlify.toml
new file mode 100644
index 0000000000..994c94d665
--- /dev/null
+++ b/netlify.toml
@@ -0,0 +1,12 @@
+# Configuration for pull request documentation previews via Netlify
+
+[build]
+publish = "_build/html"
+base = "doc"
+command = '''
+ cd .. && \
+ curl https://sh.rustup.rs -sSf | sh -s -- -y && \
+ source $HOME/.cargo/env && \
+ pip install -e .[doc] && \
+ make doc
+'''
diff --git a/setup.py b/setup.py
index 1f1720821d..2850339eda 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,32 @@
from __future__ import print_function
-import sys
-from setuptools import setup, find_packages
-from setuptools import Extension
import os
+from setuptools import setup, find_packages
+import sys
+
+
+DEBUG_BUILD = os.environ.get("SOURMASH_DEBUG") == "1"
+
+
+def build_native(spec):
+ cmd = ["cargo", "build", "--lib"]
+
+ target = "debug"
+ if not DEBUG_BUILD:
+ cmd.append("--release")
+ target = "release"
+
+ build = spec.add_external_build(cmd=cmd, path=".")
+
+ rtld_flags = ["NOW"]
+ if sys.platform == "darwin":
+ rtld_flags.append("NODELETE")
+ spec.add_cffi_module(
+ module_path="sourmash._lowlevel",
+ dylib=lambda: build.find_dylib("sourmash", in_path="target/%s" % target),
+ header_filename=lambda: build.find_header("sourmash.h", in_path="include"),
+ rtld_flags=rtld_flags,
+ )
-EXTRA_COMPILE_ARGS = ['-std=c++11', '-pedantic']
-EXTRA_LINK_ARGS=[]
CLASSIFIERS = [
"Environment :: Console",
@@ -15,7 +36,7 @@
"Natural Language :: English",
"Operating System :: POSIX :: Linux",
"Operating System :: MacOS :: MacOS X",
- "Programming Language :: C++",
+ "Programming Language :: Rust",
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
@@ -24,24 +45,10 @@
CLASSIFIERS.append("Development Status :: 5 - Production/Stable")
-if sys.platform == 'darwin': # Mac OS X?
- # force 64bit only builds
- EXTRA_COMPILE_ARGS.extend(['-arch', 'x86_64', '-mmacosx-version-min=10.7',
- '-stdlib=libc++'])
-
-else: # ...likely Linux
- if os.environ.get('SOURMASH_COVERAGE'):
- print('Turning on coverage analysis.')
- EXTRA_COMPILE_ARGS.extend(['-g', '--coverage', '-lgcov'])
- EXTRA_LINK_ARGS.extend(['--coverage', '-lgcov'])
- else:
- EXTRA_COMPILE_ARGS.append('-O3')
-
-with open('README.md', 'r') as readme:
+with open("README.md", "r") as readme:
LONG_DESCRIPTION = readme.read()
-SETUP_METADATA = \
- {
+SETUP_METADATA = {
"name": "sourmash",
"description": "tools for comparing DNA sequences with MinHash sketches",
"long_description": LONG_DESCRIPTION,
@@ -55,20 +62,18 @@
'sourmash = sourmash.__main__:main'
]
},
- "ext_modules": [Extension("sourmash._minhash",
- sources=["sourmash/_minhash.pyx",
- "third-party/smhasher/MurmurHash3.cc"],
- depends=["sourmash/kmer_min_hash.hh"],
- include_dirs=["./sourmash",
- "./third-party/smhasher/"],
- language="c++",
- extra_compile_args=EXTRA_COMPILE_ARGS,
- extra_link_args=EXTRA_LINK_ARGS)],
"install_requires": ["screed>=0.9", "ijson>=2.5.1", "khmer>=2.1", 'numpy',
+ "cffi",
'matplotlib', 'scipy', "deprecation>=2.0.6"],
- "setup_requires": ['Cython>=0.25.2', "setuptools>=38.6.0",
- 'setuptools_scm', 'setuptools_scm_git_archive'],
+ "setup_requires": [
+ "setuptools>=38.6.0",
+ "milksnake",
+ "setuptools_scm",
+ "setuptools_scm_git_archive",
+ ],
"use_scm_version": {"write_to": "sourmash/version.py"},
+ "zip_safe": False,
+ "platforms": "any",
"extras_require": {
'test' : ['pytest', 'pytest-cov'],
'demo' : ['jupyter', 'jupyter_client', 'ipython'],
@@ -76,13 +81,10 @@
"sphinxcontrib-napoleon", "nbsphinx"],
'10x': ['bam2fasta==1.0.1'],
'storage': ["ipfshttpclient", "redis"]
- },
- "include_package_data": True,
- "package_data": {
- "sourmash": ['*.pxd']
},
- "classifiers": CLASSIFIERS
- }
+ "include_package_data": True,
+ "classifiers": CLASSIFIERS,
+ "milksnake_tasks": [build_native],
+}
setup(**SETUP_METADATA)
-
diff --git a/sourmash/__init__.py b/sourmash/__init__.py
index 2893023d2a..319ccd1e75 100644
--- a/sourmash/__init__.py
+++ b/sourmash/__init__.py
@@ -7,12 +7,21 @@
import math
import os
-from ._minhash import (MinHash, get_minhash_default_seed, get_minhash_max_hash)
+from ._lowlevel import ffi, lib
+
+ffi.init_once(lib.sourmash_init, "init")
+
+from ._minhash import MinHash, get_minhash_default_seed, get_minhash_max_hash
+
DEFAULT_SEED = get_minhash_default_seed()
MAX_HASH = get_minhash_max_hash()
-from .signature import (load_signatures, load_one_signature, SourmashSignature,
- save_signatures)
+from .signature import (
+ load_signatures,
+ load_one_signature,
+ SourmashSignature,
+ save_signatures,
+)
from .sbtmh import load_sbt_index, search_sbt_index, create_sbt_index
from . import lca
from . import sbt
@@ -21,6 +30,7 @@
from . import signature
from pkg_resources import get_distribution, DistributionNotFound
+
try:
VERSION = get_distribution(__name__).version
except DistributionNotFound: # pragma: no cover
diff --git a/sourmash/_compat.py b/sourmash/_compat.py
new file mode 100644
index 0000000000..86b4e97f98
--- /dev/null
+++ b/sourmash/_compat.py
@@ -0,0 +1,24 @@
+import sys
+
+
+PY2 = sys.version_info[0] == 2
+
+if PY2:
+ text_type = unicode
+ int_types = (int, long)
+ string_types = (str, unicode)
+ range_type = xrange
+ itervalues = lambda x: x.itervalues()
+ NUL = '\x00'
+ def implements_to_string(cls):
+ cls.__unicode__ = cls.__str__
+ cls.__str__ = lambda x: x.__unicode__().encode('utf-8')
+ return cls
+else:
+ text_type = str
+ int_types = (int,)
+ string_types = (str,)
+ range_type = range
+ itervalues = lambda x: x.values()
+ NUL = 0
+ implements_to_string = lambda x: x
diff --git a/sourmash/_minhash.pxd b/sourmash/_minhash.pxd
deleted file mode 100644
index a5aa1f4e04..0000000000
--- a/sourmash/_minhash.pxd
+++ /dev/null
@@ -1,70 +0,0 @@
-# -*- coding: UTF-8 -*-
-# cython: language_level=3, c_string_type=str, c_string_encoding=ascii
-
-from __future__ import unicode_literals
-
-from libcpp cimport bool
-from libcpp.map cimport map
-from libcpp.memory cimport unique_ptr
-from libcpp.set cimport set as cppset
-from libcpp.string cimport string
-from libc.stdint cimport uint32_t, uint64_t
-from libcpp.vector cimport vector
-
-
-cdef extern from "kmer_min_hash.hh":
- ctypedef uint64_t HashIntoType;
- ctypedef vector[HashIntoType] CMinHashType;
-
-
- cdef uint64_t _hash_murmur(const string, uint32_t seed)
- cdef uint64_t _hash_murmur(const char *, unsigned int, uint32_t)
-
- cdef cppclass KmerMinHash:
- const uint32_t seed;
- const unsigned int num;
- const unsigned int ksize;
- const bool is_protein;
- const bool dayhoff;
- const bool hp;
- const HashIntoType max_hash;
- CMinHashType mins;
-
- KmerMinHash(unsigned int, unsigned int, bool, bool, bool, uint32_t, HashIntoType)
- void add_hash(HashIntoType) except +ValueError
- void remove_hash(HashIntoType) except +ValueError
- void add_word(const string& word) except +ValueError
- void add_word(const char * word) except +ValueError
- void add_sequence(const string&, bool) except +ValueError
- void merge(const KmerMinHash&) except +ValueError
- string aa_to_dayhoff(string aa) except +ValueError
- string aa_to_hp(string aa) except +ValueError
- string translate_codon(string codon) except +ValueError
- unsigned int count_common(const KmerMinHash&) except +ValueError
- unsigned long size()
-
-
- cdef cppclass KmerMinAbundance(KmerMinHash):
- CMinHashType abunds;
-
- KmerMinAbundance(unsigned int, unsigned int, bool, bool, bool, uint32_t, HashIntoType)
- void add_hash(HashIntoType) except +ValueError
- void remove_hash(HashIntoType) except +ValueError
- void add_word(string word) except +ValueError
- void add_word(const char * word) except +ValueError
- void add_sequence(const string&, bool) except +ValueError
- void merge(const KmerMinAbundance&) except +ValueError
- void merge(const KmerMinHash&) except +ValueError
- string aa_to_dayhoff(string aa) except +ValueError
- string aa_to_hp(string aa) except +ValueError
- string translate_codon(string codon) except +ValueError
- unsigned int count_common(const KmerMinAbundance&) except +ValueError
- unsigned long size()
-
-
-cdef class MinHash(object):
- cdef unique_ptr[KmerMinHash] _this
- cdef bool _track_abundance
-
- cpdef get_mins(self, bool with_abundance=*)
- cpdef set_abundances(self, dict)
diff --git a/sourmash/_minhash.py b/sourmash/_minhash.py
new file mode 100644
index 0000000000..2557fd218b
--- /dev/null
+++ b/sourmash/_minhash.py
@@ -0,0 +1,527 @@
+# -*- coding: UTF-8 -*-
+from __future__ import unicode_literals, division
+
+import math
+import copy
+
+from ._compat import string_types, range_type
+from ._lowlevel import ffi, lib
+from .utils import RustObject, rustcall, decode_str
+from .exceptions import SourmashError
+
+# default MurmurHash seed
+MINHASH_DEFAULT_SEED = 42
+
+
+def get_minhash_default_seed():
+ return MINHASH_DEFAULT_SEED
+
+
+# we use the 64-bit hash space of MurmurHash only
+# this is 2 ** 64 - 1 in hexadecimal
+MINHASH_MAX_HASH = 0xFFFFFFFFFFFFFFFF
+
+
+def get_minhash_max_hash():
+ return MINHASH_MAX_HASH
+
+
+def get_max_hash_for_scaled(scaled):
+ if scaled == 0:
+ return 0
+ elif scaled == 1:
+ return get_minhash_max_hash()
+
+ return int(round(get_minhash_max_hash() / scaled, 0))
+
+
+def get_scaled_for_max_hash(max_hash):
+ if max_hash == 0:
+ return 0
+ return int(round(get_minhash_max_hash() / max_hash, 0))
+
+
+def to_bytes(s):
+    """Coerce text, bytes or a single byte value (an int) to a byte string.
+
+    Raises TypeError for any other kind of input."""
+    if isinstance(s, bytes):
+        return s
+
+    if isinstance(s, string_types):
+        return s.encode("utf-8")
+
+    if isinstance(s, int):
+        # bytes([n]) is the text of a list on Python 2; going through
+        # bytearray yields exactly one byte on both Python 2 and 3.
+        return bytes(bytearray([s]))
+
+    raise TypeError("Requires a string-like sequence")
+
+
+def hash_murmur(kmer, seed=MINHASH_DEFAULT_SEED):
+    """hash_murmur(string, [,seed])
+
+    Compute a hash for a string, optionally using a seed (an integer).
+    The current default seed is returned by get_minhash_default_seed()."""
+    return lib.hash_murmur(to_bytes(kmer), seed)
+
+
+def dotproduct(a, b, normalize=True):
+    """
+    Dot product of two {key: abundance} dictionaries; when normalize is
+    true, each side is first divided by its Euclidean norm.
+    """
+
+    norm_a = 1.0
+    norm_b = 1.0
+    if normalize:
+        norm_a = math.sqrt(sum(x * x for x in a.values()))
+        norm_b = math.sqrt(sum(x * x for x in b.values()))
+
+        if norm_a == 0.0 or norm_b == 0.0:
+            # a zero-length side cannot be normalized
+            return 0.0
+
+    prod = 0.0
+    for k in a:
+        prod += (float(a[k]) / norm_a) * (b.get(k, 0) / norm_b)
+
+    return prod
+
+
+class MinHash(RustObject):
+ def __init__(
+ self,
+ n,
+ ksize,
+ is_protein=False,
+ dayhoff=False,
+ hp=False,
+ track_abundance=False,
+ seed=MINHASH_DEFAULT_SEED,
+ max_hash=0,
+ mins=None,
+ scaled=0,
+ ):
+ if max_hash and scaled:
+ raise ValueError("cannot set both max_hash and scaled")
+ elif scaled:
+ max_hash = get_max_hash_for_scaled(scaled)
+
+ if max_hash and n:
+ raise ValueError("cannot set both n and max_hash")
+
+ if not n and not (max_hash or scaled):
+ raise ValueError("cannot omit both n and scaled")
+
+ if dayhoff or hp:
+ is_protein = False
+
+ self._objptr = lib.kmerminhash_new(
+ n, ksize, is_protein, dayhoff, hp, seed, int(max_hash), track_abundance
+ )
+ self.__dealloc_func__ = lib.kmerminhash_free
+
+ if mins:
+ if track_abundance:
+ self.set_abundances(mins)
+ else:
+ self.add_many(mins)
+
+ def __copy__(self):
+ a = MinHash(
+ self.num,
+ self.ksize,
+ is_protein=self.is_protein,
+ dayhoff=self.dayhoff,
+ hp=self.hp,
+ track_abundance=self.track_abundance,
+ seed=self.seed,
+ max_hash=self.max_hash,
+ )
+ a.merge(self)
+ return a
+
+ def __getstate__(self): # enable pickling
+ return (
+ self.num,
+ self.ksize,
+ self.is_protein,
+ self.dayhoff,
+ self.hp,
+ self.get_mins(with_abundance=self.track_abundance),
+ None,
+ self.track_abundance,
+ self.max_hash,
+ self.seed,
+ )
+
+ def __setstate__(self, tup):
+ (n, ksize, is_protein, dayhoff, hp, mins, _, track_abundance, max_hash, seed) = tup
+
+ self.__del__()
+ self._objptr = lib.kmerminhash_new(
+ n, ksize, is_protein, dayhoff, hp, seed, max_hash, track_abundance
+ )
+ if track_abundance:
+ self.set_abundances(mins)
+ else:
+ self.add_many(mins)
+
+ def __reduce__(self):
+ return (
+ MinHash,
+ (
+ self.num,
+ self.ksize,
+ self.is_protein,
+ self.dayhoff,
+ self.hp,
+ self.track_abundance,
+ self.seed,
+ self.max_hash,
+ self.get_mins(with_abundance=self.track_abundance),
+ 0,
+ ),
+ )
+
+    def __eq__(self, other):  # equal iff other is a MinHash with identical pickled state
+        return isinstance(other, MinHash) and self.__getstate__() == other.__getstate__()
+
+ def copy_and_clear(self):
+ a = MinHash(
+ self.num,
+ self.ksize,
+ self.is_protein,
+ self.dayhoff,
+ self.hp,
+ self.track_abundance,
+ self.seed,
+ self.max_hash,
+ )
+ return a
+
+ def add_sequence(self, sequence, force=False):
+ self._methodcall(lib.kmerminhash_add_sequence, to_bytes(sequence), force)
+
+ def add(self, kmer):
+ "Add kmer into sketch."
+ self.add_sequence(kmer)
+
+    def add_many(self, hashes):
+        "Add many hashes at once, from an iterable of hashes or another MinHash."
+        if isinstance(hashes, MinHash):
+            self._methodcall(lib.kmerminhash_add_from, hashes._get_objptr())
+        else:
+            for h in hashes:
+                self._methodcall(lib.kmerminhash_add_hash, h)
+
+ def remove_many(self, hashes):
+ "Remove many hashes at once."
+ self._methodcall(lib.kmerminhash_remove_many, list(hashes), len(hashes))
+
+ def update(self, other):
+ "Update this estimator from all the hashes from the other."
+ self.add_many(other)
+
+ def __len__(self):
+ return self._methodcall(lib.kmerminhash_get_mins_size)
+
+ def get_mins(self, with_abundance=False):
+ size = self._methodcall(lib.kmerminhash_get_mins_size)
+ mins_ptr = self._methodcall(lib.kmerminhash_get_mins)
+
+ if with_abundance and self.track_abundance:
+ abunds_ptr = self._methodcall(lib.kmerminhash_get_abunds)
+ return dict(zip(ffi.unpack(mins_ptr, size), ffi.unpack(abunds_ptr, size)))
+ else:
+ return ffi.unpack(mins_ptr, size)
+
+ def get_hashes(self):
+ return self.get_mins()
+
+ def subtract_mins(self, other):
+ a = set(self.get_mins())
+ b = set(other.get_mins())
+ return a - b
+
+ @property
+ def seed(self):
+ return self._methodcall(lib.kmerminhash_seed)
+
+ @property
+ def num(self):
+ return self._methodcall(lib.kmerminhash_num)
+
+ @property
+ def scaled(self):
+ if self.max_hash:
+ return get_scaled_for_max_hash(self.max_hash)
+ return 0
+
+ @property
+ def is_dna(self):
+ return not (self.is_protein or self.dayhoff or self.hp)
+
+ @property
+ def is_protein(self):
+ return self._methodcall(lib.kmerminhash_is_protein)
+
+ @property
+ def dayhoff(self):
+ return self._methodcall(lib.kmerminhash_dayhoff)
+
+ @property
+ def hp(self):
+ return self._methodcall(lib.kmerminhash_hp)
+
+ @property
+ def ksize(self):
+ return self._methodcall(lib.kmerminhash_ksize)
+
+ @property
+ def max_hash(self):
+ return self._methodcall(lib.kmerminhash_max_hash)
+
+ @property
+ def track_abundance(self):
+ return self._methodcall(lib.kmerminhash_track_abundance)
+
+ @track_abundance.setter
+ def track_abundance(self, b):
+ if self.track_abundance == b:
+ return
+
+ if b is False:
+ self._methodcall(lib.kmerminhash_disable_abundance)
+ elif len(self) > 0:
+ raise RuntimeError("Can only set track_abundance=True if the MinHash is empty")
+ else:
+ self._methodcall(lib.kmerminhash_enable_abundance)
+
+ def add_hash(self, h):
+ return self._methodcall(lib.kmerminhash_add_hash, h)
+
+ def translate_codon(self, codon):
+ try:
+ return rustcall(lib.sourmash_translate_codon,
+ to_bytes(codon)).decode('utf-8')
+ except SourmashError as e:
+ raise ValueError(e.message)
+
+ def count_common(self, other):
+ if not isinstance(other, MinHash):
+ raise TypeError("Must be a MinHash!")
+ return self._methodcall(lib.kmerminhash_count_common, other._get_objptr())
+
+ def downsample_n(self, new_num):
+ if self.num and self.num < new_num:
+ raise ValueError("new sample n is higher than current sample n")
+
+ a = MinHash(
+ new_num, self.ksize, self.is_protein, self.dayhoff, self.hp, self.track_abundance, self.seed, 0
+ )
+ if self.track_abundance:
+ a.set_abundances(self.get_mins(with_abundance=True))
+ else:
+ a.add_many(self)
+
+ return a
+
+    def downsample_max_hash(self, *others):
+        """Downsample self to the smallest max_hash among self and others."""
+        # min() over one list also works when no others are passed
+        new_max_hash = min([self.max_hash] + [x.max_hash for x in others])
+        new_scaled = get_scaled_for_max_hash(new_max_hash)
+        return self.downsample_scaled(new_scaled)
+
+ def downsample_scaled(self, new_num):
+ if self.num:
+ raise ValueError("num != 0 - cannot downsample a standard MinHash")
+
+ max_hash = self.max_hash
+ if max_hash is None:
+ raise ValueError("no max_hash available - cannot downsample")
+
+ old_scaled = get_scaled_for_max_hash(self.max_hash)
+ if old_scaled > new_num:
+ raise ValueError(
+ "new scaled {} is lower than current sample scaled {}".format(
+ new_num, old_scaled
+ )
+ )
+
+ new_max_hash = get_max_hash_for_scaled(new_num)
+
+ a = MinHash(
+ 0,
+ self.ksize,
+ self.is_protein,
+ self.dayhoff,
+ self.hp,
+ self.track_abundance,
+ self.seed,
+ new_max_hash,
+ )
+ if self.track_abundance:
+ a.set_abundances(self.get_mins(with_abundance=True))
+ else:
+ a.add_many(self)
+
+ return a
+
+ def intersection(self, other, in_common=False):
+ if not isinstance(other, MinHash):
+ raise TypeError("Must be a MinHash!")
+
+ if self.num != other.num:
+ err = "must have same num: {} != {}".format(self.num, other.num)
+ raise TypeError(err)
+
+ if in_common:
+ # TODO: copy from buffer to Python land instead,
+ # this way involves more moving data around.
+ combined_mh = self.copy_and_clear()
+ combined_mh.merge(self)
+ combined_mh.merge(other)
+
+ size = len(combined_mh)
+ common = set(self.get_mins())
+ common.intersection_update(other.get_mins())
+ common.intersection_update(combined_mh.get_mins())
+ else:
+ size = self._methodcall(lib.kmerminhash_intersection, other._get_objptr())
+ common = set()
+
+ return common, max(size, 1)
+
+ def compare(self, other):
+ if self.num != other.num:
+ err = "must have same num: {} != {}".format(self.num, other.num)
+ raise TypeError(err)
+ return self._methodcall(lib.kmerminhash_compare, other._get_objptr())
+
+ def jaccard(self, other):
+ return self.compare(other)
+
+ def similarity(self, other, ignore_abundance=False):
+ """Calculate similarity of two sketches.
+
+ If the sketches are not abundance weighted, or ignore_abundance=True,
+ compute Jaccard similarity.
+
+ If the sketches are abundance weighted, calculate a distance metric
+ based on the cosine similarity.
+
+ Note, because the term frequencies (tf-idf weights) cannot be negative,
+ the angle will never be < 0deg or > 90deg.
+
+ See https://en.wikipedia.org/wiki/Cosine_similarity
+ """
+
+ # if either signature is flat, calculate Jaccard only.
+ if not (self.track_abundance and other.track_abundance) or ignore_abundance:
+ return self.jaccard(other)
+ else:
+ # can we merge? if not, raise exception.
+ aa = copy.copy(self)
+ aa.merge(other)
+
+ a = self.get_mins(with_abundance=True)
+ b = other.get_mins(with_abundance=True)
+
+ prod = dotproduct(a, b)
+ prod = min(1.0, prod)
+
+ distance = 2 * math.acos(prod) / math.pi
+ return 1.0 - distance
+
+ def contained_by(self, other):
+ """\
+ Calculate how much of self is contained by other.
+ """
+ if not len(self):
+ return 0.0
+
+ return self.count_common(other) / len(self)
+
+ def containment_ignore_maxhash(self, other):
+ if len(self) == 0:
+ return 0.0
+
+ if not isinstance(other, MinHash):
+ raise TypeError("Must be a MinHash!")
+
+ return self._methodcall(lib.kmerminhash_containment_ignore_maxhash, other._get_objptr())
+
+ def __iadd__(self, other):
+ if not isinstance(other, MinHash):
+ raise TypeError("Must be a MinHash!")
+ self._methodcall(lib.kmerminhash_merge, other._get_objptr())
+ return self
+
+ merge = __iadd__
+
+ def set_abundances(self, values):
+ if self.track_abundance:
+ added = 0
+
+ for k, v in sorted(values.items()):
+ if not self.max_hash or k <= self.max_hash:
+ self._methodcall(lib.kmerminhash_mins_push, k)
+ self._methodcall(lib.kmerminhash_abunds_push, v)
+ added += 1
+ if self.num > 0 and added >= self.num:
+ break
+ else:
+ raise RuntimeError(
+ "Use track_abundance=True when constructing "
+ "the MinHash to use set_abundances."
+ )
+
+ def add_protein(self, sequence):
+ ksize = self.ksize // 3
+ if len(sequence) < ksize:
+ return
+
+ aa_kmers = (sequence[i:i + ksize] for i in range(0, len(sequence) - ksize + 1))
+ if self.is_protein:
+ for aa_kmer in aa_kmers:
+ self._methodcall(
+ lib.kmerminhash_add_word, to_bytes(aa_kmer)
+ )
+ elif self.dayhoff:
+ for aa_kmer in aa_kmers:
+ dayhoff_kmer = ''
+ for aa in aa_kmer:
+ data = rustcall(lib.sourmash_aa_to_dayhoff, to_bytes(aa))
+ dayhoff_letter = data.decode('utf-8')
+ dayhoff_kmer += dayhoff_letter
+ self._methodcall(
+ lib.kmerminhash_add_word, to_bytes(dayhoff_kmer)
+ )
+ elif self.hp:
+ for aa_kmer in aa_kmers:
+ hp_kmer = ''
+ for aa in aa_kmer:
+ data = rustcall(lib.sourmash_aa_to_hp, to_bytes(aa))
+ hp_letter = data.decode('utf-8')
+ hp_kmer += hp_letter
+ self._methodcall(
+ lib.kmerminhash_add_word, to_bytes(hp_kmer)
+ )
+ else:
+ raise ValueError("Invalid protein type")
+
+ def is_molecule_type(self, molecule):
+ if self.is_protein and molecule == 'protein':
+ return True
+ elif self.dayhoff and molecule == 'dayhoff':
+ return True
+ elif self.hp and molecule == 'hp':
+ return True
+ elif molecule.upper() == "DNA" and self.is_dna:
+ return True
+
+ return False
diff --git a/sourmash/_minhash.pyx b/sourmash/_minhash.pyx
deleted file mode 100644
index 66d6a357ee..0000000000
--- a/sourmash/_minhash.pyx
+++ /dev/null
@@ -1,512 +0,0 @@
-# -*- coding: UTF-8 -*-
-# cython: language_level=3, c_string_type=str, c_string_encoding=ascii
-
-from __future__ import unicode_literals
-
-from cython.operator cimport dereference as deref, address
-
-from libcpp cimport bool
-from libc.stdint cimport uint32_t
-
-from ._minhash cimport KmerMinHash, KmerMinAbundance, _hash_murmur
-import math
-import copy
-
-
-# default MurmurHash seed
-cdef uint32_t MINHASH_DEFAULT_SEED = 42
-
-
-def get_minhash_default_seed():
- return MINHASH_DEFAULT_SEED
-
-
-# we use the 64-bit hash space of MurmurHash only
-cdef uint64_t MINHASH_MAX_HASH = 2**64 - 1
-
-
-def get_minhash_max_hash():
- return MINHASH_MAX_HASH
-
-
-def get_max_hash_for_scaled(scaled):
- if scaled == 0:
- return 0
- elif scaled == 1:
- return get_minhash_max_hash()
-
- return int(round(get_minhash_max_hash() / scaled, 0))
-
-
-def get_scaled_for_max_hash(max_hash):
- if max_hash == 0:
- return 0
- return int(round(get_minhash_max_hash() / max_hash, 0))
-
-
-cdef bytes to_bytes(s):
- # Allow for strings, bytes or int
- # Single item of byte string = int
- if not isinstance(s, (basestring, bytes, int)):
- raise TypeError("Requires a string-like sequence")
-
- if isinstance(s, unicode):
- s = s.encode('utf-8')
- if isinstance(s, int):
- s = bytes([s])
- return s
-
-
-def hash_murmur(kmer, uint32_t seed=MINHASH_DEFAULT_SEED):
- "hash_murmur(string, [,seed])\n\n"
- "Compute a hash for a string, optionally using a seed (an integer). "
- "The current default seed is returned by hash_seed()."
-
- return _hash_murmur(to_bytes(kmer), seed)
-
-
-def dotproduct(a, b, normalize=True):
- """
- Compute the dot product of two dictionaries {k: v} where v is
- abundance.
- """
-
- if normalize:
- norm_a = math.sqrt(sum([ x*x for x in a.values() ]))
- norm_b = math.sqrt(sum([ x*x for x in b.values() ]))
-
- if norm_a == 0.0 or norm_b == 0.0:
- return 0.0
- else:
- norm_a = 1.0
- norm_b = 1.0
-
- prod = 0.
- for k, abundance in a.items():
- prod += (float(abundance) / norm_a) * (b.get(k, 0) / norm_b)
-
- return prod
-
-
-cdef class MinHash(object):
-
- def __init__(self, unsigned int n, unsigned int ksize,
- bool is_protein=False,
- bool dayhoff=False,
- bool hp=False,
- bool track_abundance=False,
- uint32_t seed=MINHASH_DEFAULT_SEED,
- HashIntoType max_hash=0,
- mins=None, HashIntoType scaled=0):
- self._track_abundance = track_abundance
-
- if max_hash and scaled:
- raise ValueError('cannot set both max_hash and scaled')
- elif scaled:
- max_hash = get_max_hash_for_scaled(scaled)
-
- if max_hash and n:
- raise ValueError('cannot set both n and max_hash')
-
- if not n and not (max_hash or scaled):
- raise ValueError("cannot omit both n and scaled")
-
- cdef KmerMinHash *mh = NULL
- if track_abundance:
- mh = new KmerMinAbundance(n, ksize, is_protein, dayhoff, hp, seed, max_hash)
- else:
- mh = new KmerMinHash(n, ksize, is_protein, dayhoff, hp, seed, max_hash)
-
- self._this.reset(mh)
-
- if mins:
- if track_abundance:
- self.set_abundances(mins)
- else:
- self.add_many(mins)
-
-
- def __copy__(self):
- a = MinHash(deref(self._this).num, deref(self._this).ksize,
- deref(self._this).is_protein, deref(self._this).dayhoff,
- deref(self._this).hp,
- self.track_abundance,
- deref(self._this).seed, deref(self._this).max_hash)
- a.merge(self)
- return a
-
- def __getstate__(self): # enable pickling
- with_abundance = False
- if self.track_abundance:
- with_abundance = True
-
- return (deref(self._this).num,
- deref(self._this).ksize,
- deref(self._this).is_protein,
- deref(self._this).dayhoff,
- deref(self._this).hp,
- self.get_mins(with_abundance=with_abundance),
- None, self.track_abundance, deref(self._this).max_hash,
- deref(self._this).seed)
-
- def __setstate__(self, tup):
- (n, ksize, is_protein, dayhoff, hp, mins, _, track_abundance, max_hash, seed) =\
- tup
-
- self._track_abundance = track_abundance
-
- cdef KmerMinHash *mh = NULL
- if track_abundance:
- mh = new KmerMinAbundance(n, ksize, is_protein, dayhoff, hp, seed, max_hash)
- self._this.reset(mh)
- self.set_abundances(mins)
- else:
- mh = new KmerMinHash(n, ksize, is_protein, dayhoff, hp, seed, max_hash)
- self._this.reset(mh)
- self.add_many(mins)
-
- def __reduce__(self):
- return (MinHash,
- (deref(self._this).num,
- deref(self._this).ksize,
- deref(self._this).is_protein,
- deref(self._this).dayhoff,
- deref(self._this).hp,
- self.track_abundance,
- deref(self._this).seed,
- deref(self._this).max_hash,
- self.get_mins(with_abundance=self.track_abundance),
- 0))
-
- def __richcmp__(self, other, op):
- if op == 2:
- return self.__getstate__() == other.__getstate__()
- raise Exception("undefined comparison")
-
- def copy_and_clear(self):
- a = MinHash(deref(self._this).num, deref(self._this).ksize,
- deref(self._this).is_protein, deref(self._this).dayhoff,
- deref(self._this).hp, self.track_abundance,
- deref(self._this).seed, deref(self._this).max_hash)
- return a
-
- def add_sequence(self, sequence, bool force=False):
- deref(self._this).add_sequence(to_bytes(sequence), force)
-
- def add(self, kmer):
- "Add kmer into sketch."
- self.add_sequence(kmer)
-
- def add_many(self, hashes):
- "Add many hashes in at once."
- for hash in hashes:
- self.add_hash(hash)
-
- def remove_many(self, hashes):
- "Remove many hashes at once."
- for hash in hashes:
- deref(self._this).remove_hash(hash)
-
- def update(self, other):
- "Update this estimator from all the hashes from the other."
- self.add_many(other.get_mins())
-
- def __len__(self):
- return deref(self._this).mins.size()
-
- cpdef get_mins(self, bool with_abundance=False):
- cdef KmerMinAbundance *mh = address(deref(self._this))
- if with_abundance and self.track_abundance:
- return dict(zip(mh.mins, mh.abunds))
- else:
- return deref(self._this).mins
-
- def get_hashes(self):
- return self.get_mins()
-
- def subtract_mins(self, other):
- a = set(self.get_mins())
- b = set(other.get_mins())
- return a - b
-
- @property
- def seed(self):
- return deref(self._this).seed
-
- @property
- def num(self):
- return deref(self._this).num
-
- @property
- def scaled(self):
- if self.max_hash:
- return get_scaled_for_max_hash(self.max_hash)
- return 0
-
- @property
- def is_protein(self):
- return deref(self._this).is_protein
-
- @property
- def dayhoff(self):
- return deref(self._this).dayhoff
-
- @property
- def hp(self):
- return deref(self._this).hp
-
- @property
- def ksize(self):
- return deref(self._this).ksize
-
- @property
- def max_hash(self):
- return deref(self._this).max_hash
-
- @property
- def track_abundance(self):
- return self._track_abundance
-
- @track_abundance.setter
- def track_abundance(self, v):
- cdef KmerMinHash *mh = NULL
-
- if v == self._track_abundance:
- return
-
- if v is True and len(self) != 0:
- raise RuntimeError("Can only set track_abundance=True if the MinHash is empty")
-
- if v:
- mh = new KmerMinAbundance(self.num, self.ksize, self.is_protein,
- self.dayhoff, self.hp, self.seed, self.max_hash)
- self._this.reset(mh)
-
- # At this point, if we are changing from track_abundance=True to False,
- # keep the underlying Abundance MH (to avoid copying data to a new one).
-
- self._track_abundance = v
-
- def add_hash(self, uint64_t h):
- deref(self._this).add_hash(h)
-
- def translate_codon(self, codon):
- return deref(self._this).translate_codon(to_bytes(codon))
-
- def count_common(self, MinHash other):
- return deref(self._this).count_common(deref(other._this))
-
- def downsample_n(self, new_num):
- if self.num and self.num < new_num:
- raise ValueError('new sample n is higher than current sample n')
-
- a = MinHash(new_num, deref(self._this).ksize,
- deref(self._this).is_protein, deref(self._this).dayhoff,
- deref(self._this).hp,
- self.track_abundance,
- deref(self._this).seed, 0)
- if self.track_abundance:
- a.set_abundances(self.get_mins(with_abundance=True))
- else:
- a.add_many(self.get_mins())
-
- return a
-
- def downsample_max_hash(self, *others):
- max_hashes = [ x.max_hash for x in others ]
- new_max_hash = min(self.max_hash, *max_hashes)
- new_scaled = get_scaled_for_max_hash(new_max_hash)
-
- return self.downsample_scaled(new_scaled)
-
- def downsample_scaled(self, new_num):
- if self.num:
- raise ValueError('num != 0 - cannot downsample a standard MinHash')
-
- max_hash = self.max_hash
- if max_hash is None:
- raise ValueError('no max_hash available - cannot downsample')
-
- old_scaled = get_scaled_for_max_hash(self.max_hash)
- if old_scaled > new_num:
- raise ValueError('new scaled {} is lower than current sample scaled {}'.format(new_num, old_scaled))
-
- new_max_hash = get_max_hash_for_scaled(new_num)
-
- a = MinHash(0, deref(self._this).ksize,
- deref(self._this).is_protein, deref(self._this).dayhoff,
- deref(self._this).hp,
- self.track_abundance,
- deref(self._this).seed, new_max_hash)
- if self.track_abundance:
- a.set_abundances(self.get_mins(with_abundance=True))
- else:
- a.add_many(self.get_mins())
-
- return a
-
- def intersection(self, MinHash other):
- if self.num != other.num:
- err = 'must have same num: {} != {}'.format(self.num,
- other.num)
- raise TypeError(err)
- else:
- num = self.num
-
- if self.track_abundance and other.track_abundance:
- combined_mh = new KmerMinAbundance(num,
- deref(self._this).ksize,
- deref(self._this).is_protein,
- deref(self._this).dayhoff,
- deref(self._this).hp,
- deref(self._this).seed,
- deref(self._this).max_hash)
-
- else:
- combined_mh = new KmerMinHash(num,
- deref(self._this).ksize,
- deref(self._this).is_protein,
- deref(self._this).dayhoff,
- deref(self._this).hp,
- deref(self._this).seed,
- deref(self._this).max_hash)
-
- combined_mh.merge(deref(self._this))
- combined_mh.merge(deref(other._this))
-
- common = set(self.get_mins())
- common.intersection_update(other.get_mins())
- common.intersection_update(combined_mh.mins)
-
- size = max(combined_mh.size(), 1)
- del combined_mh
-
- return common, size
-
- def compare(self, MinHash other):
- common, size = self.intersection(other)
- n = len(common)
- return n / size
-
- def jaccard(self, MinHash other):
- return self.compare(other)
-
- def similarity(self, other, ignore_abundance=False):
- """\
- Calculate similarity of two sketches.
-
- If the sketches are not abundance weighted, or ignore_abundance=True,
- compute Jaccard similarity.
-
- If the sketches are abundance weighted, calculate a distance metric
- based on the cosine similarity.
-
- Note, because the term frequencies (tf-idf weights) cannot be negative,
- the angle will never be < 0deg or > 90deg.
-
- See https://en.wikipedia.org/wiki/Cosine_similarity
- """
-
- # if either signature is flat, calculate Jaccard only.
- if not (self.track_abundance and other.track_abundance) or \
- ignore_abundance:
- return self.jaccard(other)
- else:
- # can we merge? if not, raise exception.
- aa = copy.copy(self)
- aa.merge(other)
-
- a = self.get_mins(with_abundance=True)
- b = other.get_mins(with_abundance=True)
-
- prod = dotproduct(a, b)
- prod = min(1.0, prod)
-
- distance = 2*math.acos(prod) / math.pi
- return 1.0 - distance
-
- def contained_by(self, other):
- """\
- Calculate how much of self is contained by other.
- """
- if not len(self):
- return 0.0
- return self.count_common(other) / len(self.get_mins())
-
- def containment_ignore_maxhash(self, MinHash other):
- a = set(self.get_mins())
- if not a:
- return 0.0
-
- b = set(other.get_mins())
-
- overlap = a.intersection(b)
- return float(len(overlap)) / float(len(a))
-
- def __iadd__(self, MinHash other):
- cdef KmerMinAbundance *mh = address(deref(self._this))
- cdef KmerMinAbundance *other_mh = address(deref(other._this))
-
- if self.track_abundance and other.track_abundance:
- deref(mh).merge(deref(other_mh))
- else:
- deref(self._this).merge(deref(other._this))
-
- return self
- merge = __iadd__
-
- cpdef set_abundances(self, dict values):
- if self.track_abundance:
- added = 0
-
- for k, v in sorted(values.items()):
- if not self.max_hash or k <= self.max_hash:
- deref(self._this).mins.push_back(k)
- (address(deref(self._this))).abunds.push_back(v)
- added += 1
- if self.num > 0 and added >= self.num:
- break
- else:
- raise RuntimeError("Use track_abundance=True when constructing "
- "the MinHash to use set_abundances.")
-
- def add_protein(self, sequence):
- cdef uint32_t ksize = deref(self._this).ksize // 3
- if len(sequence) < ksize:
- return
-
- if not deref(self._this).is_protein:
- raise ValueError("cannot add amino acid sequence to DNA MinHash!")
-
- aa_kmers = (sequence[i:i + ksize] for i in range(0, len(sequence) - ksize + 1))
- if not self.dayhoff and not self.hp:
- for aa_kmer in aa_kmers:
- deref(self._this).add_word(to_bytes(aa_kmer))
- elif self.dayhoff:
- for aa_kmer in aa_kmers:
- dayhoff_kmer = ''
- for aa in aa_kmer:
- dayhoff_letter = deref(self._this).aa_to_dayhoff(to_bytes(aa))
- dayhoff_kmer += dayhoff_letter
- # dayhoff_kmer = ''.join( for aa in aa_kmer)
- deref(self._this).add_word(to_bytes(dayhoff_kmer))
- else:
- for aa_kmer in aa_kmers:
- hp_kmer = ''
- for aa in aa_kmer:
- hp_letter = deref(self._this).aa_to_hp(to_bytes(aa))
- hp_kmer += hp_letter
- # hp_kmer = ''.join( for aa in aa_kmer)
- deref(self._this).add_word(to_bytes(hp_kmer))
-
- def is_molecule_type(self, molecule):
- if molecule.upper() == 'DNA' and not self.is_protein:
- return True
- elif self.is_protein and molecule == 'protein' and not any((self.dayhoff, self.hp)):
- return True
- elif self.dayhoff and molecule == 'dayhoff':
- return True
- elif self.hp and molecule == 'hp':
- return True
-
- return False
diff --git a/sourmash/exceptions.py b/sourmash/exceptions.py
new file mode 100644
index 0000000000..8254a21762
--- /dev/null
+++ b/sourmash/exceptions.py
@@ -0,0 +1,44 @@
+from ._compat import implements_to_string
+from ._lowlevel import lib
+
+
+# Public names of this module; _make_exceptions() appends the names of the
+# exception classes it generates.
+__all__ = ['SourmashError']
+# Maps an integer error code read from ``lib`` to the Python exception class
+# used for it; filled in by _make_exceptions().
+exceptions_by_code = {}
+
+
+@implements_to_string
+class SourmashError(Exception):
+    """Base class of the exceptions generated from the
+    ``SOURMASH_ERROR_CODE_*`` constants of ``lib``.
+
+    Attributes:
+        message: the text passed to the constructor.
+        rust_info: optional extra detail appended to ``str(exc)``; set to
+            None by the constructor (presumably filled in by the code that
+            converts native errors -- not visible here).
+    """
+    # Numeric error code; None on the base class, overridden on the
+    # subclasses created by _make_exceptions().
+    code = None
+
+    def __init__(self, msg):
+        # Forward msg to the base class so that ``args`` is populated:
+        # with empty ``args``, repr() hides the message and pickle/copy
+        # rebuild the exception by calling the class with no arguments,
+        # which fails because ``msg`` is required.
+        Exception.__init__(self, msg)
+        self.message = msg
+        self.rust_info = None
+
+    def __str__(self):
+        """Return the message, followed by ``rust_info`` when it is set."""
+        rv = self.message
+        if self.rust_info is not None:
+            return u'%s\n\n%s' % (rv, self.rust_info)
+        return rv
+
+
+def _make_exceptions():
+    """Register a Python exception for every error code exposed by ``lib``.
+
+    Each attribute of ``lib`` named ``SOURMASH_ERROR_CODE_<NAME>`` is read as
+    an integer code:
+
+    * codes below 100 or above 10000 get a dedicated ``SourmashError``
+      subclass named after ``<NAME>`` in CamelCase, which is stored in this
+      module's globals, appended to ``__all__`` and recorded in
+      ``exceptions_by_code``;
+    * every other code is mapped to the builtin ``ValueError``
+      (NOTE(review): presumably these are user-input errors -- confirm
+      against the native error enum).
+    """
+    prefix = 'SOURMASH_ERROR_CODE_'
+    for attr in dir(lib):
+        if not attr.startswith(prefix):
+            continue
+
+        code = getattr(lib, attr)
+        if 100 <= code <= 10000:
+            exceptions_by_code[code] = ValueError
+            continue
+
+        class Exc(SourmashError):
+            pass
+
+        Exc.__name__ = attr[len(prefix):].title().replace('_', '')
+        # Keep __qualname__ in sync with __name__: otherwise it stays
+        # "_make_exceptions.<locals>.Exc", which is what the traceback
+        # module prints and what pickle uses to look the class up.
+        Exc.__qualname__ = Exc.__name__
+        Exc.code = code
+        globals()[Exc.__name__] = Exc
+        exceptions_by_code[code] = Exc
+        __all__.append(Exc.__name__)
+
+# Build the exception classes and fill exceptions_by_code once, when this
+# module is imported.
+_make_exceptions()
diff --git a/sourmash/kmer_min_hash.hh b/sourmash/kmer_min_hash.hh
deleted file mode 100644
index 47decc8105..0000000000
--- a/sourmash/kmer_min_hash.hh
+++ /dev/null
@@ -1,648 +0,0 @@
-#ifndef KMER_MIN_HASH_HH
-#define KMER_MIN_HASH_HH
-
-#include
-#include
-#include
-#include