Skip to content

Commit

Permalink
Moving loading and save sigs to rust
Browse files Browse the repository at this point in the history
move json parsing and init to rust
working on loading sigs

55 failing. Now it's failing because the SBT index is saving all signatures
(instead of only the ones that were used to build the tree).
This was actually a feature (see #198) but it broke the SBT code
(it wasn't ready for that!)
  • Loading branch information
luizirber committed Aug 24, 2019
1 parent b7c60d8 commit f999324
Show file tree
Hide file tree
Showing 17 changed files with 263 additions and 565 deletions.
2 changes: 2 additions & 0 deletions include/sourmash.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ uint64_t kmerminhash_seed(KmerMinHash *ptr);

bool kmerminhash_track_abundance(KmerMinHash *ptr);

void kmerminhash_disable_abundance(KmerMinHash *ptr);

bool signature_eq(Signature *ptr, Signature *other);

KmerMinHash *signature_first_mh(Signature *ptr);
Expand Down
7 changes: 5 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,11 @@ def build_native(spec):
"packages": find_packages(exclude=["tests", "benchmarks"]),
"zip_safe": False,
"platforms": "any",
"entry_points": {"console_scripts": ["sourmash = sourmash.__main__:main"]},
"install_requires": ["screed>=0.9", "ijson", "khmer>=2.1", "milksnake"],
"entry_points": {'console_scripts': [
'sourmash = sourmash.__main__:main'
]
},
"install_requires": ["screed>=0.9", "khmer>=2.1", 'milksnake'],
"setup_requires": [
"setuptools>=38.6.0",
"milksnake",
Expand Down
9 changes: 9 additions & 0 deletions sourmash/_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,12 @@ def implements_to_string(cls):
itervalues = lambda x: x.values()
NUL = 0
implements_to_string = lambda x: x


def to_bytes(s):
    """Return ``s`` as a byte string, encoding text as UTF-8.

    Byte strings are returned unchanged; any other value matching
    ``string_types`` is encoded with UTF-8.

    Raises:
        TypeError: if ``s`` is neither ``bytes`` nor one of ``string_types``.
    """
    # Handle bytes before consulting ``string_types``.  On an interpreter
    # where ``bytes`` is itself one of ``string_types`` (presumably Python 2;
    # confirm against the definition earlier in this module), calling
    # ``.encode`` on a byte string triggers an implicit ASCII decode and
    # fails on non-ASCII data.
    if isinstance(s, bytes):
        return s

    if not isinstance(s, string_types):
        raise TypeError("Requires a string-like sequence")

    return s.encode('utf-8')
16 changes: 14 additions & 2 deletions sourmash/_minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,6 @@ def __init__(
mins=None,
scaled=0,
):
self.track_abundance = track_abundance

if max_hash and scaled:
raise ValueError("cannot set both max_hash and scaled")
elif scaled:
Expand Down Expand Up @@ -251,6 +249,20 @@ def ksize(self):
def max_hash(self):
return self._methodcall(lib.kmerminhash_max_hash)

@property
def track_abundance(self):
    """Return the result of ``lib.kmerminhash_track_abundance`` for this object."""
    return self._methodcall(lib.kmerminhash_track_abundance)

@track_abundance.setter
def track_abundance(self, b):
    """Change the abundance-tracking setting.

    Only the literal ``False`` disables tracking (via
    ``lib.kmerminhash_disable_abundance``); every other value is treated
    as a request to enable it, which is not supported yet.

    Raises:
        ValueError: if a non-``False`` value is given and the object
            already holds elements.
        NotImplementedError: if a non-``False`` value is given and the
            object is empty.
    """
    if b is False:
        self._methodcall(lib.kmerminhash_disable_abundance)
    elif len(self) > 0:
        raise ValueError("Can't change abundance after elements were inserted")
    else:
        # TODO: create a new one with abundance set?
        # ``NotImplemented`` is a non-callable singleton meant for binary
        # operator dispatch; calling it raised TypeError.  The exception
        # class for unsupported operations is ``NotImplementedError``.
        raise NotImplementedError(
            "Enabling abundance tracking is not implemented yet"
        )

def add_hash(self, h):
    """Hand ``h`` to ``lib.kmerminhash_add_hash`` through ``self._methodcall``.

    Returns whatever ``self._methodcall`` returns.
    """
    outcome = self._methodcall(lib.kmerminhash_add_hash, h)
    return outcome

Expand Down
3 changes: 2 additions & 1 deletion sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -1324,14 +1324,14 @@ def watch(args):
ksize = tree_mh.ksize

E = MinHash(ksize=ksize, n=args.num_hashes, is_protein=is_protein)
streamsig = sig.SourmashSignature(E, filename='stdin', name=args.name)

notify('Computing signature for k={}, {} from stdin', ksize, moltype)

def do_search():
search_fn = SearchMinHashesFindBest().search

results = []
streamsig = sig.SourmashSignature(E, filename='stdin', name=args.name)
for leaf in tree.find(search_fn, streamsig, args.threshold):
results.append((streamsig.similarity(leaf.data),
leaf.data))
Expand Down Expand Up @@ -1369,6 +1369,7 @@ def do_search():

if args.output:
notify('saving signature to {}', args.output.name)
streamsig = sig.SourmashSignature(E, filename='stdin', name=args.name)
sig.save_signatures([streamsig], args.output)


Expand Down
4 changes: 2 additions & 2 deletions sourmash/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def find_best(dblist, query, remainder):


# construct a new query that doesn't have the max_hash attribute set.
query = build_new_query([], orig_query)
query = build_new_query([], query)

cmp_scaled = 0
remainder = set()
Expand Down Expand Up @@ -264,7 +264,7 @@ def find_best(dblist, query, remainder):
leaf=best_leaf)

# construct a new query, minus the previous one.
query = build_new_query(found_mins, orig_query, cmp_scaled)
query = build_new_query(found_mins, query, cmp_scaled)
query_mins -= set(found_mins)

weighted_missed = sum((orig_abunds[k] for k in query_mins)) \
Expand Down
11 changes: 7 additions & 4 deletions sourmash/sig/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,8 @@ def describe(args):
with_abundance = 1
md5 = sig.md5sum()
name = sig.name()
filename = sig.d.get('filename', '')
license = sig.d['license']
filename = sig.filename
license = sig.license

if w:
w.writerow(locals())
Expand Down Expand Up @@ -266,10 +266,13 @@ def merge(args):
_flatten(mh)

try:
sigobj_mh = sigobj.minhash
if not args.flatten:
_check_abundance_compatibility(first_sig, sigobj)
else:
_flatten(sigobj_mh)

mh.merge(sigobj.minhash)
mh.merge(sigobj_mh)
except:
error("ERROR when merging signature '{}' ({}) from file {}",
sigobj.name(), sigobj.md5sum()[:8], sigfile)
Expand Down Expand Up @@ -428,7 +431,7 @@ def rename(args):
select_moltype=moltype)

for sigobj in siglist:
sigobj.d['name'] = args.name
sigobj._name = args.name
outlist.append(sigobj)

if args.output:
Expand Down
Loading

0 comments on commit f999324

Please sign in to comment.