census cell dup check #569
Merged
Commits (changes shown from 8 of 10 commits):
- e53e030 initial commit of notebook
- bf579d9 fix nb pre-commit
- ff46a34 re-run notebook
- be7fb5d Merge branch 'main' into bkmartinjr/primary-qc-nb
- fe4dfaa add readme
- 62fc698 PR feedback
- 059e836 add csc
- 955fa17 add dataset overlap
- 08adc15 fix typo
- 5fa29b5 Merge branch 'main' into bkmartinjr/primary-qc-nb
Files changed

@@ -0,0 +1,6 @@
```markdown
To install and run from source:
* clone the repo and/or create a copy of this directory
* create a Python venv, and install the packages listed in requirements.txt, e.g. `pip install -r requirements.txt`
* open the notebook, set the config variables, and run all cells
```
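The last step above assumes a small set of user-set config variables. As a rough sketch only, such a config cell might look like the following; the variable names and values here are hypothetical, not the notebook's actual settings:

```python
# Hypothetical config cell -- names and values are illustrative only;
# use the variables actually defined at the top of the notebook.
census_version = "latest"         # which CELLxGENE Census release to open
experiment_name = "homo_sapiens"  # which Experiment (organism) to check
X_layer = "raw"                   # X layer to scan for duplicate cells
```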
Empty file.
@@ -0,0 +1,133 @@
```python
from concurrent import futures
from typing import Generator, Iterator, Literal, Tuple, TypeVar

import numpy as np
import numpy.typing as npt
import pandas as pd
import scipy.sparse as sparse
import tiledbsoma as soma

"""
This might make a good addition to soma.ExperimentAxisQuery.

TODO: this implementation does not leverage the ExperimentAxisQuery internals
for fast CSR creation, and instead uses the (slower) scipy.sparse implementation.
If integrated, it should be migrated to the much faster implementation.
"""

# Each iteration yields ((obs_coords, var_coords), sparse matrix).
_RT = Tuple[Tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]], sparse.spmatrix]


def X_sparse_iter(
    query: soma.ExperimentAxisQuery,
    X_name: str = "raw",
    row_stride: int = 2**16,  # number of rows per chunk
    fmt: Literal["csr", "csc"] = "csr",  # the resulting sparse format
    be_eager: bool = True,
) -> Iterator[_RT]:
    """
    Return a row-wise iterator over the user-specified X SparseNDArray, yielding at each
    iteration step:
    * obs coords (coordinates)
    * var coords (coordinates)
    * X contents as a SciPy csr_matrix or csc_matrix

    The coordinates and X matrix chunks are indexed positionally, i.e. for any given
    value X[i, j] in the matrix, the original soma_joinids (aka soma_dim_0 and
    soma_dim_1) are present in obs_coords[i] and var_coords[j].

    Args:
        query:
            A SOMA ExperimentAxisQuery defining the coordinates over which the
            iterator will read.
        X_name:
            The name of the X layer.
        row_stride:
            The number of rows to return in each step.
        fmt:
            The SciPy sparse array layout. Supported: 'csc' and 'csr' (default).
        be_eager:
            If true, use multiple threads to parallelize reading and processing.
            This improves speed, at the cost of some additional memory use.

    Returns:
        An iterator which yields tuples of the form:
            ((obs_coords, var_coords), SciPy sparse matrix)

    Lifecycle:
        experimental
    """
    if fmt == "csr":
        fmt_ctor = sparse.csr_matrix
    elif fmt == "csc":
        fmt_ctor = sparse.csc_matrix
    else:
        raise ValueError("fmt must be 'csr' or 'csc'")

    # Lazily partition the obs coordinates into chunks of row_stride along the first dimension
    obs_coords = query.obs_joinids().to_numpy()
    obs_coord_chunker = (obs_coords[i : i + row_stride] for i in range(0, len(obs_coords), row_stride))

    # Lazy read into an Arrow Table. Yields (coords, Arrow Table)
    X = query._ms.X[X_name]
    var_coords = query.var_joinids().to_numpy()
    table_reader = (
        (
            (obs_coords_chunk, var_coords),
            X.read(coords=(obs_coords_chunk, var_coords)).tables().concat(),
        )
        for obs_coords_chunk in obs_coord_chunker
    )
    if be_eager:
        table_reader = (t for t in _EagerIterator(table_reader, query._threadpool))

    # Lazy reindex of obs coordinates. Yields coords and (data, i, j) as numpy ndarrays
    coo_reindexer = (
        (
            (obs_coords_chunk, var_coords),
            (
                tbl["soma_data"].to_numpy(),
                pd.Index(obs_coords_chunk).get_indexer(tbl["soma_dim_0"].to_numpy()),  # type: ignore[no-untyped-call]
                query.indexer.by_var(tbl["soma_dim_1"].to_numpy()),
            ),
        )
        for (obs_coords_chunk, var_coords), tbl in table_reader
    )
    if be_eager:
        coo_reindexer = (t for t in _EagerIterator(coo_reindexer, query._threadpool))

    # Lazily convert the COO data to a SciPy sparse matrix in the requested format
    csr_reader: Generator[_RT, None, None] = (
        (
            (obs_coords_chunk, var_coords),
            fmt_ctor(
                sparse.coo_matrix(
                    (data, (i, j)),
                    shape=(len(obs_coords_chunk), query.n_vars),
                )
            ),
        )
        for (obs_coords_chunk, var_coords), (data, i, j) in coo_reindexer
    )
    if be_eager:
        csr_reader = (t for t in _EagerIterator(csr_reader, query._threadpool))

    yield from csr_reader
```
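As a usage sketch (not part of the PR): given an open SOMA Experiment, the iterator can be consumed as below. The experiment URI, measurement name, and value filter are illustrative assumptions, not values taken from this change.

```python
# Hypothetical usage -- the URI, measurement name, and filter are
# illustrative, not taken from the PR.
import tiledbsoma as soma

with soma.Experiment.open("path/to/census/homo_sapiens") as exp:
    query = exp.axis_query(
        measurement_name="RNA",
        obs_query=soma.AxisQuery(value_filter="is_primary_data == True"),
    )
    for (obs_coords, var_coords), X_chunk in X_sparse_iter(query, X_name="raw"):
        # Positional indexing: X_chunk[i, j] is the value for the cell with
        # soma_joinid obs_coords[i] and the feature with soma_joinid var_coords[j].
        row_sums = X_chunk.sum(axis=1)
```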
The file closes with the small prefetching helper used by each stage above:

```python
_T = TypeVar("_T")


class _EagerIterator(Iterator[_T]):
    """Wrap an iterator so the next element is always being computed on the
    supplied thread pool while the caller processes the current one."""

    def __init__(self, iterator: Iterator[_T], pool: futures.Executor):
        super().__init__()
        self.iterator = iterator
        self._pool = pool
        # Begin fetching the first element immediately.
        self._future = self._pool.submit(self.iterator.__next__)

    def __next__(self) -> _T:
        # Block on the in-flight element, then immediately schedule the next
        # one before handing the result back to the caller.
        res = self._future.result()
        self._future = self._pool.submit(self.iterator.__next__)
        return res
```
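To make the prefetch behavior concrete, here is a small self-contained sketch (not from the PR; the sleeps simulate slow I/O and processing) showing how the helper overlaps producing the next chunk with consuming the current one:

```python
import time
from concurrent import futures


def slow_producer():
    for i in range(3):
        time.sleep(1.0)  # simulate a slow chunk read
        yield i


pool = futures.ThreadPoolExecutor(max_workers=1)
start = time.time()
for chunk in _EagerIterator(slow_producer(), pool):
    time.sleep(1.0)  # simulate per-chunk processing
    print(chunk, f"{time.time() - start:.1f}s")

# Reads of chunk k+1 overlap processing of chunk k, so the loop finishes in
# roughly 4 seconds instead of the ~6 seconds a fully serial loop would take.
```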
Review comment: the reader thanks you for the explicit type hint!