Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update python package and builder to tiledbsoma 1.0.0rc2 #227

Merged
merged 9 commits into from
Feb 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion api/python/cell_census/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ dependencies= [
# of TileDB on-disk storage format. Make sure this doesn't fall behind the builder's tiledbsoma version.
# NOTE: tiledb is also a requirement of the API, but tiledbsoma also has a tiledb dependency, so just use
# the same version here
"tiledbsoma==1.0.0rc1",
"tiledbsoma==1.0.0rc2",
"typing_extensions",
"s3fs",
"scikit-misc",
Expand Down
4 changes: 2 additions & 2 deletions tools/cell_census_builder/consolidate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import tiledbsoma as soma

from .globals import SOMA_TileDB_Context
from .globals import DEFAULT_TILEDB_CONFIG, SOMA_TileDB_Context
from .mp import create_process_pool_executor, log_on_broken_process_pool


Expand Down Expand Up @@ -65,7 +65,7 @@ def consolidate_tiledb_object(uri: str) -> str:
import tiledb

logging.info(f"Consolidate: start uri {uri}")
tiledb.consolidate(uri, config=tiledb.Config({"sm.consolidation.buffer_size": 1 * 1024**3}))
tiledb.consolidate(uri, config=tiledb.Config(DEFAULT_TILEDB_CONFIG))
tiledb.vacuum(uri)
logging.info(f"Consolidate: end uri {uri}")
return uri
49 changes: 14 additions & 35 deletions tools/cell_census_builder/globals.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import time
import functools
from typing import Set

import pyarrow as pa
Expand Down Expand Up @@ -210,46 +210,25 @@
FEATURE_REFERENCE_IGNORE: Set[str] = set()


# Baseline TileDB settings for the builder. The same mapping is handed to
# tiledb.Ctx(...) when the builder's context is created and to
# tiledb.Config(...) when arrays are consolidated, so both paths agree.
DEFAULT_TILEDB_CONFIG = {
    # 512 MiB initial buffers (512 * 1024**2 bytes).
    "py.init_buffer_bytes": 512 << 20,
    # TileDB takes this flag as the string "true", not a Python bool.
    "py.deduplicate": "true",
    # Same 512 MiB initial buffer size for the "soma." setting.
    "soma.init_buffer_bytes": 512 << 20,
    # 1024**3 (1 GiB, assuming the unit is bytes -- TODO confirm).
    "sm.consolidation.buffer_size": 1 << 30,
}


"""
Singletons used throughout the package
"""

# Global SOMATileDBContext
_SOMA_TileDB_Context: soma.options.SOMATileDBContext = None

# Global TileDB context
_TileDB_Ctx: tiledb.Ctx = None

# The logical timestamp at which all builder data should be recorded
WRITE_TIMESTAMP = int(time.time() * 1000)

# Using "end of time" for read_timestamp means that all writes are visible, no matter what write timestamp was used
END_OF_TIME = 0xFFFFFFFFFFFFFFFF


@functools.cache
def SOMA_TileDB_Context() -> soma.options.SOMATileDBContext:
global _SOMA_TileDB_Context
if _SOMA_TileDB_Context is None or _SOMA_TileDB_Context != TileDB_Ctx():
# Set write timestamp to "now", so that we use consistent timestamps across all writes (mostly for aesthetic
# reasons). Set read timestamps to be same as write timestamp so that post-build validation reads can "see"
# the writes. Without setting read timestamp explicitly, the read timestamp would default to a time that
# prevents seeing the builder's writes.
_SOMA_TileDB_Context = soma.options.SOMATileDBContext(
tiledb_ctx=TileDB_Ctx(),
# TODO: Setting an explicit write timestamp causes later reads to fail!
# write_timestamp=write_timestamp,
# TODO: We *should* be able to set this equal to WRITE_TIMESTAMP, but as specifying a write_timestamp is
# problematic, we must use "end of time" for now
read_timestamp=END_OF_TIME,
)
return _SOMA_TileDB_Context
return soma.options.SOMATileDBContext(tiledb_ctx=TileDB_Ctx(), timestamp=None)


@functools.cache
def TileDB_Ctx() -> tiledb.Ctx:
return _TileDB_Ctx


def set_tiledb_ctx(ctx: tiledb.Ctx) -> None:
global _TileDB_Ctx, _SOMA_TileDB_Context
_TileDB_Ctx = ctx
_SOMA_TileDB_Context = None
return tiledb.Ctx(DEFAULT_TILEDB_CONFIG)
18 changes: 0 additions & 18 deletions tools/cell_census_builder/mp.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,6 @@
import os
from typing import Optional, cast

import tiledbsoma as soma

from .globals import set_tiledb_ctx

if soma.get_storage_engine() == "tiledb":
import tiledb


def cpu_count() -> int:
"""Sigh, os.cpu_count() returns None if the number of CPUs is "undetermined"."""
Expand All @@ -29,17 +22,6 @@ def process_initializer(verbose: int = 0) -> None:
)
logging.captureWarnings(True)

if soma.get_storage_engine() == "tiledb":
set_tiledb_ctx(
tiledb.Ctx(
{
"py.init_buffer_bytes": 512 * 1024**2,
"py.deduplicate": "true",
"soma.init_buffer_bytes": 512 * 1024**2,
}
)
)


def create_process_pool_executor(
args: argparse.Namespace, max_workers: Optional[int] = None
Expand Down
12 changes: 11 additions & 1 deletion tools/cell_census_builder/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,10 +447,20 @@ def validate_manifest_contents(assets_path: str, datasets: List[Dataset]) -> boo

def validate_consolidation(soma_path: str, experiment_builders: List[ExperimentBuilder]) -> bool:
"""Verify that obs, var and X layers are all fully consolidated & vacuumed"""

def is_empty_tiledb_array(uri: str) -> bool:
with tiledb.open(uri) as A:
return A.nonempty_domain() is None

with soma.Collection.open(soma_path, context=SOMA_TileDB_Context()) as census:
consolidated_uris = list_uris_to_consolidate(census)
for uri in consolidated_uris:
assert len(tiledb.array_fragments(uri)) == 1, f"{uri} has not been fully consolidated & vacuumed"
# An empty array must have zero fragments; a non-empty array must have
# exactly one fragment.
assert (len(tiledb.array_fragments(uri)) == 1) or (
len(tiledb.array_fragments(uri)) == 0 and is_empty_tiledb_array(uri)
), f"{uri} has not been fully consolidated & vacuumed"

return True


Expand Down
2 changes: 1 addition & 1 deletion tools/scripts/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ numpy
# NOTE: You can also build this dependency from source, per ./notebooks/README.md.
# NOTE: The builder's version of tiledbsoma MUST be <= the API's tiledbsoma version, to ensure reader compatibility
# of TileDB on-disk storage format
tiledbsoma==1.0rc0
tiledbsoma==1.0rc2
# NOTE: tiledb is also a requirement of the builder, but builder must not use a tiledb version that is ahead of
# tiledbsoma's tiledb version (so just use the same version)
# tiledb
Expand Down