Skip to content

Commit

Permalink
fix presence matrix wrong shape (#236)
Browse files Browse the repository at this point in the history
* fix presence matrix wrong shape

* Improved assertions
  • Loading branch information
ebezzi authored Mar 2, 2023
1 parent 2c7a84d commit 3b5d3b3
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 3 deletions.
2 changes: 1 addition & 1 deletion tools/cell_census_builder/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ def build_step4_populate_X_layers(
populate_X_layers(assets_path, filtered_datasets, experiment_builders, args)

for eb in reopen_experiment_builders(experiment_builders):
eb.populate_presence_matrix()
eb.populate_presence_matrix(filtered_datasets)

logging.info("Build step 4 - Populate X layers - finished")

Expand Down
4 changes: 2 additions & 2 deletions tools/cell_census_builder/experiment_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ def create_X_with_layers(self) -> None:
platform_config=CENSUS_X_LAYERS_PLATFORM_CONFIG[layer_name],
)

def populate_presence_matrix(self) -> None:
def populate_presence_matrix(self, datasets: List[Dataset]) -> None:
"""
Save presence matrix per Experiment
"""
Expand All @@ -265,7 +265,7 @@ def populate_presence_matrix(self) -> None:
# sanity check
assert len(self.presence) == self.n_datasets

max_dataset_joinid = max(self.presence.keys())
max_dataset_joinid = max(d.soma_joinid for d in datasets)

# LIL is a fast way to construct a sparse matrix
pm = sparse.lil_array((max_dataset_joinid + 1, self.n_var), dtype=bool)
Expand Down
16 changes: 16 additions & 0 deletions tools/cell_census_builder/tests/test_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,22 @@ def test_base_builder_creation(
# Presence matrix should exist with the correct dimensions
for exp_name in ["homo_sapiens", "mus_musculus"]:
fdpm = census[CENSUS_DATA_NAME][exp_name].ms[MEASUREMENT_RNA_NAME][FEATURE_DATASET_PRESENCE_MATRIX_NAME]
fdpm_matrix = fdpm.read().coos().concat()

# The first dimension of the presence matrix should map to the soma_joinids of the returned datasets
dim_0 = fdpm_matrix.to_scipy().row
assert all(dim_0 >= 0)
assert all(dim_0 <= max(returned_datasets.soma_joinid))
assert fdpm_matrix.shape[0] == max(returned_datasets.soma_joinid) + 1

# All rows indexed by the soma_joinid of a dataset that does not belong to the experiment must contain only zeros
dense_pm = fdpm_matrix.to_scipy().todense()
for i, dataset in returned_datasets.iterrows():
if dataset["dataset_id"].startswith(exp_name):
assert np.count_nonzero(dense_pm[i]) > 0
else:
assert np.count_nonzero(dense_pm[i]) == 0

fdpm_df = fdpm.read().tables().concat().to_pandas()
n_datasets = fdpm_df["soma_dim_0"].nunique()
n_features = fdpm_df["soma_dim_1"].nunique()
Expand Down

0 comments on commit 3b5d3b3

Please sign in to comment.