diff --git a/tools/cell_census_builder/__main__.py b/tools/cell_census_builder/__main__.py
index a81639b85..2056c36fe 100644
--- a/tools/cell_census_builder/__main__.py
+++ b/tools/cell_census_builder/__main__.py
@@ -295,7 +295,7 @@ def build_step4_populate_X_layers(
     populate_X_layers(assets_path, filtered_datasets, experiment_builders, args)
 
     for eb in reopen_experiment_builders(experiment_builders):
-        eb.populate_presence_matrix()
+        eb.populate_presence_matrix(filtered_datasets)
 
     logging.info("Build step 4 - Populate X layers - finished")
 
diff --git a/tools/cell_census_builder/experiment_builder.py b/tools/cell_census_builder/experiment_builder.py
index a3645f065..763920f24 100644
--- a/tools/cell_census_builder/experiment_builder.py
+++ b/tools/cell_census_builder/experiment_builder.py
@@ -254,7 +254,7 @@ def create_X_with_layers(self) -> None:
                 platform_config=CENSUS_X_LAYERS_PLATFORM_CONFIG[layer_name],
             )
 
-    def populate_presence_matrix(self) -> None:
+    def populate_presence_matrix(self, datasets: List[Dataset]) -> None:
         """
         Save presence matrix per Experiment
         """
@@ -265,7 +265,7 @@ def populate_presence_matrix(self) -> None:
        # sanity check
         assert len(self.presence) == self.n_datasets
 
-        max_dataset_joinid = max(self.presence.keys())
+        max_dataset_joinid = max(d.soma_joinid for d in datasets)
 
         # LIL is fast way to create spmatrix
         pm = sparse.lil_array((max_dataset_joinid + 1, self.n_var), dtype=bool)
diff --git a/tools/cell_census_builder/tests/test_builder.py b/tools/cell_census_builder/tests/test_builder.py
index 27ec121c8..70fe53b7f 100644
--- a/tools/cell_census_builder/tests/test_builder.py
+++ b/tools/cell_census_builder/tests/test_builder.py
@@ -82,6 +82,22 @@ def test_base_builder_creation(
     # Presence matrix should exist with the correct dimensions
     for exp_name in ["homo_sapiens", "mus_musculus"]:
         fdpm = census[CENSUS_DATA_NAME][exp_name].ms[MEASUREMENT_RNA_NAME][FEATURE_DATASET_PRESENCE_MATRIX_NAME]
+        fdpm_matrix = fdpm.read().coos().concat()
+
+        # The first dimension of the presence matrix should map to the soma_joinids of the returned datasets
+        dim_0 = fdpm_matrix.to_scipy().row
+        assert all(dim_0 >= 0)
+        assert all(dim_0 <= max(returned_datasets.soma_joinid))
+        assert fdpm_matrix.shape[0] == max(returned_datasets.soma_joinid) + 1
+
+        # All rows indexed by a Dataframe's soma_joinid that does not belong to the experiment contain all zeros
+        dense_pm = fdpm_matrix.to_scipy().todense()
+        for i, dataset in returned_datasets.iterrows():
+            if dataset["dataset_id"].startswith(exp_name):
+                assert np.count_nonzero(dense_pm[i]) > 0
+            else:
+                assert np.count_nonzero(dense_pm[i]) == 0
+
         fdpm_df = fdpm.read().tables().concat().to_pandas()
         n_datasets = fdpm_df["soma_dim_0"].nunique()
         n_features = fdpm_df["soma_dim_1"].nunique()