Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix presence matrix wrong shape #236

Merged
merged 2 commits into from
Mar 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tools/cell_census_builder/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ def build_step4_populate_X_layers(
populate_X_layers(assets_path, filtered_datasets, experiment_builders, args)

for eb in reopen_experiment_builders(experiment_builders):
eb.populate_presence_matrix()
eb.populate_presence_matrix(filtered_datasets)

logging.info("Build step 4 - Populate X layers - finished")

Expand Down
4 changes: 2 additions & 2 deletions tools/cell_census_builder/experiment_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ def create_X_with_layers(self) -> None:
platform_config=CENSUS_X_LAYERS_PLATFORM_CONFIG[layer_name],
)

def populate_presence_matrix(self) -> None:
def populate_presence_matrix(self, datasets: List[Dataset]) -> None:
"""
Save presence matrix per Experiment
"""
Expand All @@ -265,7 +265,7 @@ def populate_presence_matrix(self) -> None:
# sanity check
assert len(self.presence) == self.n_datasets

max_dataset_joinid = max(self.presence.keys())
max_dataset_joinid = max(d.soma_joinid for d in datasets)
Copy link
Contributor

@bkmartinjr bkmartinjr Mar 2, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is wrong - it is going to add datasets which do not exist in the organism's experiment.

Edit: my misunderstanding of the defect being fixed. This change looks correct to me.


# LIL is a fast way to incrementally build a sparse matrix
pm = sparse.lil_array((max_dataset_joinid + 1, self.n_var), dtype=bool)
Expand Down
16 changes: 16 additions & 0 deletions tools/cell_census_builder/tests/test_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,22 @@ def test_base_builder_creation(
# Presence matrix should exist with the correct dimensions
for exp_name in ["homo_sapiens", "mus_musculus"]:
fdpm = census[CENSUS_DATA_NAME][exp_name].ms[MEASUREMENT_RNA_NAME][FEATURE_DATASET_PRESENCE_MATRIX_NAME]
fdpm_matrix = fdpm.read().coos().concat()

# The first dimension of the presence matrix should map to the soma_joinids of the returned datasets
dim_0 = fdpm_matrix.to_scipy().row
assert all(dim_0 >= 0)
assert all(dim_0 <= max(returned_datasets.soma_joinid))
assert fdpm_matrix.shape[0] == max(returned_datasets.soma_joinid) + 1

# Every row indexed by the soma_joinid of a dataset that does not belong to the experiment contains all zeros
dense_pm = fdpm_matrix.to_scipy().todense()
for i, dataset in returned_datasets.iterrows():
if dataset["dataset_id"].startswith(exp_name):
assert np.count_nonzero(dense_pm[i]) > 0
else:
assert np.count_nonzero(dense_pm[i]) == 0

fdpm_df = fdpm.read().tables().concat().to_pandas()
n_datasets = fdpm_df["soma_dim_0"].nunique()
n_features = fdpm_df["soma_dim_1"].nunique()
Expand Down