Merge pull request #138 from saezlab/dev

1.4.0
saezlab · Sep 2, 2024 · 058a768 · 058a768
2 parents 1746c5b + 3b14ecc
commit 058a768
Show file tree

Hide file tree

Showing 16 changed files with 1,297 additions and 1,291 deletions.
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.3.0
+current_version = 1.4.0
 commit = True
 tag = True
 files = pyproject.toml liana/__init__.py

diff --git a/README.md b/README.md
@@ -14,7 +14,7 @@ LIANA+ is a scalable framework that integrates and extends existing methods and
 
 ## Development & Contributions
 
-We welcome suggestions, ideas, and contributions! Please use do not hesitate to contact us, or use the issues or the [LIANA+ Development project](https://github.com/orgs/saezlab/projects/16) to make suggestions.
+We welcome suggestions, ideas, and contributions! Please do not hesitate to contact us, open issues, and check the [contributions guide](https://liana-py.readthedocs.io/en/latest/contributing.html).
 
 ## Vignettes
 A set of extensive vignettes can be found in the [LIANA+ documentation](https://liana-py.readthedocs.io/en/latest/).
@@ -46,7 +46,7 @@ For further information please check LIANA's [API documentation](https://liana-p
 
 ## Cite LIANA+:
 
-Dimitrov D., Schäfer P.S.L, Farr E., Rodriguez Mier P., Lobentanzer S., Dugourd A., Tanevski J., Ramirez Flores R.O. and Saez-Rodriguez J. 2023 LIANA+: an all-in-one cell-cell communication framework. BioRxiv. https://www.biorxiv.org/content/10.1101/2023.08.19.553863v1
+Dimitrov D., Schäfer P.S.L, Farr E., Rodriguez Mier P., Lobentanzer S., Badia-i-Mompel P., Dugourd A., Tanevski J., Ramirez Flores R.O. and Saez-Rodriguez J. LIANA+ provides an all-in-one framework for cell–cell communication inference. Nat Cell Biol (2024). https://doi.org/10.1038/s41556-024-01469-w
 
 Dimitrov, D., Türei, D., Garrido-Rodriguez M., Burmedi P.L., Nagai, J.S., Boys, C., Flores, R.O.R., Kim, H., Szalai, B., Costa, I.G., Valdeolivas, A., Dugourd, A. and Saez-Rodriguez, J. Comparison of methods and resources for cell-cell communication inference from single-cell RNA-Seq data. Nat Commun 13, 3224 (2022). https://doi.org/10.1038/s41467-022-30755-0
 

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -39,16 +39,6 @@
 
 html_theme = 'furo'
 html_static_path = ["_static"]
-html_theme_options = {
-    "light_css_variables": {
-        "color-brand-primary": "#2980B9",
-        "color-brand-content": "#2980B9",
-    },
-    "dark_css_variables": {
-        "color-brand-primary": "#2980B9",
-        "color-brand-content": "#2980B9",
-    },
-}
 html_context = dict(
     display_github=True,
     github_user='saezlab',

diff --git a/docs/source/notebooks/basic_usage.ipynb b/docs/source/notebooks/basic_usage.ipynb
diff --git a/docs/source/notebooks/misty.ipynb b/docs/source/notebooks/misty.ipynb
diff --git a/docs/source/notebooks/mofatalk.ipynb b/docs/source/notebooks/mofatalk.ipynb
diff --git a/docs/source/reference.rst b/docs/source/reference.rst
@@ -1,8 +1,8 @@
 Reference
 ----------
 
-Dimitrov D., Schäfer P.S.L, Farr E., Rodriguez Mier P., Lobentanzer S., Dugourd A., Tanevski J., Ramirez Flores R.O. and Saez-Rodriguez J. 2023 LIANA+: an all-in-one cell-cell communication framework. BioRxiv. https://www.biorxiv.org/content/10.1101/2023.08.19.553863v1
+Dimitrov D., Schäfer P.S.L, Farr E., Rodriguez Mier P., Lobentanzer S., Badia-i-Mompel P., Dugourd A., Tanevski J., Ramirez Flores R.O. and Saez-Rodriguez J. LIANA+ provides an all-in-one framework for cell–cell communication inference. Nat Cell Biol (2024). https://doi.org/10.1038/s41556-024-01469-w
 
 Dimitrov, D., Türei, D., Garrido-Rodriguez M., Burmedi P.L., Nagai, J.S., Boys, C., Flores, R.O.R., Kim, H., Szalai, B., Costa, I.G., Valdeolivas, A., Dugourd, A. and Saez-Rodriguez, J. Comparison of methods and resources for cell-cell communication inference from single-cell RNA-Seq data. Nat Commun 13, 3224 (2022). https://doi.org/10.1038/s41467-022-30755-0
 
-Similarly, please consider citing any of the methods and/or resources implemented in liana, that were particularly relevant for your research!
+Similarly, please consider citing any of the methods and/or resources implemented in liana, that were relevant for your research!
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -1,6 +1,17 @@
 Changelog
 =============
 
+1.4.0 (02.09.2024)
+
+- Now published in Nat Cell Biol.
+
+- Correctly referred to PK tutorial for orthology conversion
+
+- Added ``batch_key`` and ``min_var_nbatches`` to control the way batches are selected in ``li.multi.lrs_to_views``.
+  This might result in minor differences in how many interactions are considered per view, as the order of filtering was also changed.
+
+- Changed ``max_neighbours`` in ``li.ut.spatial_neighbors`` to be a fixed number (default=100), rather than a fraction of the spots as this was making RAM explode for large spatial formats.
+
 1.3.0 (12.07.2024)
 
 - Minor improvements to documentation, specifically changed to the furo theme. Resolved issues with latex not being rendered and plot sizes being off.

diff --git a/liana/__init__.py b/liana/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '1.3.0'
+__version__ = '1.4.0'
 
 from liana import method as mt, plotting as pl, resource as rs, multi as mu, utils as ut, testing
 

diff --git a/liana/method/sp/_misty/_misty_constructs.py b/liana/method/sp/_misty/_misty_constructs.py
@@ -50,12 +50,13 @@ def genericMistyData(intra,
                      add_para=True,
                      spatial_key='spatial',
                      set_diag=False,
-                     kernel = 'misty_rbf', ## TODO change to gaussian kernel
+                     kernel = 'misty_rbf',
                      bandwidth = 100,
                      zoi = 0,
                      cutoff = 0.1,
                      add_juxta=True,
                      n_neighs = 6,
+                     max_neighs = 18,
                      verbose=False
                      ):
 
@@ -98,6 +99,8 @@ def genericMistyData(intra,
         A bandwidth of 5 times the bandwidth of the paraview is used to ensure that the nearest neighbors are within the radius.
     n_neighs : `int`, optional (default: 6)
         The number of neighbors to consider when constructing the juxtaview.
+    max_neighs : `int`, optional (default: 18)
+        The maximum number of neighbors to consider when constructing the paraview.
     verbose : `bool`, optional (default: False)
         Whether to print progress.
 
@@ -135,6 +138,7 @@ def genericMistyData(intra,
                                     bandwidth=bandwidth,
                                     kernel=kernel,
                                     set_diag=set_diag,
+                                    max_neighbours=max_neighs,
                                     inplace=False,
                                     cutoff=cutoff,
                                     zoi=zoi
@@ -165,7 +169,7 @@ def lrMistyData(adata,
                 nz_threshold=0.1,
                 use_raw = False,
                 layer = None,
-                spatial_key='spatial', ## TODO Change to Gaussian kernel
+                spatial_key='spatial',
                 kernel = 'misty_rbf',
                 bandwidth = 100,
                 set_diag = False,

diff --git a/liana/multi/to_mudata.py b/liana/multi/to_mudata.py
@@ -129,6 +129,8 @@ def lrs_to_views(adata: AnnData,
                  lrs_per_sample:int = 10,
                  samples_per_view: int = 3,
                  min_variance:int = 0,
+                 min_var_nbatches = 1,
+                 batch_key=None,
                  lr_sep: str = V.lr_sep,
                  cell_sep: str='&',
                  var_sep: str=':',
@@ -164,6 +166,11 @@ def lrs_to_views(adata: AnnData,
     min_variance
         Reflects the minimum required variance across samples for each interaction in each view.
         NaNs are ignored when computing the variance.
+    batch_key
+        Key in `adata.obs` that represents the batch information. Used solely when computing the variance.
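+        If batch_key is not `None`, the variance is computed per batch, and an interaction is kept only if its variance exceeds `min_variance` in at least `min_var_nbatches` batches.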
+    min_var_nbatches
+        Reflects the minimum number of batches (>=) that must have a variance above `min_variance` for an interaction to be included in the view.
     %(lr_sep)s
     cell_sep
         Separator to use for the cell names in the views.
@@ -209,7 +216,10 @@ def lrs_to_views(adata: AnnData,
     # concat columns (needed for MOFA)
     liana_res['interaction'] = liana_res[ligand_key] + lr_sep + liana_res[receptor_key]
     liana_res['ct_pair'] = liana_res[source_key] + cell_sep + liana_res[target_key]
-    liana_res = liana_res[[sample_key, 'ct_pair', 'interaction', score_key]]
+    keys = [sample_key, 'ct_pair', 'interaction', score_key]
+    if batch_key is not None:
+        keys.append(batch_key)
+    liana_res = liana_res[keys]
 
     # get scores & invert if necessary
     liana_res = process_scores(liana_res=liana_res,
@@ -218,9 +228,8 @@ def lrs_to_views(adata: AnnData,
 
     # count samples per interaction
     count_pairs = (liana_res.
-                   drop(columns=score_key).
                    groupby(['interaction', 'ct_pair']).
-                   count().
+                   count()[[sample_key]].
                    rename(columns={sample_key: 'count'}).
                    reset_index()
                    )
@@ -232,8 +241,7 @@ def lrs_to_views(adata: AnnData,
     liana_res = liana_res.merge(count_pairs.drop(columns='count') , how='inner')
 
     # Keep only samples above a certain number of LRs
-    count_lrs = (liana_res.
-                 drop(columns=score_key).
+    count_lrs = (liana_res[[sample_key, 'ct_pair', 'interaction']].
                  groupby([sample_key, 'ct_pair']).
                  count().
                  rename(columns={'interaction': 'count'}).
@@ -243,28 +251,26 @@ def lrs_to_views(adata: AnnData,
     liana_res = liana_res.merge(count_lrs.drop(columns='count') , how='inner')
 
     # convert to anndata views
-    views = liana_res['ct_pair'].unique()
-    views = tqdm(views, disable=not verbose)
-
     lr_adatas = {}
+    views = tqdm(liana_res['ct_pair'].unique(), disable=not verbose)
     for view in views:
         lrs_per_ct = liana_res[liana_res['ct_pair']==view]
-        lrs_wide = lrs_per_ct.pivot(index='interaction',
-                                    columns=sample_key,
-                                    values=score_key)
-
+        index = 'interaction' if batch_key is None else ['interaction', batch_key]
+        # check variance
+        ints_to_keep = (lrs_per_ct.groupby(index).apply(lambda x: np.nanvar(x[score_key])) > min_variance).groupby('interaction').sum() >= min_var_nbatches
+        ints_to_keep = ints_to_keep[ints_to_keep].index
+
+        lrs_wide = lrs_per_ct[lrs_per_ct['interaction'].isin(ints_to_keep)].\
+            pivot(index='interaction',
+                  columns=sample_key,
+                  values=score_key)
         lrs_wide.index = view + var_sep + lrs_wide.index
         lrs_wide = lrs_wide.replace(np.nan, lr_fill)
 
         if lrs_wide.shape[0] >= lrs_per_view: # check if enough LRs
             temp = _dataframe_to_anndata(lrs_wide)
-
-            # keep only variables with variance > min_variance
-            temp = temp[:, np.nanvar(temp.X, axis=0) > min_variance]
-
             if (temp.shape[0] >= samples_per_view): # check if enough samples
                 lr_adatas[view] = temp
-
     # to mdata
     mdata = MuData(lr_adatas)
 
@@ -350,7 +356,6 @@ def filter_view_markers(mdata: MuData,
 def _process_meta(adata, mdata, sample_key, obs_keys):
     if obs_keys is not None:
         metadata = adata.obs[[sample_key, *obs_keys]].drop_duplicates()
-
         sample_n = adata.obs[sample_key].nunique()
         if metadata.shape[0] != sample_n:
             raise ValueError('`obs_keys` must be unique per sample in `adata.obs`')

diff --git a/liana/tests/test_misty.py b/liana/tests/test_misty.py
@@ -36,6 +36,7 @@ def test_misty_bypass():
                              bandwidth=10,
                              add_juxta=True,
                              set_diag=True,
+                             max_neighs=10,
                              cutoff=0)
     misty(model=RandomForestModel, alphas=1, bypass_intra=True, seed=42, n_estimators=11)
     assert np.isin(['juxta', 'para'], misty.uns['target_metrics'].columns).all()

diff --git a/liana/tests/test_multi.py b/liana/tests/test_multi.py
@@ -57,6 +57,52 @@ def test_lrs_to_views():
     assert len(mdata.varm_keys())==3
 
 
+def test_lrs_to_views_batch():
+    adata = generate_toy_adata()
+    adata.obs['batch'] = 1
+    adata2 = adata.copy()
+    adata2.obs['batch'] = 2
+    adata2.obs['sample'] = adata2.obs['sample'].apply(lambda x: x+'2')
+    adata3 = adata.copy()
+    adata3.obs['sample'] = adata3.obs['sample'].apply(lambda x: x+'3')
+    adata = adata.concatenate([adata2, adata3], join='inner', batch_key='sample_number')
+
+    liana_res = sample_lrs(by_sample=True)
+    liana_res2 = liana_res.copy()
+    liana_res2['sample'] = liana_res['sample'].apply(lambda x: x+'2')
+    liana_res['batch']=1
+    liana_res2['batch']=2
+    liana_res3 = liana_res.copy()
+    liana_res3['sample'] = liana_res3['sample'].apply(lambda x: x+'3')
+    # add some variance
+    liana_res2['specificity_rank'] = liana_res2['specificity_rank'] + 0.1
+    liana_res3['specificity_rank'] = liana_res3['specificity_rank'] + 0.2
+    liana_res = pd.concat([liana_res, liana_res2, liana_res3])
+    adata.uns['liana_results'] = liana_res
+
+    mdata = lrs_to_views(adata=adata,
+                         sample_key='sample',
+                         score_key='specificity_rank',
+                         uns_key = 'liana_results',
+                         obs_keys = ['case', 'batch'],
+                         source_key='source',
+                         target_key='target',
+                         ligand_key='ligand_complex',
+                         receptor_key='receptor_complex',
+                         lr_prop=0.1,
+                         lrs_per_sample=1,
+                         lrs_per_view=5,
+                         samples_per_view=0,
+                         min_variance=0,
+                         batch_key='batch',
+                         min_var_nbatches=1,
+                         verbose=True
+                         )
+
+    assert mdata.shape == (12, 16)
+    assert 'case' in mdata.obs.columns
+    assert 'batch' in mdata.obs.columns
+    assert len(mdata.varm_keys())==3
 
 def test_adata_to_views():
     """Test adata_to_views."""

diff --git a/liana/utils/spatial_neighbors.py b/liana/utils/spatial_neighbors.py
@@ -24,7 +24,7 @@ def _linear(distance_mtx, bandwidth):
 def spatial_neighbors(adata: AnnData,
                       bandwidth=None,
                       cutoff=0.1,
-                      max_neighbours=None,
+                      max_neighbours=100,
                       kernel='gaussian',
                       set_diag=False,
                       zoi=0,
@@ -48,7 +48,7 @@ def spatial_neighbors(adata: AnnData,
         Values below this cutoff will be set to 0.
     max_neighbours
         Maximum nearest neighbours to be considered when generating spatial connectivity weights.
-        Essentially, the maximum number of edges in the graph. Default is `None`, which will use n = adata.shape[0]/10.
+        Essentially, the maximum number of edges in the spatial connectivity graph.
     kernel
         Kernel function used to generate connectivity weights.
         It controls the shape of the connectivity weights.
@@ -100,9 +100,6 @@ def spatial_neighbors(adata: AnnData,
     else:
         _reference = reference
 
-    if max_neighbours is None:
-        max_neighbours = int(adata.shape[0] / 10)
-
     tree = NearestNeighbors(n_neighbors=max_neighbours + 1, # +1 to exclude self
                             algorithm='ball_tree',
                             metric='euclidean').fit(_reference)