Added estimate logicle, remove exporting functions
artuurC committed Sep 6, 2023
1 parent d84e5ac commit ac59264
Showing 4 changed files with 62 additions and 50 deletions.
52 changes: 36 additions & 16 deletions docs/references.bib
@@ -1,17 +1,37 @@
@article{Wolf2018,
author = {Wolf, F. Alexander
and Angerer, Philipp
and Theis, Fabian J.},
title = {SCANPY: large-scale single-cell gene expression data analysis},
journal = {Genome Biology},
year = {2018},
month = {Feb},
day = {06},
volume = {19},
number = {1},
pages = {15},
abstract = {Scanpy is a scalable toolkit for analyzing single-cell gene expression data. It includes methods for preprocessing, visualization, clustering, pseudotime and trajectory inference, differential expression testing, and simulation of gene regulatory networks. Its Python-based implementation efficiently deals with data sets of more than one million cells (https://github.com/theislab/Scanpy). Along with Scanpy, we present AnnData, a generic class for handling annotated data matrices (https://github.com/theislab/anndata).},
issn = {1474-760X},
doi = {10.1186/s13059-017-1382-0},
url = {https://doi.org/10.1186/s13059-017-1382-0}
}
@article{van_gassen_flowsom_2015,
title = {{FlowSOM}: {Using} self-organizing maps for visualization and interpretation of cytometry data},
volume = {87},
issn = {1552-4930},
shorttitle = {{FlowSOM}},
doi = {10.1002/cyto.a.22625},
abstract = {The number of markers measured in both flow and mass cytometry keeps increasing steadily. Although this provides a wealth of information, it becomes infeasible to analyze these datasets manually. When using 2D scatter plots, the number of possible plots increases exponentially with the number of markers and therefore, relevant information that is present in the data might be missed. In this article, we introduce a new visualization technique, called FlowSOM, which analyzes Flow or mass cytometry data using a Self-Organizing Map. Using a two-level clustering and star charts, our algorithm helps to obtain a clear overview of how all markers are behaving on all cells, and to detect subsets that might be missed otherwise. R code is available at https://github.com/SofieVG/FlowSOM and will be made available at Bioconductor.},
language = {eng},
number = {7},
journal = {Cytometry. Part A: The Journal of the International Society for Analytical Cytology},
author = {Van Gassen, Sofie and Callebaut, Britt and Van Helden, Mary J. and Lambrecht, Bart N. and Demeester, Piet and Dhaene, Tom and Saeys, Yvan},
month = jul,
year = {2015},
pmid = {25573116},
note = {Number: 7},
keywords = {bioinformatics, mass cytometry, Flow Cytometry, Humans, Computational Biology, Algorithms, Cluster Analysis, Biomarkers, exploratory data analysis, Graft vs Host Disease, Hematopoietic Stem Cell Transplantation, Key terms: polychromatic flow cytometry, Lymphoma, B-Cell, self-organizing map, visualization method, West Nile Fever},
pages = {636--645},
}
@article{quintelier_analyzing_2021,
title = {Analyzing high-dimensional cytometry data using {FlowSOM}},
volume = {16},
copyright = {2021 The Author(s), under exclusive licence to Springer Nature Limited},
issn = {1750-2799},
url = {https://www.nature.com/articles/s41596-021-00550-0},
doi = {10.1038/s41596-021-00550-0},
abstract = {The dimensionality of cytometry data has strongly increased in the last decade, and in many situations the traditional manual downstream analysis becomes insufficient. The field is therefore slowly moving toward more automated approaches, and in this paper we describe the protocol for analyzing high-dimensional cytometry data using FlowSOM, a clustering and visualization algorithm based on a self-organizing map. FlowSOM is used to distinguish cell populations from cytometry data in an unsupervised way and can help to gain deeper insights in fields such as immunology and oncology. Since the original FlowSOM publication (2015), we have validated the tool on a wide variety of datasets, and to write this protocol, we made use of this experience to improve the user-friendliness of the package (e.g., comprehensive functions replacing commonly required scripts). Where the original paper focused mainly on the algorithm description, this protocol offers user guidelines on how to implement the procedure, detailed parameter descriptions and troubleshooting recommendations. The protocol provides clearly annotated R code, and is therefore relevant for all scientists interested in computational high-dimensional analyses without requiring a strong bioinformatics background. We demonstrate the complete workflow, starting from data preparation (such as compensation, transformation and quality control), including detailed discussion of the different FlowSOM parameters and visualization options, and concluding with how the results can be further used to answer biological questions, such as statistical comparison between groups of interest. An average FlowSOM analysis takes 1–3 h to complete, though quality issues can increase this time considerably.},
language = {en},
number = {8},
urldate = {2022-08-26},
journal = {Nature Protocols},
author = {Quintelier, Katrien and Couckuyt, Artuur and Emmaneel, Annelies and Aerts, Joachim and Saeys, Yvan and Van Gassen, Sofie},
month = aug,
year = {2021},
note = {Number: 8 Publisher: Nature Publishing Group},
keywords = {Software, Flow cytometry, Data processing},
pages = {3775--3801},
}
20 changes: 0 additions & 20 deletions src/FlowSOM/__init__.py
@@ -3,26 +3,6 @@
from . import pl
from . import main

# import pooch

__all__ = ["pl", "main"]

__version__ = version("FlowSOM")

"""
example_dataset = pooch.create(
path=pooch.os_cache("example_data_set"),
base_url="https://dl01.irc.ugent.be/",
version="0.1.0",
# overwriting data path with environment variable
env="FlowSOM",
registry={
"flow/FlowRepository_FR-FCM-ZZPH/Levine_13dim.fcs": None,
"flow/FlowRepository_FR-FCM-ZZPH_zarr/Levine_13dim.zarr": None,
"flow/pixie/fov0.feather": None,
}
# TODO: create registry file with hashes
# registry=None,
)
"""
40 changes: 26 additions & 14 deletions src/FlowSOM/main.py
@@ -41,7 +41,7 @@ def __init__(self, inp, cols_to_use: np.array = None, n_clus=10, max_meta=None,
self.build_SOM(cols_to_use, **kwargs)
self.build_MST()
self.metacluster(n_clus)
self.update_derived_values()
self._update_derived_values()

def read_input(self, inp):
"""Converts input to a FlowSOM AnnData object
@@ -84,7 +84,7 @@ def build_SOM(self, cols_to_use: np.array = None, outlier_MAD=4, **kwargs):
self.mudata["cell_data"].var["cols_used"] = [x in cols_to_use for x in self.mudata["cell_data"].var_names]

self.mudata["cell_data"].uns["n_nodes"] = xdim * ydim
self = self.update_derived_values()
self = self._update_derived_values()
self.mudata["cluster_data"].uns["xdim"] = xdim
self.mudata["cluster_data"].uns["ydim"] = ydim
self.mudata["cluster_data"].obsm["codes"] = np.array(codes)
@@ -166,7 +166,7 @@ def SOM(
seed=seed,
)
if mst != 1:
nhbrdist = self.dist_mst(codes)
nhbrdist = self._dist_mst(codes)

if map:
clusters, dists = map_data_to_codes(data=data, codes=codes)
@@ -175,7 +175,7 @@

return codes, clusters, dists, xdim, ydim

def update_derived_values(self):
def _update_derived_values(self):
"""Update the derived values such as median values and CV values"""
df = self.mudata["cell_data"].X # [self.adata.X[:, 0].argsort()]
df = np.c_[self.mudata["cell_data"].obs["clustering"], df]
@@ -237,7 +237,7 @@ def build_MST(self):
self.mudata["cluster_data"].uns["graph"] = MST_graph
return self

def dist_mst(self, codes):
def _dist_mst(self, codes):
adjacency = cdist(
codes,
codes,
@@ -257,7 +257,6 @@ def metacluster(self, n_clus):
:param n_clus: The number of metaclusters
:type n_clus: int
"""
# metaclusters = (self.hierarchical_clustering(self.mudata["cluster_data"].obsm["codes"], n_clus))
metaclusters = self.consensus_hierarchical_clustering(self.mudata["cluster_data"].obsm["codes"], n_clus)
self.mudata["cell_data"].uns["n_metaclusters"] = n_clus
self.mudata["cluster_data"].obs["metaclustering"] = metaclusters.astype(str)
@@ -266,12 +265,6 @@
)
return self

def hierarchical_clustering(self, data, n_clus, linkage="average"):
average = AgglomerativeClustering(n_clusters=n_clus, linkage=linkage)
metaclustering = average.fit(data)
metaclusters = metaclustering.labels_
return metaclusters

def consensus_hierarchical_clustering(
self, data, n_clus, n_subsamples=100, linkage="average", resample_proportion=0.9
):
@@ -403,7 +396,7 @@ def new_data(self, inp, mad_allowed=4):
clusters, dists = map_data_to_codes(data=data, codes=self.get_cluster_data().obsm["codes"])
fsom_new.get_cell_data().obsm["distance_to_bmu"] = np.array(dists)
fsom_new.get_cell_data().obs["clustering"] = np.array(clusters)
fsom_new = fsom_new.update_derived_values()
fsom_new = fsom_new._update_derived_values()
metaclusters = self.get_cluster_data().obs["metaclustering"]
fsom_new.get_cell_data().obs["metaclustering"] = np.asarray(
[np.array(metaclusters)[int(i)] for i in np.asarray(fsom_new.get_cell_data().obs["clustering"])]
@@ -420,7 +413,7 @@ def subset(self, ids):
"""
fsom_subset = self
fsom_subset.mudata.mod["cell_data"] = fsom_subset.mudata["cell_data"][ids, :]
fsom_subset = fsom_subset.update_derived_values()
fsom_subset = fsom_subset._update_derived_values()
return fsom_subset

def get_cell_data(self):
@@ -848,3 +841,22 @@ def read_FCS(filepath):
f.var.rename(index=index_markers, inplace=True)
f.uns["meta"]["channels"]["$PnS"] = [index_markers[key] for key in f.uns["meta"]["channels"].index]
return f


def normalize_estimate_logicle(adata, channels, m=4.5, q=0.05):
    """Estimate logicle transformation parameters per channel and apply the transformation

    :param adata: An AnnData object with the data to transform
    :param channels: A list of channels to transform
    :param m: The number of decades the transformation spans
    :param q: The quantile of the negative values used to estimate the width parameter w
    """
    assert isinstance(adata, ad.AnnData), "Please provide an AnnData object"
    assert isinstance(channels, list), "Please provide a list of channels"
    channels = list(get_markers(adata, channels).keys())
    assert all(i in adata.var_names for i in channels), "Channels should be in the AnnData object"
    # Reference value r per channel: the q-quantile of the negative events,
    # or 0.5 for channels without negative values
    neg_marker_quantiles = [
        np.quantile(adata[:, channel].X[adata[:, channel].X < 0], q) if (adata[:, channel].X < 0).any() else 0.5
        for channel in channels
    ]
    neg_marker_quantiles = pd.Series(neg_marker_quantiles, index=channels, dtype=float)
    # Estimate the linearization width from the top of scale ($PnR) and r:
    # w = (m - log10(t / |r|)) / 2
    max_range = adata.var["$PnR"][channels].astype(float)
    w = (m - np.log10(max_range / np.abs(neg_marker_quantiles))) / 2
    for channel in channels:
        adata[:, channel].X = pm.tools.normalize_logicle(
            adata[:, channel], t=max_range[channel], m=m, a=0, w=w[channel], inplace=False
        ).X
    return adata
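
For intuition, the width estimate used above can be checked by hand. A minimal sketch with hypothetical numbers (the $PnR value and negative quantile below are made up for illustration):

import numpy as np

# Hypothetical channel: top of scale $PnR = 262144 and a 5% negative-event
# quantile of -111 give a linearization width of roughly 0.56
m = 4.5
t = 262144.0  # channel range from $PnR
r = -111.0    # q-quantile of the negative events
w = (m - np.log10(t / abs(r))) / 2
print(round(w, 2))  # ~0.56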
Binary file added tests/data/not_preprocessed.fcs
Binary file not shown.
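
A minimal usage sketch for the new helper, assuming it is applied to an FCS file such as the test file added in this commit; the marker names passed to channels are purely illustrative:

from FlowSOM.main import read_FCS, normalize_estimate_logicle

ff = read_FCS("tests/data/not_preprocessed.fcs")
# Estimates per-channel logicle parameters from the data and transforms the
# listed channels; m and q match the function defaults
ff = normalize_estimate_logicle(ff, channels=["CD3", "CD19"], m=4.5, q=0.05)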
