diff --git a/docs/references.bib b/docs/references.bib index cd41386..4059e85 100644 --- a/docs/references.bib +++ b/docs/references.bib @@ -1,17 +1,37 @@ -@article{Wolf2018, - author = {Wolf, F. Alexander - and Angerer, Philipp - and Theis, Fabian J.}, - title = {SCANPY: large-scale single-cell gene expression data analysis}, - journal = {Genome Biology}, - year = {2018}, - month = {Feb}, - day = {06}, - volume = {19}, - number = {1}, - pages = {15}, - abstract = {Scanpy is a scalable toolkit for analyzing single-cell gene expression data. It includes methods for preprocessing, visualization, clustering, pseudotime and trajectory inference, differential expression testing, and simulation of gene regulatory networks. Its Python-based implementation efficiently deals with data sets of more than one million cells (https://github.com/theislab/Scanpy). Along with Scanpy, we present AnnData, a generic class for handling annotated data matrices (https://github.com/theislab/anndata).}, - issn = {1474-760X}, - doi = {10.1186/s13059-017-1382-0}, - url = {https://doi.org/10.1186/s13059-017-1382-0} +@article{van_gassen_flowsom_2015, + title = {{FlowSOM}: {Using} self-organizing maps for visualization and interpretation of cytometry data}, + volume = {87}, + issn = {1552-4930}, + shorttitle = {{FlowSOM}}, + doi = {10.1002/cyto.a.22625}, + abstract = {The number of markers measured in both flow and mass cytometry keeps increasing steadily. Although this provides a wealth of information, it becomes infeasible to analyze these datasets manually. When using 2D scatter plots, the number of possible plots increases exponentially with the number of markers and therefore, relevant information that is present in the data might be missed. In this article, we introduce a new visualization technique, called FlowSOM, which analyzes Flow or mass cytometry data using a Self-Organizing Map. Using a two-level clustering and star charts, our algorithm helps to obtain a clear overview of how all markers are behaving on all cells, and to detect subsets that might be missed otherwise. R code is available at https://github.com/SofieVG/FlowSOM and will be made available at Bioconductor.}, + language = {eng}, + number = {7}, + journal = {Cytometry. Part A: The Journal of the International Society for Analytical Cytology}, + author = {Van Gassen, Sofie and Callebaut, Britt and Van Helden, Mary J. and Lambrecht, Bart N. and Demeester, Piet and Dhaene, Tom and Saeys, Yvan}, + month = jul, + year = {2015}, + pmid = {25573116}, + note = {Number: 7}, + keywords = {bioinformatics, mass cytometry, Flow Cytometry, Humans, Computational Biology, Algorithms, Cluster Analysis, Biomarkers, exploratory data analysis, Graft vs Host Disease, Hematopoietic Stem Cell Transplantation, Key terms: polychromatic flow cytometry, Lymphoma, B-Cell, self-organizing map, visualization method, West Nile Fever}, + pages = {636--645}, } +@article{quintelier_analyzing_2021, + title = {Analyzing high-dimensional cytometry data using {FlowSOM}}, + volume = {16}, + copyright = {2021 The Author(s), under exclusive licence to Springer Nature Limited}, + issn = {1750-2799}, + url = {https://www.nature.com/articles/s41596-021-00550-0}, + doi = {10.1038/s41596-021-00550-0}, + abstract = {The dimensionality of cytometry data has strongly increased in the last decade, and in many situations the traditional manual downstream analysis becomes insufficient. The field is therefore slowly moving toward more automated approaches, and in this paper we describe the protocol for analyzing high-dimensional cytometry data using FlowSOM, a clustering and visualization algorithm based on a self-organizing map. FlowSOM is used to distinguish cell populations from cytometry data in an unsupervised way and can help to gain deeper insights in fields such as immunology and oncology. Since the original FlowSOM publication (2015), we have validated the tool on a wide variety of datasets, and to write this protocol, we made use of this experience to improve the user-friendliness of the package (e.g., comprehensive functions replacing commonly required scripts). Where the original paper focused mainly on the algorithm description, this protocol offers user guidelines on how to implement the procedure, detailed parameter descriptions and troubleshooting recommendations. The protocol provides clearly annotated R code, and is therefore relevant for all scientists interested in computational high-dimensional analyses without requiring a strong bioinformatics background. We demonstrate the complete workflow, starting from data preparation (such as compensation, transformation and quality control), including detailed discussion of the different FlowSOM parameters and visualization options, and concluding with how the results can be further used to answer biological questions, such as statistical comparison between groups of interest. An average FlowSOM analysis takes 1–3 h to complete, though quality issues can increase this time considerably.}, + language = {en}, + number = {8}, + urldate = {2022-08-26}, + journal = {Nature Protocols}, + author = {Quintelier, Katrien and Couckuyt, Artuur and Emmaneel, Annelies and Aerts, Joachim and Saeys, Yvan and Van Gassen, Sofie}, + month = aug, + year = {2021}, + note = {Number: 8 Publisher: Nature Publishing Group}, + keywords = {Software, Flow cytometry, Data processing}, + pages = {3775--3801}, +} \ No newline at end of file diff --git a/src/FlowSOM/__init__.py b/src/FlowSOM/__init__.py index 4e1528e..98861d9 100644 --- a/src/FlowSOM/__init__.py +++ b/src/FlowSOM/__init__.py @@ -3,26 +3,6 @@ from . import pl from . import main -# import pooch - __all__ = ["pl", "main"] __version__ = version("FlowSOM") - -""" -example_dataset = pooch.create( - path=pooch.os_cache("example_data_set"), - base_url="https://dl01.irc.ugent.be/", - version="0.1.0", - # overwriting data path with environment variable - env="FlowSOM", - registry={ - "flow/FlowRepository_FR-FCM-ZZPH/Levine_13dim.fcs": None, - "flow/FlowRepository_FR-FCM-ZZPH_zarr/Levine_13dim.zarr": None, - "flow/pixie/fov0.feather": None, - } - # TODO: create registry file with hashes - # registry=None, -) - -""" diff --git a/src/FlowSOM/main.py b/src/FlowSOM/main.py index dd13b6e..0824661 100644 --- a/src/FlowSOM/main.py +++ b/src/FlowSOM/main.py @@ -41,7 +41,7 @@ def __init__(self, inp, cols_to_use: np.array = None, n_clus=10, max_meta=None, self.build_SOM(cols_to_use, **kwargs) self.build_MST() self.metacluster(n_clus) - self.update_derived_values() + self._update_derived_values() def read_input(self, inp): """Converts input to a FlowSOM AnnData object @@ -84,7 +84,7 @@ def build_SOM(self, cols_to_use: np.array = None, outlier_MAD=4, **kwargs): self.mudata["cell_data"].var["cols_used"] = [x in cols_to_use for x in self.mudata["cell_data"].var_names] self.mudata["cell_data"].uns["n_nodes"] = xdim * ydim - self = self.update_derived_values() + self = self._update_derived_values() self.mudata["cluster_data"].uns["xdim"] = xdim self.mudata["cluster_data"].uns["ydim"] = ydim self.mudata["cluster_data"].obsm["codes"] = np.array(codes) @@ -166,7 +166,7 @@ def SOM( seed=seed, ) if mst != 1: - nhbrdist = self.dist_mst(codes) + nhbrdist = self._dist_mst(codes) if map: clusters, dists = map_data_to_codes(data=data, codes=codes) @@ -175,7 +175,7 @@ def SOM( return codes, clusters, dists, xdim, ydim - def update_derived_values(self): + def _update_derived_values(self): """Update the derived values such as median values and CV values""" df = self.mudata["cell_data"].X # [self.adata.X[:, 0].argsort()] df = np.c_[self.mudata["cell_data"].obs["clustering"], df] @@ -237,7 +237,7 @@ def build_MST(self): self.mudata["cluster_data"].uns["graph"] = MST_graph return self - def dist_mst(self, codes): + def _dist_mst(self, codes): adjacency = cdist( codes, codes, @@ -257,7 +257,6 @@ def metacluster(self, n_clus): :param n_clus: The number of metaclusters :type n_clus: int """ - # metaclusters = (self.hierarchical_clustering(self.mudata["cluster_data"].obsm["codes"], n_clus)) metaclusters = self.consensus_hierarchical_clustering(self.mudata["cluster_data"].obsm["codes"], n_clus) self.mudata["cell_data"].uns["n_metaclusters"] = n_clus self.mudata["cluster_data"].obs["metaclustering"] = metaclusters.astype(str) @@ -266,12 +265,6 @@ def metacluster(self, n_clus): ) return self - def hierarchical_clustering(self, data, n_clus, linkage="average"): - average = AgglomerativeClustering(n_clusters=n_clus, linkage=linkage) - metaclustering = average.fit(data) - metaclusters = metaclustering.labels_ - return metaclusters - def consensus_hierarchical_clustering( self, data, n_clus, n_subsamples=100, linkage="average", resample_proportion=0.9 ): @@ -403,7 +396,7 @@ def new_data(self, inp, mad_allowed=4): clusters, dists = map_data_to_codes(data=data, codes=self.get_cluster_data().obsm["codes"]) fsom_new.get_cell_data().obsm["distance_to_bmu"] = np.array(dists) fsom_new.get_cell_data().obs["clustering"] = np.array(clusters) - fsom_new = fsom_new.update_derived_values() + fsom_new = fsom_new._update_derived_values() metaclusters = self.get_cluster_data().obs["metaclustering"] fsom_new.get_cell_data().obs["metaclustering"] = np.asarray( [np.array(metaclusters)[int(i)] for i in np.asarray(fsom_new.get_cell_data().obs["clustering"])] @@ -420,7 +413,7 @@ def subset(self, ids): """ fsom_subset = self fsom_subset.mudata.mod["cell_data"] = fsom_subset.mudata["cell_data"][ids, :] - fsom_subset = fsom_subset.update_derived_values() + fsom_subset = fsom_subset._update_derived_values() return fsom_subset def get_cell_data(self): @@ -848,3 +841,22 @@ def read_FCS(filepath): f.var.rename(index=index_markers, inplace=True) f.uns["meta"]["channels"]["$PnS"] = [index_markers[key] for key in f.uns["meta"]["channels"].index] return f + + +def normalize_estimate_logicle(adata, channels, m=4.5, q=0.05): + assert isinstance(adata, ad.AnnData), f"Please provide an AnnData object" + assert isinstance(channels, list), f"Please provide a list of channels" + channels = list(get_markers(adata, channels).keys()) + assert all([i in adata.var_names for i in channels]), f"Channels should be in the AnnData object" + neg_marker_quantiles = [ + np.quantile(adata[:, channel].X[adata[:, channel].X < 0], q) if (adata[:, channel].X < 0).any() else 0.5 + for channel in channels + ] + neg_marker_quantiles = pd.Series(neg_marker_quantiles, index=channels, dtype=float) + max_range = adata.var["$PnR"][channels].astype(float) + w = (m - np.log10(max_range / np.abs(neg_marker_quantiles))) / 2 + for channel in channels: + adata[:, channel].X = pm.tools.normalize_logicle( + adata[:, channel], t=max_range[channel], m=m, a=0, w=w[channel], inplace=False + ).X + return adata diff --git a/tests/data/not_preprocessed.fcs b/tests/data/not_preprocessed.fcs new file mode 100644 index 0000000..7a62316 Binary files /dev/null and b/tests/data/not_preprocessed.fcs differ