Added estimate logicle, remove exporting functions
artuurC committed Sep 6, 2023
1 parent d84e5ac commit ac59264
Showing 4 changed files with 62 additions and 50 deletions.
52 changes: 36 additions & 16 deletions docs/references.bib
@@ -1,17 +1,37 @@
@article{Wolf2018,
author = {Wolf, F. Alexander
and Angerer, Philipp
and Theis, Fabian J.},
title = {SCANPY: large-scale single-cell gene expression data analysis},
journal = {Genome Biology},
year = {2018},
month = {Feb},
day = {06},
volume = {19},
number = {1},
pages = {15},
abstract = {Scanpy is a scalable toolkit for analyzing single-cell gene expression data. It includes methods for preprocessing, visualization, clustering, pseudotime and trajectory inference, differential expression testing, and simulation of gene regulatory networks. Its Python-based implementation efficiently deals with data sets of more than one million cells (https://github.com/theislab/Scanpy). Along with Scanpy, we present AnnData, a generic class for handling annotated data matrices (https://github.com/theislab/anndata).},
issn = {1474-760X},
doi = {10.1186/s13059-017-1382-0},
url = {https://doi.org/10.1186/s13059-017-1382-0}
}
@article{van_gassen_flowsom_2015,
title = {{FlowSOM}: {Using} self-organizing maps for visualization and interpretation of cytometry data},
volume = {87},
issn = {1552-4930},
shorttitle = {{FlowSOM}},
doi = {10.1002/cyto.a.22625},
abstract = {The number of markers measured in both flow and mass cytometry keeps increasing steadily. Although this provides a wealth of information, it becomes infeasible to analyze these datasets manually. When using 2D scatter plots, the number of possible plots increases exponentially with the number of markers and therefore, relevant information that is present in the data might be missed. In this article, we introduce a new visualization technique, called FlowSOM, which analyzes Flow or mass cytometry data using a Self-Organizing Map. Using a two-level clustering and star charts, our algorithm helps to obtain a clear overview of how all markers are behaving on all cells, and to detect subsets that might be missed otherwise. R code is available at https://github.com/SofieVG/FlowSOM and will be made available at Bioconductor.},
language = {eng},
number = {7},
journal = {Cytometry. Part A: The Journal of the International Society for Analytical Cytology},
author = {Van Gassen, Sofie and Callebaut, Britt and Van Helden, Mary J. and Lambrecht, Bart N. and Demeester, Piet and Dhaene, Tom and Saeys, Yvan},
month = jul,
year = {2015},
pmid = {25573116},
note = {Number: 7},
keywords = {bioinformatics, mass cytometry, Flow Cytometry, Humans, Computational Biology, Algorithms, Cluster Analysis, Biomarkers, exploratory data analysis, Graft vs Host Disease, Hematopoietic Stem Cell Transplantation, Key terms: polychromatic flow cytometry, Lymphoma, B-Cell, self-organizing map, visualization method, West Nile Fever},
pages = {636--645},
}
@article{quintelier_analyzing_2021,
title = {Analyzing high-dimensional cytometry data using {FlowSOM}},
volume = {16},
copyright = {2021 The Author(s), under exclusive licence to Springer Nature Limited},
issn = {1750-2799},
url = {https://www.nature.com/articles/s41596-021-00550-0},
doi = {10.1038/s41596-021-00550-0},
abstract = {The dimensionality of cytometry data has strongly increased in the last decade, and in many situations the traditional manual downstream analysis becomes insufficient. The field is therefore slowly moving toward more automated approaches, and in this paper we describe the protocol for analyzing high-dimensional cytometry data using FlowSOM, a clustering and visualization algorithm based on a self-organizing map. FlowSOM is used to distinguish cell populations from cytometry data in an unsupervised way and can help to gain deeper insights in fields such as immunology and oncology. Since the original FlowSOM publication (2015), we have validated the tool on a wide variety of datasets, and to write this protocol, we made use of this experience to improve the user-friendliness of the package (e.g., comprehensive functions replacing commonly required scripts). Where the original paper focused mainly on the algorithm description, this protocol offers user guidelines on how to implement the procedure, detailed parameter descriptions and troubleshooting recommendations. The protocol provides clearly annotated R code, and is therefore relevant for all scientists interested in computational high-dimensional analyses without requiring a strong bioinformatics background. We demonstrate the complete workflow, starting from data preparation (such as compensation, transformation and quality control), including detailed discussion of the different FlowSOM parameters and visualization options, and concluding with how the results can be further used to answer biological questions, such as statistical comparison between groups of interest. An average FlowSOM analysis takes 1–3 h to complete, though quality issues can increase this time considerably.},
language = {en},
number = {8},
urldate = {2022-08-26},
journal = {Nature Protocols},
author = {Quintelier, Katrien and Couckuyt, Artuur and Emmaneel, Annelies and Aerts, Joachim and Saeys, Yvan and Van Gassen, Sofie},
month = aug,
year = {2021},
note = {Number: 8 Publisher: Nature Publishing Group},
keywords = {Software, Flow cytometry, Data processing},
pages = {3775--3801},
}
20 changes: 0 additions & 20 deletions src/FlowSOM/__init__.py
@@ -3,26 +3,6 @@
from . import pl
from . import main

# import pooch

__all__ = ["pl", "main"]

__version__ = version("FlowSOM")

"""
example_dataset = pooch.create(
path=pooch.os_cache("example_data_set"),
base_url="https://dl01.irc.ugent.be/",
version="0.1.0",
# overwriting data path with environment variable
env="FlowSOM",
registry={
"flow/FlowRepository_FR-FCM-ZZPH/Levine_13dim.fcs": None,
"flow/FlowRepository_FR-FCM-ZZPH_zarr/Levine_13dim.zarr": None,
"flow/pixie/fov0.feather": None,
}
# TODO: create registry file with hashes
# registry=None,
)
"""
40 changes: 26 additions & 14 deletions src/FlowSOM/main.py
@@ -41,7 +41,7 @@ def __init__(self, inp, cols_to_use: np.array = None, n_clus=10, max_meta=None,
self.build_SOM(cols_to_use, **kwargs)
self.build_MST()
self.metacluster(n_clus)
self.update_derived_values()
self._update_derived_values()

def read_input(self, inp):
"""Converts input to a FlowSOM AnnData object
@@ -84,7 +84,7 @@ def build_SOM(self, cols_to_use: np.array = None, outlier_MAD=4, **kwargs):
self.mudata["cell_data"].var["cols_used"] = [x in cols_to_use for x in self.mudata["cell_data"].var_names]

self.mudata["cell_data"].uns["n_nodes"] = xdim * ydim
self = self.update_derived_values()
self = self._update_derived_values()
self.mudata["cluster_data"].uns["xdim"] = xdim
self.mudata["cluster_data"].uns["ydim"] = ydim
self.mudata["cluster_data"].obsm["codes"] = np.array(codes)
@@ -166,7 +166,7 @@ def SOM(
seed=seed,
)
if mst != 1:
nhbrdist = self.dist_mst(codes)
nhbrdist = self._dist_mst(codes)

if map:
clusters, dists = map_data_to_codes(data=data, codes=codes)
@@ -175,7 +175,7 @@

return codes, clusters, dists, xdim, ydim

def update_derived_values(self):
def _update_derived_values(self):
"""Update the derived values such as median values and CV values"""
df = self.mudata["cell_data"].X # [self.adata.X[:, 0].argsort()]
df = np.c_[self.mudata["cell_data"].obs["clustering"], df]
@@ -237,7 +237,7 @@ def build_MST(self):
self.mudata["cluster_data"].uns["graph"] = MST_graph
return self

def dist_mst(self, codes):
def _dist_mst(self, codes):
adjacency = cdist(
codes,
codes,
@@ -257,7 +257,6 @@ def metacluster(self, n_clus):
:param n_clus: The number of metaclusters
:type n_clus: int
"""
# metaclusters = (self.hierarchical_clustering(self.mudata["cluster_data"].obsm["codes"], n_clus))
metaclusters = self.consensus_hierarchical_clustering(self.mudata["cluster_data"].obsm["codes"], n_clus)
self.mudata["cell_data"].uns["n_metaclusters"] = n_clus
self.mudata["cluster_data"].obs["metaclustering"] = metaclusters.astype(str)
@@ -266,12 +265,6 @@
)
return self

def hierarchical_clustering(self, data, n_clus, linkage="average"):
average = AgglomerativeClustering(n_clusters=n_clus, linkage=linkage)
metaclustering = average.fit(data)
metaclusters = metaclustering.labels_
return metaclusters

def consensus_hierarchical_clustering(
self, data, n_clus, n_subsamples=100, linkage="average", resample_proportion=0.9
):
@@ -403,7 +396,7 @@ def new_data(self, inp, mad_allowed=4):
clusters, dists = map_data_to_codes(data=data, codes=self.get_cluster_data().obsm["codes"])
fsom_new.get_cell_data().obsm["distance_to_bmu"] = np.array(dists)
fsom_new.get_cell_data().obs["clustering"] = np.array(clusters)
fsom_new = fsom_new.update_derived_values()
fsom_new = fsom_new._update_derived_values()
metaclusters = self.get_cluster_data().obs["metaclustering"]
fsom_new.get_cell_data().obs["metaclustering"] = np.asarray(
[np.array(metaclusters)[int(i)] for i in np.asarray(fsom_new.get_cell_data().obs["clustering"])]
@@ -420,7 +413,7 @@ def subset(self, ids):
"""
fsom_subset = self
fsom_subset.mudata.mod["cell_data"] = fsom_subset.mudata["cell_data"][ids, :]
fsom_subset = fsom_subset.update_derived_values()
fsom_subset = fsom_subset._update_derived_values()
return fsom_subset

def get_cell_data(self):
@@ -848,3 +841,22 @@ def read_FCS(filepath):
f.var.rename(index=index_markers, inplace=True)
f.uns["meta"]["channels"]["$PnS"] = [index_markers[key] for key in f.uns["meta"]["channels"].index]
return f


def normalize_estimate_logicle(adata, channels, m=4.5, q=0.05):
    """Estimate logicle transformation parameters per channel and apply the transformation

    :param adata: An AnnData object with the data to transform
    :param channels: A list of channels to transform
    :param m: The number of decades the transformation spans
    :param q: The quantile of the negative values used to estimate the width parameter w
    """
    assert isinstance(adata, ad.AnnData), "Please provide an AnnData object"
    assert isinstance(channels, list), "Please provide a list of channels"
    channels = list(get_markers(adata, channels).keys())
    assert all(i in adata.var_names for i in channels), "Channels should be in the AnnData object"
    # Reference value r per channel: the q-quantile of the negative events,
    # or 0.5 for channels without negative values
    neg_marker_quantiles = [
        np.quantile(adata[:, channel].X[adata[:, channel].X < 0], q) if (adata[:, channel].X < 0).any() else 0.5
        for channel in channels
    ]
    neg_marker_quantiles = pd.Series(neg_marker_quantiles, index=channels, dtype=float)
    # Estimate the linearization width from the top of scale ($PnR) and r:
    # w = (m - log10(t / |r|)) / 2
    max_range = adata.var["$PnR"][channels].astype(float)
    w = (m - np.log10(max_range / np.abs(neg_marker_quantiles))) / 2
    for channel in channels:
        adata[:, channel].X = pm.tools.normalize_logicle(
            adata[:, channel], t=max_range[channel], m=m, a=0, w=w[channel], inplace=False
        ).X
    return adata
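
For intuition, the width estimate used above can be checked by hand. A minimal sketch with hypothetical numbers (the $PnR value and negative quantile below are made up for illustration):

import numpy as np

# Hypothetical channel: top of scale $PnR = 262144 and a 5% negative-event
# quantile of -111 give a linearization width of roughly 0.56
m = 4.5
t = 262144.0  # channel range from $PnR
r = -111.0    # q-quantile of the negative events
w = (m - np.log10(t / abs(r))) / 2
print(round(w, 2))  # ~0.56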
Binary file added tests/data/not_preprocessed.fcs
Binary file not shown.
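
A minimal usage sketch for the new helper, assuming it is applied to an FCS file such as the test file added in this commit; the marker names passed to channels are purely illustrative:

from FlowSOM.main import read_FCS, normalize_estimate_logicle

ff = read_FCS("tests/data/not_preprocessed.fcs")
# Estimates per-channel logicle parameters from the data and transforms the
# listed channels; m and q match the function defaults
ff = normalize_estimate_logicle(ff, channels=["CD3", "CD19"], m=4.5, q=0.05)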
