From 6b4bdad3a7761904168702ba9b8c0c395b3afa45 Mon Sep 17 00:00:00 2001
From: sdewin <seppe.dewinter@hotmail.com>
Date: Mon, 20 Feb 2023 10:25:38 +0100
Subject: [PATCH] Add function to calculate AUC-based correlation

---
 src/scenicplus/cistromes.py | 87 +++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/src/scenicplus/cistromes.py b/src/scenicplus/cistromes.py
index 0dca208..e3312f3 100644
--- a/src/scenicplus/cistromes.py
+++ b/src/scenicplus/cistromes.py
@@ -297,6 +297,93 @@ def TF_cistrome_correlation(scplus_obj: SCENICPLUS,
     scplus_obj.uns['TF_cistrome_correlation'][out_key] = corr_df
 
 
+def eregulon_correlation(scplus_obj: SCENICPLUS,
+                         auc_key: str = 'eRegulon_AUC',
+                         signature_key1: str = 'Gene_based',
+                         signature_key2: str = 'Region_based',
+                         nSignif: int = 3,
+                         out_key: str = 'Unfiltered',
+                         subset_cellids: List[str] = None
+                        ):
+        """
+        Get correlation between gene-based and region-based eRegulon AUC
+
+        Parameters
+        ---------
+        scplus_obj: :class:`SCENICPLUS`
+            A :class:`SCENICPLUS` object
+        auc_key: str, optional
+            Name of the AUC matrix used to calculate the correlation, normally 'eRegulon_AUC'. 
+            Must be a key in `scplus_obj.uns.keys()`.
+        signature_key1: str, optional
+            Variable used to calculate the correlation, normally 'Gene_based'. 
+            Must be a key in `scplus_obj.uns[auc_key].keys()`.
+        signature_key2: str, optional
+            Variable used to calculate the correlation, normally 'Region_based'. 
+            Must be a key in `scplus_obj.uns[auc_key].keys()`.
+        nSignif: str, optional
+            Number of digits to save.
+        out_key : str, optional
+            Ouput key. Correlations will be stored at `scplus_obj.uns['eRegulon_correlation'][out_key]`.
+        subset_cellids: List, optional
+            Subset of cells to be used to calculate correlations. Default: None (All)
+        """
+
+        gene_auc = scplus_obj.uns[auc_key][signature_key1].copy().T
+        region_auc = scplus_obj.uns[auc_key][signature_key2].copy().T
+
+        if subset_cellids is not None:
+            cell_data = pd.DataFrame([x.rsplit('_', 1)[0] for x in gene_auc.columns],
+                                     index=gene_auc.columns).iloc[:, 0]
+            subset_cells = cell_data[cell_data.isin(subset_cellids)].index.tolist()
+            gene_auc = gene_auc.loc[:, subset_cells]
+            region_auc = region_auc.loc[:, subset_cells]
+
+        # cistrome naming includes number of genes/regions, so need to create matching names
+        # x.rsplit('_', 1) splits at first _ from the right
+        gene_auc['id_short'] = gene_auc.index.map(lambda x: x.rsplit('_', 1)[0])
+        gene_auc['id_full'] = gene_auc.index
+        gene_auc = gene_auc.set_index('id_short')
+
+        region_auc['id_short'] = region_auc.index.map(lambda x: x.rsplit('_', 1)[0])
+        region_auc['id_full'] = region_auc.index
+        region_auc = region_auc.set_index('id_short')
+
+        corr_df = pd.DataFrame(columns=['id', signature_key1, signature_key2, 'Rho', 'P-value'])
+
+        for tf in gene_auc.index:
+            # All TFs should match, but just in case
+            if tf in region_auc.index:
+                # record orginal cistrome name for results
+                signature1_id = gene_auc.loc[tf, 'id_full']
+                signature2_id = region_auc.loc[tf, 'id_full']
+                # Exception in case TF is only expressed in 1 cell
+                # TFs expressed in few cells could be filtered too
+                try:
+                    corr_1, _1 = pearsonr(gene_auc.loc[tf, gene_auc.columns != 'id_full'],
+                                          region_auc.loc[tf, gene_auc.columns != 'id_full'])
+                    x = {'id': tf,
+                         signature_key1: signature1_id,
+                         signature_key2: signature2_id,
+                         'Rho': round(corr_1,nSignif),
+                         'P-value': _1}
+                    corr_df = pd.concat([corr_df,
+                                         pd.DataFrame(data=x, index=[0])],
+                                        ignore_index=True)
+                except:
+                    continue
+        corr_df = corr_df.dropna()
+        corr_df['Adjusted_pValue'] = p_adjust_bh(corr_df['P-value'])
+        corr_df['Abs_rho'] = abs(corr_df['Rho'])
+        corr_df.sort_values('Abs_rho', ascending=False, inplace=True)
+
+        if not 'eRegulon_correlation' in scplus_obj.uns.keys():
+            scplus_obj.uns['eRegulon_correlation'] = {}
+        if not out_key in scplus_obj.uns['eRegulon_correlation'].keys():
+            scplus_obj.uns['eRegulon_correlation'][out_key] = {}
+        scplus_obj.uns['eRegulon_correlation'][out_key] = corr_df
+
+
 def prune_plot(scplus_obj: SCENICPLUS,
                name: str,
                pseudobulk_variable: str = None,