From 9e041328a6fd9f0fcf23f6524b5f51943cdb2b0a Mon Sep 17 00:00:00 2001 From: Philipp Weiler Date: Tue, 13 Jul 2021 21:14:07 +0200 Subject: [PATCH 01/11] Rename argument `copy` to `inplace` --- scvelo/core/_anndata.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py index d77bbb7d..d96edd1e 100644 --- a/scvelo/core/_anndata.py +++ b/scvelo/core/_anndata.py @@ -11,16 +11,18 @@ from scipy.sparse import csr_matrix, issparse, spmatrix from anndata import AnnData +from scanpy._utils import deprecated_arg_names from scvelo import logging as logg from ._arithmetic import sum +@deprecated_arg_names({"copy": "inplace"}) def clean_obs_names( data: AnnData, base: str = "[AGTCBDHKMNRSVWY]", ID_length: int = 12, - copy: bool = False, + inplace: bool = True, ) -> Optional[AnnData]: """Clean up the obs_names. @@ -37,8 +39,8 @@ def clean_obs_names( Genetic code letters to be identified. ID_length Length of the Genetic Codes in the samples. - copy - Return a copy instead of writing to adata. + inplace + Whether to update `adata` inplace or not. Returns ------- @@ -58,7 +60,7 @@ def get_base_list(name, base): raise ValueError("Encountered an invalid ID in obs_names: ", name) return base_list - adata = data.copy() if copy else data + adata = data.copy() if not inplace else data names = adata.obs_names base_list = get_base_list(names[0], base) @@ -96,7 +98,9 @@ def get_base_list(name, base): ) adata.obs_names_make_unique() - return adata if copy else None + + if not inplace: + return adata def cleanup( From d5fe4629729fd216ed9c47cc943d92c582119f0e Mon Sep 17 00:00:00 2001 From: Philipp Weiler Date: Tue, 13 Jul 2021 21:14:47 +0200 Subject: [PATCH 02/11] Refactor if clause check Checking if all observation names are of equal length can entirely be done using Pandas functionality. This looks slightly cleaner. 
--- scvelo/core/_anndata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py index d96edd1e..83199c9a 100644 --- a/scvelo/core/_anndata.py +++ b/scvelo/core/_anndata.py @@ -65,7 +65,7 @@ def get_base_list(name, base): names = adata.obs_names base_list = get_base_list(names[0], base) - if len(np.unique([len(name) for name in adata.obs_names])) == 1: + if adata.obs_names.map(len).unique().size == 1: start, end = re.search(base_list, names[0]).span() newIDs = [name[start:end] for name in names] start, end = 0, len(newIDs[0]) From f19f58e3535e04dddb6d56f3ab268d4133b3d443 Mon Sep 17 00:00:00 2001 From: Philipp Weiler Date: Tue, 13 Jul 2021 21:30:38 +0200 Subject: [PATCH 03/11] Rename variables Rename * `ID_length` to `id_length` to consistently use lower case letters for variables. * `newIDs` and `newID` to `new_obs_names` and `new_obs_name` to have clearer names and consistently use lower case letters and snake case for variable names. * `id` to `new_obs_name` to use more informative name. --- scvelo/core/_anndata.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py index 83199c9a..53e2c1db 100644 --- a/scvelo/core/_anndata.py +++ b/scvelo/core/_anndata.py @@ -17,11 +17,11 @@ from ._arithmetic import sum -@deprecated_arg_names({"copy": "inplace"}) +@deprecated_arg_names({"copy": "inplace", "ID_length": "id_length"}) def clean_obs_names( data: AnnData, base: str = "[AGTCBDHKMNRSVWY]", - ID_length: int = 12, + id_length: int = 12, inplace: bool = True, ) -> Optional[AnnData]: """Clean up the obs_names. 
@@ -67,29 +67,31 @@ def get_base_list(name, base): if adata.obs_names.map(len).unique().size == 1: start, end = re.search(base_list, names[0]).span() - newIDs = [name[start:end] for name in names] - start, end = 0, len(newIDs[0]) - for i in range(end - ID_length): - if np.any([ID[i] not in base for ID in newIDs]): + new_obs_names = [name[start:end] for name in names] + start, end = 0, len(new_obs_names[0]) + for i in range(end - id_length): + if np.any([new_obs_name[i] not in base for new_obs_name in new_obs_names]): start += 1 - if np.any([ID[::-1][i] not in base for ID in newIDs]): + if np.any( + [new_obs_name[::-1][i] not in base for new_obs_name in new_obs_names] + ): end -= 1 - newIDs = [ID[start:end] for ID in newIDs] - prefixes = [names[i].replace(newIDs[i], "") for i in range(len(names))] + new_obs_names = [new_obs_name[start:end] for new_obs_name in new_obs_names] + prefixes = [names[i].replace(new_obs_names[i], "") for i in range(len(names))] else: - prefixes, newIDs = [], [] + prefixes, new_obs_names = [], [] for name in names: match = re.search(base_list, name) - newID = ( + new_obs_name = ( re.search(get_base_list(name, base), name).group() if match is None else match.group() ) - newIDs.append(newID) - prefixes.append(name.replace(newID, "")) + new_obs_names.append(new_obs_name) + prefixes.append(name.replace(new_obs_name, "")) - adata.obs_names = newIDs + adata.obs_names = new_obs_names if len(prefixes[0]) > 0 and len(np.unique(prefixes)) > 1: adata.obs["sample_batch"] = ( pd.Categorical(prefixes) From 1b79dd41959f7ec7f16ffc6182921f55d5ab6bcf Mon Sep 17 00:00:00 2001 From: Philipp Weiler Date: Tue, 13 Jul 2021 21:31:37 +0200 Subject: [PATCH 04/11] Remove variable `names` Refactor code to not rely on variable `names` but use adata.obs_names directly. 
--- scvelo/core/_anndata.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py index 53e2c1db..bbca5cfc 100644 --- a/scvelo/core/_anndata.py +++ b/scvelo/core/_anndata.py @@ -62,12 +62,11 @@ def get_base_list(name, base): adata = data.copy() if not inplace else data - names = adata.obs_names - base_list = get_base_list(names[0], base) + base_list = get_base_list(adata.obs_names[0], base) if adata.obs_names.map(len).unique().size == 1: - start, end = re.search(base_list, names[0]).span() - new_obs_names = [name[start:end] for name in names] + start, end = re.search(base_list, adata.obs_names[0]).span() + new_obs_names = [obs_name[start:end] for obs_name in adata.obs_names] start, end = 0, len(new_obs_names[0]) for i in range(end - id_length): if np.any([new_obs_name[i] not in base for new_obs_name in new_obs_names]): @@ -78,10 +77,13 @@ def get_base_list(name, base): end -= 1 new_obs_names = [new_obs_name[start:end] for new_obs_name in new_obs_names] - prefixes = [names[i].replace(new_obs_names[i], "") for i in range(len(names))] + prefixes = [ + obs_name.replace(new_obs_names[obs_id], "") + for obs_id, obs_name in enumerate(adata.obs_names) + ] else: prefixes, new_obs_names = [], [] - for name in names: + for name in adata.obs_names: match = re.search(base_list, name) new_obs_name = ( re.search(get_base_list(name, base), name).group() From bdc706b9d8e9d213363643cb73a6c3df86f67525 Mon Sep 17 00:00:00 2001 From: Philipp Weiler Date: Tue, 13 Jul 2021 21:33:26 +0200 Subject: [PATCH 05/11] Rename argument `base` to `alphabet` --- scvelo/core/_anndata.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py index bbca5cfc..3505031f 100644 --- a/scvelo/core/_anndata.py +++ b/scvelo/core/_anndata.py @@ -17,10 +17,10 @@ from ._arithmetic import sum -@deprecated_arg_names({"copy": "inplace", "ID_length": 
"id_length"}) +@deprecated_arg_names({"copy": "inplace", "ID_length": "id_length", "base": "alphabet"}) def clean_obs_names( data: AnnData, - base: str = "[AGTCBDHKMNRSVWY]", + alphabet: str = "[AGTCBDHKMNRSVWY]", id_length: int = 12, inplace: bool = True, ) -> Optional[AnnData]: @@ -35,7 +35,7 @@ def clean_obs_names( --------- data Annotated data matrix. - base + alphabet Genetic code letters to be identified. ID_length Length of the Genetic Codes in the samples. @@ -52,27 +52,32 @@ def clean_obs_names( names of the identified sample batches """ - def get_base_list(name, base): - base_list = base - while re.search(base_list + base, name) is not None: - base_list += base + def get_base_list(name, alphabet): + base_list = alphabet + while re.search(base_list + alphabet, name) is not None: + base_list += alphabet if len(base_list) == 0: raise ValueError("Encountered an invalid ID in obs_names: ", name) return base_list adata = data.copy() if not inplace else data - base_list = get_base_list(adata.obs_names[0], base) + base_list = get_base_list(adata.obs_names[0], alphabet) if adata.obs_names.map(len).unique().size == 1: start, end = re.search(base_list, adata.obs_names[0]).span() new_obs_names = [obs_name[start:end] for obs_name in adata.obs_names] start, end = 0, len(new_obs_names[0]) for i in range(end - id_length): - if np.any([new_obs_name[i] not in base for new_obs_name in new_obs_names]): + if np.any( + [new_obs_name[i] not in alphabet for new_obs_name in new_obs_names] + ): start += 1 if np.any( - [new_obs_name[::-1][i] not in base for new_obs_name in new_obs_names] + [ + new_obs_name[::-1][i] not in alphabet + for new_obs_name in new_obs_names + ] ): end -= 1 @@ -86,7 +91,7 @@ def get_base_list(name, base): for name in adata.obs_names: match = re.search(base_list, name) new_obs_name = ( - re.search(get_base_list(name, base), name).group() + re.search(get_base_list(name, alphabet), name).group() if match is None else match.group() ) From 
c45bb61e2ec262cc268c0ddf378e6654b39137f2 Mon Sep 17 00:00:00 2001 From: Philipp Weiler Date: Tue, 13 Jul 2021 21:40:31 +0200 Subject: [PATCH 06/11] Rename argument `data` to `adata` The argument `data` needs to be an `AnnData` object. The argument name should convey this. --- scvelo/core/_anndata.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py index 3505031f..25d3924a 100644 --- a/scvelo/core/_anndata.py +++ b/scvelo/core/_anndata.py @@ -17,9 +17,11 @@ from ._arithmetic import sum -@deprecated_arg_names({"copy": "inplace", "ID_length": "id_length", "base": "alphabet"}) +@deprecated_arg_names( + {"data": "adata", "copy": "inplace", "ID_length": "id_length", "base": "alphabet"} +) def clean_obs_names( - data: AnnData, + adata: AnnData, alphabet: str = "[AGTCBDHKMNRSVWY]", id_length: int = 12, inplace: bool = True, @@ -33,7 +35,7 @@ def clean_obs_names( Arguments --------- - data + adata Annotated data matrix. alphabet Genetic code letters to be identified. @@ -60,7 +62,8 @@ def get_base_list(name, alphabet): raise ValueError("Encountered an invalid ID in obs_names: ", name) return base_list - adata = data.copy() if not inplace else data + if not inplace: + adata = adata.copy() base_list = get_base_list(adata.obs_names[0], alphabet) From a4f6e35b33e86fa164e68d35d4556f2bcf4c6e0f Mon Sep 17 00:00:00 2001 From: Philipp Weiler Date: Tue, 13 Jul 2021 21:44:59 +0200 Subject: [PATCH 07/11] Refactor calculation of start and end position Make use of a regular expression instead of relying on a custom function. 
--- scvelo/core/_anndata.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py index 25d3924a..0b9f30a1 100644 --- a/scvelo/core/_anndata.py +++ b/scvelo/core/_anndata.py @@ -68,7 +68,8 @@ def get_base_list(name, alphabet): base_list = get_base_list(adata.obs_names[0], alphabet) if adata.obs_names.map(len).unique().size == 1: - start, end = re.search(base_list, adata.obs_names[0]).span() + start = re.search(alphabet, adata.obs_names[0]).start() + end = start + re.search(f"{alphabet}*", adata.obs_names[0][start:]).end() new_obs_names = [obs_name[start:end] for obs_name in adata.obs_names] start, end = 0, len(new_obs_names[0]) for i in range(end - id_length): From c3d1b253784998c3e759c7559cd7fb13dc472667 Mon Sep 17 00:00:00 2001 From: Philipp Weiler Date: Tue, 13 Jul 2021 21:48:41 +0200 Subject: [PATCH 08/11] Refactor else clause * Use a regular expression to find start and end positions. * Apply to each observation using `map` on the pandas index. 
--- scvelo/core/_anndata.py | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py index 0b9f30a1..71e7fc61 100644 --- a/scvelo/core/_anndata.py +++ b/scvelo/core/_anndata.py @@ -54,19 +54,9 @@ def clean_obs_names( names of the identified sample batches """ - def get_base_list(name, alphabet): - base_list = alphabet - while re.search(base_list + alphabet, name) is not None: - base_list += alphabet - if len(base_list) == 0: - raise ValueError("Encountered an invalid ID in obs_names: ", name) - return base_list - if not inplace: adata = adata.copy() - base_list = get_base_list(adata.obs_names[0], alphabet) - if adata.obs_names.map(len).unique().size == 1: start = re.search(alphabet, adata.obs_names[0]).start() end = start + re.search(f"{alphabet}*", adata.obs_names[0][start:]).end() @@ -91,16 +81,13 @@ def get_base_list(name, alphabet): for obs_id, obs_name in enumerate(adata.obs_names) ] else: - prefixes, new_obs_names = [], [] - for name in adata.obs_names: - match = re.search(base_list, name) - new_obs_name = ( - re.search(get_base_list(name, alphabet), name).group() - if match is None - else match.group() - ) - new_obs_names.append(new_obs_name) - prefixes.append(name.replace(new_obs_name, "")) + + def rename_obs(obs_name): + start = re.search(alphabet, obs_name).start() + new_obs_name = re.search(f"{alphabet}*", obs_name[start:]).group() + return new_obs_name, obs_name.replace(new_obs_name, "") + + new_obs_names, prefixes = zip(*adata.obs_names.map(rename_obs)) adata.obs_names = new_obs_names if len(prefixes[0]) > 0 and len(np.unique(prefixes)) > 1: From 7880b117ce049cda8860f9a39292188dc482d336 Mon Sep 17 00:00:00 2001 From: Philipp Weiler Date: Tue, 13 Jul 2021 21:51:27 +0200 Subject: [PATCH 09/11] Update docs Update docs to match new argument names. 
--- scvelo/core/_anndata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py index 71e7fc61..09f7a3a9 100644 --- a/scvelo/core/_anndata.py +++ b/scvelo/core/_anndata.py @@ -39,7 +39,7 @@ def clean_obs_names( Annotated data matrix. alphabet Genetic code letters to be identified. - ID_length + id_length Length of the Genetic Codes in the samples. inplace Whether to update `adata` inplace or not. From b4bc19a60556e99b8e71e3b8dee8f0d9e53cc1c0 Mon Sep 17 00:00:00 2001 From: Philipp Weiler Date: Wed, 14 Jul 2021 08:07:51 +0200 Subject: [PATCH 10/11] Reorder code Move `adata.obs_names_make_unique()` up to group similar parts of the code together. --- scvelo/core/_anndata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py index 09f7a3a9..bc595ba8 100644 --- a/scvelo/core/_anndata.py +++ b/scvelo/core/_anndata.py @@ -90,6 +90,8 @@ def rename_obs(obs_name): new_obs_names, prefixes = zip(*adata.obs_names.map(rename_obs)) adata.obs_names = new_obs_names + adata.obs_names_make_unique() + if len(prefixes[0]) > 0 and len(np.unique(prefixes)) > 1: adata.obs["sample_batch"] = ( pd.Categorical(prefixes) @@ -97,8 +99,6 @@ def rename_obs(obs_name): else prefixes ) - adata.obs_names_make_unique() - if not inplace: return adata From d903fdb9bdb66a3e094f0a4a2f0e181dc7dac220 Mon Sep 17 00:00:00 2001 From: Philipp Weiler Date: Wed, 14 Jul 2021 08:11:57 +0200 Subject: [PATCH 11/11] Refactor prefix definition Refactor definition of prefixes when observation names have same length. 
--- scvelo/core/_anndata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py index bc595ba8..e0a6d15c 100644 --- a/scvelo/core/_anndata.py +++ b/scvelo/core/_anndata.py @@ -77,8 +77,8 @@ def clean_obs_names( new_obs_names = [new_obs_name[start:end] for new_obs_name in new_obs_names] prefixes = [ - obs_name.replace(new_obs_names[obs_id], "") - for obs_id, obs_name in enumerate(adata.obs_names) + obs_name.replace(new_obs_name, "") + for obs_name, new_obs_name in zip(adata.obs_names, new_obs_names) ] else: