From 9e041328a6fd9f0fcf23f6524b5f51943cdb2b0a Mon Sep 17 00:00:00 2001
From: Philipp Weiler <weiler.philipp@gmail.com>
Date: Tue, 13 Jul 2021 21:14:07 +0200
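Subject: [PATCH 01/11] Rename argument `copy` to `inplace`

Note that `inplace` is the logical negation of `copy`: the default changes
from `copy=False` to `inplace=True`, and a copy is now returned only when
`inplace=False`. The deprecation mapping `{"copy": "inplace"}` should be
checked to ensure that a legacy `copy=True` is not forwarded unchanged as
`inplace=True`, which would silently reverse the caller's intent.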

---
 scvelo/core/_anndata.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py
index d77bbb7d..d96edd1e 100644
--- a/scvelo/core/_anndata.py
+++ b/scvelo/core/_anndata.py
@@ -11,16 +11,18 @@
 from scipy.sparse import csr_matrix, issparse, spmatrix
 
 from anndata import AnnData
+from scanpy._utils import deprecated_arg_names
 
 from scvelo import logging as logg
 from ._arithmetic import sum
 
 
+@deprecated_arg_names({"copy": "inplace"})
 def clean_obs_names(
     data: AnnData,
     base: str = "[AGTCBDHKMNRSVWY]",
     ID_length: int = 12,
-    copy: bool = False,
+    inplace: bool = True,
 ) -> Optional[AnnData]:
     """Clean up the obs_names.
 
@@ -37,8 +39,8 @@ def clean_obs_names(
         Genetic code letters to be identified.
     ID_length
         Length of the Genetic Codes in the samples.
-    copy
-        Return a copy instead of writing to adata.
+    inplace
+        Whether to update `adata` inplace or not.
 
     Returns
     -------
@@ -58,7 +60,7 @@ def get_base_list(name, base):
             raise ValueError("Encountered an invalid ID in obs_names: ", name)
         return base_list
 
-    adata = data.copy() if copy else data
+    adata = data.copy() if not inplace else data
 
     names = adata.obs_names
     base_list = get_base_list(names[0], base)
@@ -96,7 +98,9 @@ def get_base_list(name, base):
         )
 
     adata.obs_names_make_unique()
-    return adata if copy else None
+
+    if not inplace:
+        return adata
 
 
 def cleanup(

From d5fe4629729fd216ed9c47cc943d92c582119f0e Mon Sep 17 00:00:00 2001
From: Philipp Weiler <weiler.philipp@gmail.com>
Date: Tue, 13 Jul 2021 21:14:47 +0200
Subject: [PATCH 02/11] Refactor if clause check

Checking whether all observation names have the same length can be done
entirely with pandas functionality, which is slightly cleaner.
---
 scvelo/core/_anndata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py
index d96edd1e..83199c9a 100644
--- a/scvelo/core/_anndata.py
+++ b/scvelo/core/_anndata.py
@@ -65,7 +65,7 @@ def get_base_list(name, base):
     names = adata.obs_names
     base_list = get_base_list(names[0], base)
 
-    if len(np.unique([len(name) for name in adata.obs_names])) == 1:
+    if adata.obs_names.map(len).unique().size == 1:
         start, end = re.search(base_list, names[0]).span()
         newIDs = [name[start:end] for name in names]
         start, end = 0, len(newIDs[0])

From f19f58e3535e04dddb6d56f3ab268d4133b3d443 Mon Sep 17 00:00:00 2001
From: Philipp Weiler <weiler.philipp@gmail.com>
Date: Tue, 13 Jul 2021 21:30:38 +0200
Subject: [PATCH 03/11] Rename variables

Rename
* `ID_length` to `id_length` to consistently use lower-case letters for
variables.
* `newIDs` and `newID` to `new_obs_names` and `new_obs_name` to have
clearer names that consistently use lower-case letters and snake case.
* the loop variable `ID` to `new_obs_name` to use a more informative name.
---
 scvelo/core/_anndata.py | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py
index 83199c9a..53e2c1db 100644
--- a/scvelo/core/_anndata.py
+++ b/scvelo/core/_anndata.py
@@ -17,11 +17,11 @@
 from ._arithmetic import sum
 
 
-@deprecated_arg_names({"copy": "inplace"})
+@deprecated_arg_names({"copy": "inplace", "ID_length": "id_length"})
 def clean_obs_names(
     data: AnnData,
     base: str = "[AGTCBDHKMNRSVWY]",
-    ID_length: int = 12,
+    id_length: int = 12,
     inplace: bool = True,
 ) -> Optional[AnnData]:
     """Clean up the obs_names.
@@ -67,29 +67,31 @@ def get_base_list(name, base):
 
     if adata.obs_names.map(len).unique().size == 1:
         start, end = re.search(base_list, names[0]).span()
-        newIDs = [name[start:end] for name in names]
-        start, end = 0, len(newIDs[0])
-        for i in range(end - ID_length):
-            if np.any([ID[i] not in base for ID in newIDs]):
+        new_obs_names = [name[start:end] for name in names]
+        start, end = 0, len(new_obs_names[0])
+        for i in range(end - id_length):
+            if np.any([new_obs_name[i] not in base for new_obs_name in new_obs_names]):
                 start += 1
-            if np.any([ID[::-1][i] not in base for ID in newIDs]):
+            if np.any(
+                [new_obs_name[::-1][i] not in base for new_obs_name in new_obs_names]
+            ):
                 end -= 1
 
-        newIDs = [ID[start:end] for ID in newIDs]
-        prefixes = [names[i].replace(newIDs[i], "") for i in range(len(names))]
+        new_obs_names = [new_obs_name[start:end] for new_obs_name in new_obs_names]
+        prefixes = [names[i].replace(new_obs_names[i], "") for i in range(len(names))]
     else:
-        prefixes, newIDs = [], []
+        prefixes, new_obs_names = [], []
         for name in names:
             match = re.search(base_list, name)
-            newID = (
+            new_obs_name = (
                 re.search(get_base_list(name, base), name).group()
                 if match is None
                 else match.group()
             )
-            newIDs.append(newID)
-            prefixes.append(name.replace(newID, ""))
+            new_obs_names.append(new_obs_name)
+            prefixes.append(name.replace(new_obs_name, ""))
 
-    adata.obs_names = newIDs
+    adata.obs_names = new_obs_names
     if len(prefixes[0]) > 0 and len(np.unique(prefixes)) > 1:
         adata.obs["sample_batch"] = (
             pd.Categorical(prefixes)

From 1b79dd41959f7ec7f16ffc6182921f55d5ab6bcf Mon Sep 17 00:00:00 2001
From: Philipp Weiler <weiler.philipp@gmail.com>
Date: Tue, 13 Jul 2021 21:31:37 +0200
Subject: [PATCH 04/11] Remove variable `names`

Refactor the code so that it no longer relies on the variable `names` and
uses `adata.obs_names` directly instead.
---
 scvelo/core/_anndata.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py
index 53e2c1db..bbca5cfc 100644
--- a/scvelo/core/_anndata.py
+++ b/scvelo/core/_anndata.py
@@ -62,12 +62,11 @@ def get_base_list(name, base):
 
     adata = data.copy() if not inplace else data
 
-    names = adata.obs_names
-    base_list = get_base_list(names[0], base)
+    base_list = get_base_list(adata.obs_names[0], base)
 
     if adata.obs_names.map(len).unique().size == 1:
-        start, end = re.search(base_list, names[0]).span()
-        new_obs_names = [name[start:end] for name in names]
+        start, end = re.search(base_list, adata.obs_names[0]).span()
+        new_obs_names = [obs_name[start:end] for obs_name in adata.obs_names]
         start, end = 0, len(new_obs_names[0])
         for i in range(end - id_length):
             if np.any([new_obs_name[i] not in base for new_obs_name in new_obs_names]):
@@ -78,10 +77,13 @@ def get_base_list(name, base):
                 end -= 1
 
         new_obs_names = [new_obs_name[start:end] for new_obs_name in new_obs_names]
-        prefixes = [names[i].replace(new_obs_names[i], "") for i in range(len(names))]
+        prefixes = [
+            obs_name.replace(new_obs_names[obs_id], "")
+            for obs_id, obs_name in enumerate(adata.obs_names)
+        ]
     else:
         prefixes, new_obs_names = [], []
-        for name in names:
+        for name in adata.obs_names:
             match = re.search(base_list, name)
             new_obs_name = (
                 re.search(get_base_list(name, base), name).group()

From bdc706b9d8e9d213363643cb73a6c3df86f67525 Mon Sep 17 00:00:00 2001
From: Philipp Weiler <weiler.philipp@gmail.com>
Date: Tue, 13 Jul 2021 21:33:26 +0200
Subject: [PATCH 05/11] Rename argument `base` to `alphabet`

---
 scvelo/core/_anndata.py | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py
index bbca5cfc..3505031f 100644
--- a/scvelo/core/_anndata.py
+++ b/scvelo/core/_anndata.py
@@ -17,10 +17,10 @@
 from ._arithmetic import sum
 
 
-@deprecated_arg_names({"copy": "inplace", "ID_length": "id_length"})
+@deprecated_arg_names({"copy": "inplace", "ID_length": "id_length", "base": "alphabet"})
 def clean_obs_names(
     data: AnnData,
-    base: str = "[AGTCBDHKMNRSVWY]",
+    alphabet: str = "[AGTCBDHKMNRSVWY]",
     id_length: int = 12,
     inplace: bool = True,
 ) -> Optional[AnnData]:
@@ -35,7 +35,7 @@ def clean_obs_names(
     ---------
     data
         Annotated data matrix.
-    base
+    alphabet
         Genetic code letters to be identified.
     ID_length
         Length of the Genetic Codes in the samples.
@@ -52,27 +52,32 @@ def clean_obs_names(
             names of the identified sample batches
     """
 
-    def get_base_list(name, base):
-        base_list = base
-        while re.search(base_list + base, name) is not None:
-            base_list += base
+    def get_base_list(name, alphabet):
+        base_list = alphabet
+        while re.search(base_list + alphabet, name) is not None:
+            base_list += alphabet
         if len(base_list) == 0:
             raise ValueError("Encountered an invalid ID in obs_names: ", name)
         return base_list
 
     adata = data.copy() if not inplace else data
 
-    base_list = get_base_list(adata.obs_names[0], base)
+    base_list = get_base_list(adata.obs_names[0], alphabet)
 
     if adata.obs_names.map(len).unique().size == 1:
         start, end = re.search(base_list, adata.obs_names[0]).span()
         new_obs_names = [obs_name[start:end] for obs_name in adata.obs_names]
         start, end = 0, len(new_obs_names[0])
         for i in range(end - id_length):
-            if np.any([new_obs_name[i] not in base for new_obs_name in new_obs_names]):
+            if np.any(
+                [new_obs_name[i] not in alphabet for new_obs_name in new_obs_names]
+            ):
                 start += 1
             if np.any(
-                [new_obs_name[::-1][i] not in base for new_obs_name in new_obs_names]
+                [
+                    new_obs_name[::-1][i] not in alphabet
+                    for new_obs_name in new_obs_names
+                ]
             ):
                 end -= 1
 
@@ -86,7 +91,7 @@ def get_base_list(name, base):
         for name in adata.obs_names:
             match = re.search(base_list, name)
             new_obs_name = (
-                re.search(get_base_list(name, base), name).group()
+                re.search(get_base_list(name, alphabet), name).group()
                 if match is None
                 else match.group()
             )

From c45bb61e2ec262cc268c0ddf378e6654b39137f2 Mon Sep 17 00:00:00 2001
From: Philipp Weiler <weiler.philipp@gmail.com>
Date: Tue, 13 Jul 2021 21:40:31 +0200
Subject: [PATCH 06/11] Rename argument `data` to `adata`

The argument `data` needs to be an `AnnData` object. The argument name
should convey this.
---
 scvelo/core/_anndata.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py
index 3505031f..25d3924a 100644
--- a/scvelo/core/_anndata.py
+++ b/scvelo/core/_anndata.py
@@ -17,9 +17,11 @@
 from ._arithmetic import sum
 
 
-@deprecated_arg_names({"copy": "inplace", "ID_length": "id_length", "base": "alphabet"})
+@deprecated_arg_names(
+    {"data": "adata", "copy": "inplace", "ID_length": "id_length", "base": "alphabet"}
+)
 def clean_obs_names(
-    data: AnnData,
+    adata: AnnData,
     alphabet: str = "[AGTCBDHKMNRSVWY]",
     id_length: int = 12,
     inplace: bool = True,
@@ -33,7 +35,7 @@ def clean_obs_names(
 
     Arguments
     ---------
-    data
+    adata
         Annotated data matrix.
     alphabet
         Genetic code letters to be identified.
@@ -60,7 +62,8 @@ def get_base_list(name, alphabet):
             raise ValueError("Encountered an invalid ID in obs_names: ", name)
         return base_list
 
-    adata = data.copy() if not inplace else data
+    if not inplace:
+        adata = adata.copy()
 
     base_list = get_base_list(adata.obs_names[0], alphabet)
 

From a4f6e35b33e86fa164e68d35d4556f2bcf4c6e0f Mon Sep 17 00:00:00 2001
From: Philipp Weiler <weiler.philipp@gmail.com>
Date: Tue, 13 Jul 2021 21:44:59 +0200
Subject: [PATCH 07/11] Refactor calculation of start and end position

Use a regular expression instead of relying on the custom `get_base_list` function.
---
 scvelo/core/_anndata.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py
index 25d3924a..0b9f30a1 100644
--- a/scvelo/core/_anndata.py
+++ b/scvelo/core/_anndata.py
@@ -68,7 +68,8 @@ def get_base_list(name, alphabet):
     base_list = get_base_list(adata.obs_names[0], alphabet)
 
     if adata.obs_names.map(len).unique().size == 1:
-        start, end = re.search(base_list, adata.obs_names[0]).span()
+        start = re.search(alphabet, adata.obs_names[0]).start()
+        end = start + re.search(f"{alphabet}*", adata.obs_names[0][start:]).end()
         new_obs_names = [obs_name[start:end] for obs_name in adata.obs_names]
         start, end = 0, len(new_obs_names[0])
         for i in range(end - id_length):

From c3d1b253784998c3e759c7559cd7fb13dc472667 Mon Sep 17 00:00:00 2001
From: Philipp Weiler <weiler.philipp@gmail.com>
Date: Tue, 13 Jul 2021 21:48:41 +0200
Subject: [PATCH 08/11] Refactor else clause

* Use a regular expression to find the start and end positions.
* Apply it to each observation name using `map` on the pandas index.
---
 scvelo/core/_anndata.py | 27 +++++++--------------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py
index 0b9f30a1..71e7fc61 100644
--- a/scvelo/core/_anndata.py
+++ b/scvelo/core/_anndata.py
@@ -54,19 +54,9 @@ def clean_obs_names(
             names of the identified sample batches
     """
 
-    def get_base_list(name, alphabet):
-        base_list = alphabet
-        while re.search(base_list + alphabet, name) is not None:
-            base_list += alphabet
-        if len(base_list) == 0:
-            raise ValueError("Encountered an invalid ID in obs_names: ", name)
-        return base_list
-
     if not inplace:
         adata = adata.copy()
 
-    base_list = get_base_list(adata.obs_names[0], alphabet)
-
     if adata.obs_names.map(len).unique().size == 1:
         start = re.search(alphabet, adata.obs_names[0]).start()
         end = start + re.search(f"{alphabet}*", adata.obs_names[0][start:]).end()
@@ -91,16 +81,13 @@ def get_base_list(name, alphabet):
             for obs_id, obs_name in enumerate(adata.obs_names)
         ]
     else:
-        prefixes, new_obs_names = [], []
-        for name in adata.obs_names:
-            match = re.search(base_list, name)
-            new_obs_name = (
-                re.search(get_base_list(name, alphabet), name).group()
-                if match is None
-                else match.group()
-            )
-            new_obs_names.append(new_obs_name)
-            prefixes.append(name.replace(new_obs_name, ""))
+
+        def rename_obs(obs_name):
+            start = re.search(alphabet, obs_name).start()
+            new_obs_name = re.search(f"{alphabet}*", obs_name[start:]).group()
+            return new_obs_name, obs_name.replace(new_obs_name, "")
+
+        new_obs_names, prefixes = zip(*adata.obs_names.map(rename_obs))
 
     adata.obs_names = new_obs_names
     if len(prefixes[0]) > 0 and len(np.unique(prefixes)) > 1:

From 7880b117ce049cda8860f9a39292188dc482d336 Mon Sep 17 00:00:00 2001
From: Philipp Weiler <weiler.philipp@gmail.com>
Date: Tue, 13 Jul 2021 21:51:27 +0200
Subject: [PATCH 09/11] Update docs

Update docs to match new argument names.
---
 scvelo/core/_anndata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py
index 71e7fc61..09f7a3a9 100644
--- a/scvelo/core/_anndata.py
+++ b/scvelo/core/_anndata.py
@@ -39,7 +39,7 @@ def clean_obs_names(
         Annotated data matrix.
     alphabet
         Genetic code letters to be identified.
-    ID_length
+    id_length
         Length of the Genetic Codes in the samples.
     inplace
         Whether to update `adata` inplace or not.

From b4bc19a60556e99b8e71e3b8dee8f0d9e53cc1c0 Mon Sep 17 00:00:00 2001
From: Philipp Weiler <weiler.philipp@gmail.com>
Date: Wed, 14 Jul 2021 08:07:51 +0200
Subject: [PATCH 10/11] Reorder code

Move `adata.obs_names_make_unique()` up to group similar parts of the
code together.
---
 scvelo/core/_anndata.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py
index 09f7a3a9..bc595ba8 100644
--- a/scvelo/core/_anndata.py
+++ b/scvelo/core/_anndata.py
@@ -90,6 +90,8 @@ def rename_obs(obs_name):
         new_obs_names, prefixes = zip(*adata.obs_names.map(rename_obs))
 
     adata.obs_names = new_obs_names
+    adata.obs_names_make_unique()
+
     if len(prefixes[0]) > 0 and len(np.unique(prefixes)) > 1:
         adata.obs["sample_batch"] = (
             pd.Categorical(prefixes)
@@ -97,8 +99,6 @@ def rename_obs(obs_name):
             else prefixes
         )
 
-    adata.obs_names_make_unique()
-
     if not inplace:
         return adata
 

From d903fdb9bdb66a3e094f0a4a2f0e181dc7dac220 Mon Sep 17 00:00:00 2001
From: Philipp Weiler <weiler.philipp@gmail.com>
Date: Wed, 14 Jul 2021 08:11:57 +0200
Subject: [PATCH 11/11] Refactor prefix definition

Refactor the definition of prefixes when observation names have the same length.
---
 scvelo/core/_anndata.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scvelo/core/_anndata.py b/scvelo/core/_anndata.py
index bc595ba8..e0a6d15c 100644
--- a/scvelo/core/_anndata.py
+++ b/scvelo/core/_anndata.py
@@ -77,8 +77,8 @@ def clean_obs_names(
 
         new_obs_names = [new_obs_name[start:end] for new_obs_name in new_obs_names]
         prefixes = [
-            obs_name.replace(new_obs_names[obs_id], "")
-            for obs_id, obs_name in enumerate(adata.obs_names)
+            obs_name.replace(new_obs_name, "")
+            for obs_name, new_obs_name in zip(adata.obs_names, new_obs_names)
         ]
     else: