Merge branch 'feature/text-normalization'

sharkutilities · Aug 18, 2024 · 2158f46 · 2158f46
2 parents 626ee16 + d4e7c00
commit 2158f46
Show file tree

Hide file tree

Showing 4 changed files with 168 additions and 15 deletions.
diff --git a/docs/index.md b/docs/index.md
@@ -2,13 +2,21 @@
 
 <div align = "center">
 
+[![Documentation Status](https://readthedocs.org/projects/nlpurify/badge/?version=latest&style=plastic)](https://nlpurify.readthedocs.io/en/latest/?badge=latest)
 [![GitHub Issues](https://img.shields.io/github/issues/sharkutilities/NLPurify?style=plastic)](https://github.com/sharkutilities/NLPurify/issues)
 [![GitHub Forks](https://img.shields.io/github/forks/sharkutilities/NLPurify?style=plastic)](https://github.com/sharkutilities/NLPurify/network)
 [![GitHub Stars](https://img.shields.io/github/stars/sharkutilities/NLPurify?style=plastic)](https://github.com/sharkutilities/NLPurify/stargazers)
 [![LICENSE File](https://img.shields.io/github/license/sharkutilities/NLPurify?style=plastic)](https://github.com/sharkutilities/NLPurify/blob/master/LICENSE)
+[![PyPI - Downloads](https://img.shields.io/pypi/dm/NLPurify?style=plastic)](https://pypistats.org/packages/nlpurify)
+[![PyPI Latest Release](https://img.shields.io/pypi/v/NLPurify.svg?style=plastic)](https://pypi.org/project/NLPurify/)
 
 </div>
 
+```{toctree}
+:hidden:
+normalize.md
+```
+
 <div align = "justify">
 
 A text cleaning and extraction engine was developed using a combination of traditional techniques like Unicode translations,
@@ -25,20 +33,6 @@ pip install -U NLPurify
 ```
 
 The module is currently under development, and new ideas are welcomed. Raise a new PR/issue for the same.
-The changes between each release are available [here](./CHANGELOG.md).
-
----
-
-```{eval-rst}
-.. caution::
-  **This code depreciates the existing GitHub Gist which was previously designed.**
-  Check `#1 <https://github.com/sharkutilities/NLPurify/issues/1>`_ for more details.
-```
-
-```{eval-rst}
-.. note::
-  **Legacy codes are available as a submodule.**
-  Check `#5 <https://github.com/sharkutilities/NLPurify/issues/5>`_ for more details.
-```
+The changes between each release are available [here](https://github.com/sharkutilities/NLPurify/blob/master/CHANGELOG.md).
 
 </div>
diff --git a/docs/normalize.md b/docs/normalize.md
@@ -0,0 +1,10 @@
+# Text Normalization
+
+<div align = "justify">
+
+```{eval-rst}
+.. automodule:: nlpurify.normalize
+  :members:
+```
+
+</div>
diff --git a/nlpurify/__init__.py b/nlpurify/__init__.py
@@ -13,3 +13,4 @@
 __version__ = "0.0.1.dev0"
 
 # init-time options registrations
+from nlpurify.normalize import normalizeText
diff --git a/nlpurify/normalize.py b/nlpurify/normalize.py
@@ -0,0 +1,148 @@
+# -*- encoding: utf-8 -*-
+
+"""
+Module for the Normalization of Text
+
+The normalization of text involves cleaning text/strings of unwanted
+characters, e.g., collapsing double spaces and converting double line
+breaks to single line breaks. A single function is designed to handle
+all such user requests.
+"""
+
+import os
+
+def _strip_whitespace(
+        text : str,
+        strip_whitespace : bool,
+        strip_whitespace_start : bool,
+        strip_whitespace_final : bool
+    ) -> str:
+    """
+    Normalize Text of White Spaces from Beginning and/or End
+
+    Removes the leading and/or trailing white space characters of
+    the text as requested by the flags. The text is returned as is
+    when all the flags are false.
+
+    :type  text: str
+    :param text: The string from which white spaces are stripped.
+
+    :type  strip_whitespace: bool
+    :param strip_whitespace: Strip both the ends of the text. Has
+        priority over the `strip_whitespace_*` flags, which are
+        ignored when this is true.
+
+    :type  strip_whitespace_start: bool
+    :param strip_whitespace_start: Strip the leading white spaces.
+
+    :type  strip_whitespace_final: bool
+    :param strip_whitespace_final: Strip the trailing white spaces.
+
+    :rtype:  str
+    :return: The text with the requested end(s) stripped.
+    """
+
+    if strip_whitespace:
+        # has priority over `strip_whitespace_*` attributes
+        return text.strip()
+
+    # ? the two flags are independent of each other, so both ends
+    # are stripped when both are set (earlier the start flag won and
+    # the trailing white space was silently kept)
+    if strip_whitespace_start:
+        text = text.lstrip()
+
+    if strip_whitespace_final:
+        text = text.rstrip()
+
+    return text
+
+
+def normalizeText(
+        text : str,
+        replace_double_space : bool = True,
+        replace_double_line_breaks : bool = True,
+        **kwargs
+    ) -> str:
+    """
+    Normalize a Given String with User-Defined Configurations
+
+    The normalization function uses the in-built string function like
+    :attr:`.strip()`, :attr:`.replace()` etc. to return a cleaner
+    version. The following arguments are available for more control.
+
+    :type  text: str
+    :param text: The base uncleaned text, all the operations are
+        done on this text to return a cleaner version. The string can
+        be single line, multi-line (example from "text area") and can
+        have any type of escape characters.
+
+    :type  replace_double_space: bool
+    :param replace_double_space: A common type of uncleaned text
+        format includes double space (white characters), which can be
+        directly cleaned without compromising information. Any run of
+        two or more spaces is reduced to a single space. Defaults to
+        True.
+
+    :type  replace_double_line_breaks: bool
+    :param replace_double_line_breaks: Double line breaks are common
+        in texts containing paragraphs. Any run of two or more line
+        breaks is replaced with a single line break character set.
+        Defaults to True.
+        NOTE: The line break is dependent on the operating system:
+        in windows it is "\\r\\n" or "CR LF" while in *nix system it
+        is always "\\n" or "LF". To answer this, the program considers
+        the default line break based on the operating system the code
+        is running. To override this - use the keyword argument.
+
+    **Keyword Arguments**
+        * **strip_whitespace** (*bool*): Strip white space from the
+          beginning and end of the text. Defaults to True. Alternate
+          keyword terms are :attr:`strip_whitespace_start` and
+          :attr:`strip_whitespace_final` which cleans white space from
+          the beginning or end of string only respectively. The
+          attribute :attr:`strip_whitespace` has priority over its
+          alternates and ignores alternates if set to True, thus set
+          it to False to use the alternates.
+        * **strip_whitespace_inline** (*bool*): This is an extension
+          of the :attr:`strip_whitespace` that iterates for each line
+          and strips the white spaces at the beginning and end of each
+          line. This is useful when the text spans multiple lines.
+          Defaults to True. Similar to :attr:`strip_whitespace` the
+          alternate arguments are :attr:`strip_whitespace_inline_start`
+          and :attr:`strip_whitespace_inline_final` which if True
+          strips only the beginning or the ending white space from
+          each line.
+        * **line_break_seperator** (*str*): The end line character
+          which is either "\\r\\n" for windows or "\\n" for *nix
+          based systems. Defaults to the line break of the running
+          operating system, i.e., :attr:`os.linesep`.
+
+    :rtype:  str
+    :return: The normalized version of the text.
+    """
+
+    strip_whitespace = kwargs.get("strip_whitespace", True)
+
+    # ? each option is read from its own key, earlier the inline
+    # option was wrongly read from the `strip_whitespace` key
+    strip_whitespace_inline = kwargs.get("strip_whitespace_inline", True)
+
+    if replace_double_space:
+        # a single `.replace()` pass leaves double space behind for
+        # longer runs (three spaces become two), thus repeat
+        while "  " in text:
+            text = text.replace("  ", " ")
+
+    if replace_double_line_breaks:
+        # get the keyword argument for line break seperator,
+        # or else get the os default, value is doubled internally
+        line_break_seperator = kwargs.get("line_break_seperator", os.linesep)
+        double_line_break = line_break_seperator * 2
+
+        # ? an empty seperator is always "in" the text, thus it is
+        # skipped to avoid an endless loop
+        while line_break_seperator and double_line_break in text:
+            text = text.replace(double_line_break, line_break_seperator)
+
+    # ? related alternate terms to `strip_whitespace`
+    strip_whitespace_start = kwargs.get("strip_whitespace_start", False)
+    strip_whitespace_final = kwargs.get("strip_whitespace_final", False)
+
+    # ? related alternate terms to `strip_whitespace_inline`
+    strip_whitespace_inline_start = kwargs.get("strip_whitespace_inline_start", False)
+    strip_whitespace_inline_final = kwargs.get("strip_whitespace_inline_final", False)
+
+    if strip_whitespace or strip_whitespace_start or strip_whitespace_final:
+        # white space character from the string is to be removed
+        text = _strip_whitespace(
+            text,
+            strip_whitespace = strip_whitespace,
+            strip_whitespace_start = strip_whitespace_start,
+            strip_whitespace_final = strip_whitespace_final
+        )
+
+    if strip_whitespace_inline or strip_whitespace_inline_start or strip_whitespace_inline_final:
+        # white space character from each line is to be removed
+        # NOTE: the lines are always joined back with "\n", whatever
+        # line break characters the text originally had
+        text = "\n".join(
+            _strip_whitespace(
+                line,
+                strip_whitespace = strip_whitespace_inline,
+                strip_whitespace_start = strip_whitespace_inline_start,
+                strip_whitespace_final = strip_whitespace_inline_final
+            )
+            for line in text.splitlines()
+        )
+
+    return text