Add unicode normalization to all input.

All unicode input now gets 'NFD' normalization, which ensures that all characters that look the same are represented by the same code points. 'NFD' was chosen because it is the expanded (decomposed) form, which will cause (for example) 'é' to be placed immediately after 'e' rather than after 'z'. Users can choose 'NFKD' with ns.COMPATIBILITYNORMALIZE (or ns.CN), which will change certain characters to their compatible (and often ASCII) representation. This may be useful to force numbers in odd representations to be transformed to ASCII, which will potentially give better sorting orders. This will close issue #44.
SethMMorton · Aug 19, 2017 · 3a75ddb · 3a75ddb
1 parent c2f4b5d
commit 3a75ddb
Show file tree

Hide file tree

Showing 5 changed files with 64 additions and 30 deletions.
diff --git a/natsort/ns_enum.py b/natsort/ns_enum.py
@@ -39,7 +39,7 @@ class ns(object):
         This is a shortcut for ``ns.FLOAT | ns.SIGNED``, which is useful
         when attempting to sort real numbers.
     NOEXP, N
-        Tell `natsort` to not search for exponents as part of the number.
+        Tell `natsort` to not search for exponents as part of a float number.
         For example, with `NOEXP` the number "5.6E5" would be interpreted
         as `5.6`, `"E"`, and `5` instead of `560000`.
     PATH, P
@@ -51,6 +51,13 @@ class ns(object):
         sorted properly; 'Folder/' will be placed at the end, not at the
         front. It is the same as setting the old `as_path` option to
         `True`.
+    COMPATIBILITYNORMALIZE, CN
+        Use the "NFKD" unicode normalization form on input rather than the
+        default "NFD". This will transform characters such as '⑦' into
+        '7'. Please see https://stackoverflow.com/a/7934397/1399279,
+        https://stackoverflow.com/a/7931547/1399279,
+        and http://unicode.org/reports/tr15/ for full details on unicode
+        normalization.
     LOCALE, L
         Tell `natsort` to be locale-aware when sorting. This includes both
         proper sorting of alphabetical characters as well as proper
@@ -129,20 +136,21 @@ class ns(object):
 
     # The below are options. The values are stored as powers of two
     # so bitmasks can be used to extract the user's requested options.
-    FLOAT            = F  = 1 << 0
-    SIGNED           = S  = 1 << 1
-    REAL             = R  = FLOAT | SIGNED
-    NOEXP            = N  = 1 << 2
-    PATH             = P  = 1 << 3
-    LOCALEALPHA      = LA = 1 << 4
-    LOCALENUM        = LN = 1 << 5
-    LOCALE           = L  = LOCALEALPHA | LOCALENUM
-    IGNORECASE       = IC = 1 << 6
-    LOWERCASEFIRST   = LF = 1 << 7
-    GROUPLETTERS     = G  = 1 << 8
-    UNGROUPLETTERS   = UG = 1 << 9
-    CAPITALFIRST     = C  = UNGROUPLETTERS
-    NANLAST          = NL = 1 << 10
+    FLOAT                  = F  = 1 << 0
+    SIGNED                 = S  = 1 << 1
+    REAL                   = R  = FLOAT | SIGNED
+    NOEXP                  = N  = 1 << 2
+    PATH                   = P  = 1 << 3
+    LOCALEALPHA            = LA = 1 << 4
+    LOCALENUM              = LN = 1 << 5
+    LOCALE                 = L  = LOCALEALPHA | LOCALENUM
+    IGNORECASE             = IC = 1 << 6
+    LOWERCASEFIRST         = LF = 1 << 7
+    GROUPLETTERS           = G  = 1 << 8
+    UNGROUPLETTERS         = UG = 1 << 9
+    CAPITALFIRST           = C  = UNGROUPLETTERS
+    NANLAST                = NL = 1 << 10
+    COMPATIBILITYNORMALIZE = CN = 1 << 11
 
     # The below are private options for internal use only.
     _NUMERIC_ONLY    = REAL | NOEXP

diff --git a/natsort/utils.py b/natsort/utils.py
@@ -54,6 +54,7 @@
 from collections import deque
 from functools import partial, reduce
 from operator import methodcaller
+from unicodedata import normalize
 
 # Local imports.
 from natsort.ns_enum import ns
@@ -267,11 +268,23 @@ def _input_string_transform_factory(alg):
     # Shortcuts.
     lowfirst = alg & ns.LOWERCASEFIRST
     dumb = alg & ns._DUMB
+    normalization_form = 'NFKD' if alg & ns.COMPATIBILITYNORMALIZE else 'NFD'
+
+    if NEWPY:
+        careful_normalize = partial(normalize, normalization_form)
+    else:
+        def careful_normalize(x):
+            """Normalize unicode input."""
+            if isinstance(x, py23_str):  # unicode
+                return normalize(normalization_form, x)
+            else:
+                return x
 
     # Build the chain of functions to execute in order.
-    function_chain = []
+    function_chain = [careful_normalize]
     if (dumb and not lowfirst) or (lowfirst and not dumb):
         function_chain.append(methodcaller('swapcase'))
+
     if alg & ns.IGNORECASE:
         if NEWPY:
             function_chain.append(methodcaller('casefold'))

diff --git a/test_natsort/test_input_string_transform_factory.py b/test_natsort/test_input_string_transform_factory.py
@@ -5,6 +5,7 @@
 import pytest
 import locale
 from operator import methodcaller
+from unicodedata import normalize
 from natsort.ns_enum import ns
 from natsort.utils import _input_string_transform_factory
 from natsort.compat.py23 import NEWPY
@@ -28,12 +29,22 @@
 
 def test_input_string_transform_factory_is_no_op_for_no_alg_options_examples():
     x = 'feijGGAd'
-    assert _input_string_transform_factory(0)(x) is x
+    assert _input_string_transform_factory(0)(x) == x
 
 
 @given(text())
-def test_input_string_transform_factory_is_no_op_for_no_alg_options(x):
-    assert _input_string_transform_factory(0)(x) is x
+def test_input_string_transform_factory_is_no_op_for_no_alg_options_except_normalization(x):
+    assert _input_string_transform_factory(0)(x) == normalize('NFD', x)
+
+
+def test_input_string_transform_factory_performs_compatibility_normalization_with_COMPATIBILITYNORMALIZE_examples():
+    x = '⑦'
+    assert _input_string_transform_factory(ns.COMPATIBILITYNORMALIZE)(x) == '7'
+
+
+@given(text())
+def test_input_string_transform_factory_performs_compatibility_normalization_with_COMPATIBILITYNORMALIZE(x):
+    assert _input_string_transform_factory(ns.COMPATIBILITYNORMALIZE)(x) == normalize('NFKD', x)
 
 
 def test_input_string_transform_factory_performs_casefold_with_IGNORECASE_examples():
@@ -47,9 +58,9 @@ def test_input_string_transform_factory_performs_casefold_with_IGNORECASE_exampl
 @given(text())
 def test_input_string_transform_factory_performs_casefold_with_IGNORECASE(x):
     if NEWPY:
-        assert _input_string_transform_factory(ns.IGNORECASE)(x) == x.casefold()
+        assert _input_string_transform_factory(ns.IGNORECASE)(x) == normalize('NFD', x).casefold()
     else:
-        assert _input_string_transform_factory(ns.IGNORECASE)(x) == x.lower()
+        assert _input_string_transform_factory(ns.IGNORECASE)(x) == normalize('NFD', x).lower()
 
 
 def test_input_string_transform_factory_performs_swapcase_with_DUMB_examples():
@@ -59,7 +70,7 @@ def test_input_string_transform_factory_performs_swapcase_with_DUMB_examples():
 
 @given(text())
 def test_input_string_transform_factory_performs_swapcase_with_DUMB(x):
-    assert _input_string_transform_factory(ns._DUMB)(x) == x.swapcase()
+    assert _input_string_transform_factory(ns._DUMB)(x) == normalize('NFD', x).swapcase()
 
 
 def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST_example():
@@ -69,18 +80,17 @@ def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST_ex
 
 @given(text())
 def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST(x):
-    x = 'feijGGAd'
-    assert _input_string_transform_factory(ns.LOWERCASEFIRST)(x) == x.swapcase()
+    assert _input_string_transform_factory(ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase()
 
 
 def test_input_string_transform_factory_is_no_op_with_both_LOWERCASEFIRST_AND_DUMB_example():
     x = 'feijGGAd'
-    assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) is x
+    assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) == x
 
 
 @given(text())
 def test_input_string_transform_factory_is_no_op_with_both_LOWERCASEFIRST_AND_DUMB(x):
-    assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) is x
+    assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) == normalize('NFD', x)
 
 
 def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWERCASEFIRST_AND_IGNORECASE_example():
@@ -94,9 +104,9 @@ def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWE
 @given(text())
 def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWERCASEFIRST_AND_IGNORECASE(x):
     if NEWPY:
-        assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().casefold()
+        assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase().casefold()
     else:
-        assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().lower()
+        assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase().lower()
 
 
 def test_input_string_transform_factory_removes_thousands_separator_with_LOCALE_example():

diff --git a/test_natsort/test_natsorted.py b/test_natsort/test_natsorted.py
@@ -80,8 +80,10 @@ def test_natsorted_returns_sorted_list_with_mixed_type_input_and_does_not_raise_
 
 
 def test_natsorted_with_mixed_input_returns_sorted_results_without_error():
+    a = ['0', 'Á', '2', 'Z']
+    assert natsorted(a) == ['0', '2', 'Á', 'Z']
     a = ['2', 'ä', 'b', 1.5, 3]
-    assert natsorted(a) == [1.5, '2', 3, 'b', 'ä']
+    assert natsorted(a) == [1.5, '2', 3, 'ä', 'b']
 
 
 def test_natsorted_with_nan_input_returns_sorted_results_with_nan_last_with_NANLAST():
@@ -240,7 +242,7 @@ def test_natsorted_with_LOCALE_and_de_setting_returns_results_sorted_by_de_langu
 def test_natsorted_with_LOCALE_and_mixed_input_returns_sorted_results_without_error():
     load_locale('en_US')
     a = ['0', 'Á', '2', 'Z']
-    assert natsorted(a) == ['0', '2', 'Z', 'Á']
+    assert natsorted(a, alg=ns.LOCALE) == ['0', '2', 'Á', 'Z']
     a = ['2', 'ä', 'b', 1.5, 3]
     assert natsorted(a, alg=ns.LOCALE) == [1.5, '2', 3, 'ä', 'b']
     locale.setlocale(locale.LC_ALL, str(''))

diff --git a/test_natsort/test_utils.py b/test_natsort/test_utils.py
@@ -149,6 +149,7 @@ def test_ns_enum_values_have_are_as_expected():
     assert ns.CAPITALFIRST == ns.C
     assert ns.UNGROUPLETTERS == ns.CAPITALFIRST
     assert ns.NANLAST == ns.NL
+    assert ns.COMPATIBILITYNORMALIZE == ns.CN
 
     # Convenience
     assert ns.LOCALE == ns.LOCALEALPHA | ns.LOCALENUM