diff --git a/natsort/ns_enum.py b/natsort/ns_enum.py index e5ffbf56..37a00deb 100644 --- a/natsort/ns_enum.py +++ b/natsort/ns_enum.py @@ -39,7 +39,7 @@ class ns(object): This is a shortcut for ``ns.FLOAT | ns.SIGNED``, which is useful when attempting to sort real numbers. NOEXP, N - Tell `natsort` to not search for exponents as part of the number. + Tell `natsort` to not search for exponents as part of a float number. For example, with `NOEXP` the number "5.6E5" would be interpreted as `5.6`, `"E"`, and `5` instead of `560000`. PATH, P @@ -51,6 +51,13 @@ class ns(object): sorted properly; 'Folder/' will be placed at the end, not at the front. It is the same as setting the old `as_path` option to `True`. + COMPATIBILITYNORMALIZE, CN + Use the "NFKD" unicode normalization form on input rather than the + default "NFD". This will transform characters such as '⑦' into + '7'. Please see https://stackoverflow.com/a/7934397/1399279, + https://stackoverflow.com/a/7931547/1399279, + and http://unicode.org/reports/tr15/ for full details on unicode + normalization. LOCALE, L Tell `natsort` to be locale-aware when sorting. This includes both proper sorting of alphabetical characters as well as proper @@ -129,20 +136,21 @@ class ns(object): # The below are options. The values are stored as powers of two # so bitmasks can be used to extract the user's requested options. 
- FLOAT = F = 1 << 0 - SIGNED = S = 1 << 1 - REAL = R = FLOAT | SIGNED - NOEXP = N = 1 << 2 - PATH = P = 1 << 3 - LOCALEALPHA = LA = 1 << 4 - LOCALENUM = LN = 1 << 5 - LOCALE = L = LOCALEALPHA | LOCALENUM - IGNORECASE = IC = 1 << 6 - LOWERCASEFIRST = LF = 1 << 7 - GROUPLETTERS = G = 1 << 8 - UNGROUPLETTERS = UG = 1 << 9 - CAPITALFIRST = C = UNGROUPLETTERS - NANLAST = NL = 1 << 10 + FLOAT = F = 1 << 0 + SIGNED = S = 1 << 1 + REAL = R = FLOAT | SIGNED + NOEXP = N = 1 << 2 + PATH = P = 1 << 3 + LOCALEALPHA = LA = 1 << 4 + LOCALENUM = LN = 1 << 5 + LOCALE = L = LOCALEALPHA | LOCALENUM + IGNORECASE = IC = 1 << 6 + LOWERCASEFIRST = LF = 1 << 7 + GROUPLETTERS = G = 1 << 8 + UNGROUPLETTERS = UG = 1 << 9 + CAPITALFIRST = C = UNGROUPLETTERS + NANLAST = NL = 1 << 10 + COMPATIBILITYNORMALIZE = CN = 1 << 11 # The below are private options for internal use only. _NUMERIC_ONLY = REAL | NOEXP diff --git a/natsort/utils.py b/natsort/utils.py index c21d3b40..28f1487d 100644 --- a/natsort/utils.py +++ b/natsort/utils.py @@ -54,6 +54,7 @@ from collections import deque from functools import partial, reduce from operator import methodcaller +from unicodedata import normalize # Local imports. from natsort.ns_enum import ns @@ -267,11 +268,23 @@ def _input_string_transform_factory(alg): # Shortcuts. lowfirst = alg & ns.LOWERCASEFIRST dumb = alg & ns._DUMB + normalization_form = 'NFKD' if alg & ns.COMPATIBILITYNORMALIZE else 'NFD' + + if NEWPY: + careful_normalize = partial(normalize, normalization_form) + else: + def careful_normalize(x): + """Normalize unicode input.""" + if isinstance(x, py23_str): # unicode + return normalize(normalization_form, x) + else: + return x # Build the chain of functions to execute in order. 
- function_chain = [] + function_chain = [careful_normalize] if (dumb and not lowfirst) or (lowfirst and not dumb): function_chain.append(methodcaller('swapcase')) + if alg & ns.IGNORECASE: if NEWPY: function_chain.append(methodcaller('casefold')) diff --git a/test_natsort/test_input_string_transform_factory.py b/test_natsort/test_input_string_transform_factory.py index 3dbd8433..97acf216 100644 --- a/test_natsort/test_input_string_transform_factory.py +++ b/test_natsort/test_input_string_transform_factory.py @@ -5,6 +5,7 @@ import pytest import locale from operator import methodcaller +from unicodedata import normalize from natsort.ns_enum import ns from natsort.utils import _input_string_transform_factory from natsort.compat.py23 import NEWPY @@ -28,12 +29,22 @@ def test_input_string_transform_factory_is_no_op_for_no_alg_options_examples(): x = 'feijGGAd' - assert _input_string_transform_factory(0)(x) is x + assert _input_string_transform_factory(0)(x) == x @given(text()) -def test_input_string_transform_factory_is_no_op_for_no_alg_options(x): - assert _input_string_transform_factory(0)(x) is x +def test_input_string_transform_factory_is_no_op_for_no_alg_options_except_normalization(x): + assert _input_string_transform_factory(0)(x) == normalize('NFD', x) + + +def test_input_string_transform_factory_performs_compatibility_normalization_with_COMPATIBILITYNORMALIZE_examples(): + x = '⑦' + assert _input_string_transform_factory(ns.COMPATIBILITYNORMALIZE)(x) == '7' + + +@given(text()) +def test_input_string_transform_factory_performs_compatibility_normalization_with_COMPATIBILITYNORMALIZE(x): + assert _input_string_transform_factory(ns.COMPATIBILITYNORMALIZE)(x) == normalize('NFKD', x) def test_input_string_transform_factory_performs_casefold_with_IGNORECASE_examples(): @@ -47,9 +58,9 @@ def test_input_string_transform_factory_performs_casefold_with_IGNORECASE_exampl @given(text()) def test_input_string_transform_factory_performs_casefold_with_IGNORECASE(x): if 
NEWPY: - assert _input_string_transform_factory(ns.IGNORECASE)(x) == x.casefold() + assert _input_string_transform_factory(ns.IGNORECASE)(x) == normalize('NFD', x).casefold() else: - assert _input_string_transform_factory(ns.IGNORECASE)(x) == x.lower() + assert _input_string_transform_factory(ns.IGNORECASE)(x) == normalize('NFD', x).lower() def test_input_string_transform_factory_performs_swapcase_with_DUMB_examples(): @@ -59,7 +70,7 @@ def test_input_string_transform_factory_performs_swapcase_with_DUMB_examples(): @given(text()) def test_input_string_transform_factory_performs_swapcase_with_DUMB(x): - assert _input_string_transform_factory(ns._DUMB)(x) == x.swapcase() + assert _input_string_transform_factory(ns._DUMB)(x) == normalize('NFD', x).swapcase() def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST_example(): @@ -69,18 +80,17 @@ def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST_ex @given(text()) def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST(x): - x = 'feijGGAd' - assert _input_string_transform_factory(ns.LOWERCASEFIRST)(x) == x.swapcase() + assert _input_string_transform_factory(ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase() def test_input_string_transform_factory_is_no_op_with_both_LOWERCASEFIRST_AND_DUMB_example(): x = 'feijGGAd' - assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) is x + assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) == x @given(text()) def test_input_string_transform_factory_is_no_op_with_both_LOWERCASEFIRST_AND_DUMB(x): - assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) is x + assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) == normalize('NFD', x) def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWERCASEFIRST_AND_IGNORECASE_example(): @@ -94,9 +104,9 @@ def 
test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWE @given(text()) def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWERCASEFIRST_AND_IGNORECASE(x): if NEWPY: - assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().casefold() + assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase().casefold() else: - assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().lower() + assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase().lower() def test_input_string_transform_factory_removes_thousands_separator_with_LOCALE_example(): diff --git a/test_natsort/test_natsorted.py b/test_natsort/test_natsorted.py index 146997a7..388e209f 100644 --- a/test_natsort/test_natsorted.py +++ b/test_natsort/test_natsorted.py @@ -80,8 +80,10 @@ def test_natsorted_returns_sorted_list_with_mixed_type_input_and_does_not_raise_ def test_natsorted_with_mixed_input_returns_sorted_results_without_error(): + a = ['0', 'Á', '2', 'Z'] + assert natsorted(a) == ['0', '2', 'Á', 'Z'] a = ['2', 'ä', 'b', 1.5, 3] - assert natsorted(a) == [1.5, '2', 3, 'b', 'ä'] + assert natsorted(a) == [1.5, '2', 3, 'ä', 'b'] def test_natsorted_with_nan_input_returns_sorted_results_with_nan_last_with_NANLAST(): @@ -240,7 +242,7 @@ def test_natsorted_with_LOCALE_and_de_setting_returns_results_sorted_by_de_langu def test_natsorted_with_LOCALE_and_mixed_input_returns_sorted_results_without_error(): load_locale('en_US') a = ['0', 'Á', '2', 'Z'] - assert natsorted(a) == ['0', '2', 'Z', 'Á'] + assert natsorted(a, alg=ns.LOCALE) == ['0', '2', 'Á', 'Z'] a = ['2', 'ä', 'b', 1.5, 3] assert natsorted(a, alg=ns.LOCALE) == [1.5, '2', 3, 'ä', 'b'] locale.setlocale(locale.LC_ALL, str('')) diff --git a/test_natsort/test_utils.py b/test_natsort/test_utils.py index 934757a9..f1cffa20 100644 --- 
a/test_natsort/test_utils.py +++ b/test_natsort/test_utils.py @@ -149,6 +149,7 @@ def test_ns_enum_values_have_are_as_expected(): assert ns.CAPITALFIRST == ns.C assert ns.UNGROUPLETTERS == ns.CAPITALFIRST assert ns.NANLAST == ns.NL + assert ns.COMPATIBILITYNORMALIZE == ns.CN # Convenience assert ns.LOCALE == ns.LOCALEALPHA | ns.LOCALENUM