Skip to content

Commit

Permalink
Add unicode normalization to all input.
Browse files Browse the repository at this point in the history
All unicode input now gets 'NFD' normalization, which ensures that
all characters that look the same are represented by the same code
points. 'NFD' was chosen because it is the expanded form, which will
cause (for example) 'é' to be placed immediately after 'e' rather than
after 'z'.

Users can choose 'NFKD' with ns.COMPATIBILITYNORMALIZE (or ns.CN) which
will change certain characters to their compatible (and often ASCII)
representation. This may be useful to force numbers in odd
representations to be transformed to ASCII which will potentially give
better sorting orders.

This will close issue #44.
  • Loading branch information
SethMMorton committed Aug 19, 2017
1 parent c2f4b5d commit 3a75ddb
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 30 deletions.
38 changes: 23 additions & 15 deletions natsort/ns_enum.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class ns(object):
This is a shortcut for ``ns.FLOAT | ns.SIGNED``, which is useful
when attempting to sort real numbers.
NOEXP, N
Tell `natsort` to not search for exponents as part of the number.
Tell `natsort` to not search for exponents as part of a float number.
For example, with `NOEXP` the number "5.6E5" would be interpreted
as `5.6`, `"E"`, and `5` instead of `560000`.
PATH, P
Expand All @@ -51,6 +51,13 @@ class ns(object):
sorted properly; 'Folder/' will be placed at the end, not at the
front. It is the same as setting the old `as_path` option to
`True`.
COMPATIBILITYNORMALIZE, CN
Use the "NFKD" unicode normalization form on input rather than the
default "NFD". This will transform characters such as '⑦' into
'7'. Please see https://stackoverflow.com/a/7934397/1399279,
https://stackoverflow.com/a/7931547/1399279,
and http://unicode.org/reports/tr15/ for full details on unicode
normalization.
LOCALE, L
Tell `natsort` to be locale-aware when sorting. This includes both
proper sorting of alphabetical characters as well as proper
Expand Down Expand Up @@ -129,20 +136,21 @@ class ns(object):

# The below are options. The values are stored as powers of two
# so bitmasks can be used to extract the user's requested options.
FLOAT = F = 1 << 0
SIGNED = S = 1 << 1
REAL = R = FLOAT | SIGNED
NOEXP = N = 1 << 2
PATH = P = 1 << 3
LOCALEALPHA = LA = 1 << 4
LOCALENUM = LN = 1 << 5
LOCALE = L = LOCALEALPHA | LOCALENUM
IGNORECASE = IC = 1 << 6
LOWERCASEFIRST = LF = 1 << 7
GROUPLETTERS = G = 1 << 8
UNGROUPLETTERS = UG = 1 << 9
CAPITALFIRST = C = UNGROUPLETTERS
NANLAST = NL = 1 << 10
FLOAT = F = 1 << 0
SIGNED = S = 1 << 1
REAL = R = FLOAT | SIGNED
NOEXP = N = 1 << 2
PATH = P = 1 << 3
LOCALEALPHA = LA = 1 << 4
LOCALENUM = LN = 1 << 5
LOCALE = L = LOCALEALPHA | LOCALENUM
IGNORECASE = IC = 1 << 6
LOWERCASEFIRST = LF = 1 << 7
GROUPLETTERS = G = 1 << 8
UNGROUPLETTERS = UG = 1 << 9
CAPITALFIRST = C = UNGROUPLETTERS
NANLAST = NL = 1 << 10
COMPATIBILITYNORMALIZE = CN = 1 << 11

# The below are private options for internal use only.
_NUMERIC_ONLY = REAL | NOEXP
Expand Down
15 changes: 14 additions & 1 deletion natsort/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
from collections import deque
from functools import partial, reduce
from operator import methodcaller
from unicodedata import normalize

# Local imports.
from natsort.ns_enum import ns
Expand Down Expand Up @@ -267,11 +268,23 @@ def _input_string_transform_factory(alg):
# Shortcuts.
lowfirst = alg & ns.LOWERCASEFIRST
dumb = alg & ns._DUMB
normalization_form = 'NFKD' if alg & ns.COMPATIBILITYNORMALIZE else 'NFD'

if NEWPY:
careful_normalize = partial(normalize, normalization_form)
else:
def careful_normalize(x):
"""Normalize unicode input."""
if isinstance(x, py23_str): # unicode
return normalize(normalization_form, x)
else:
return x

# Build the chain of functions to execute in order.
function_chain = []
function_chain = [careful_normalize]
if (dumb and not lowfirst) or (lowfirst and not dumb):
function_chain.append(methodcaller('swapcase'))

if alg & ns.IGNORECASE:
if NEWPY:
function_chain.append(methodcaller('casefold'))
Expand Down
34 changes: 22 additions & 12 deletions test_natsort/test_input_string_transform_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytest
import locale
from operator import methodcaller
from unicodedata import normalize
from natsort.ns_enum import ns
from natsort.utils import _input_string_transform_factory
from natsort.compat.py23 import NEWPY
Expand All @@ -28,12 +29,22 @@

def test_input_string_transform_factory_is_no_op_for_no_alg_options_examples():
x = 'feijGGAd'
assert _input_string_transform_factory(0)(x) is x
assert _input_string_transform_factory(0)(x) == x


@given(text())
def test_input_string_transform_factory_is_no_op_for_no_alg_options(x):
assert _input_string_transform_factory(0)(x) is x
def test_input_string_transform_factory_is_no_op_for_no_alg_options_except_normalization(x):
assert _input_string_transform_factory(0)(x) == normalize('NFD', x)


def test_input_string_transform_factory_performs_compatibility_normalization_with_COMPATIBILITYNORMALIZE_examples():
x = '⑦'
assert _input_string_transform_factory(ns.COMPATIBILITYNORMALIZE)(x) == '7'


@given(text())
def test_input_string_transform_factory_performs_compatibility_normalization_with_COMPATIBILITYNORMALIZE(x):
assert _input_string_transform_factory(ns.COMPATIBILITYNORMALIZE)(x) == normalize('NFKD', x)


def test_input_string_transform_factory_performs_casefold_with_IGNORECASE_examples():
Expand All @@ -47,9 +58,9 @@ def test_input_string_transform_factory_performs_casefold_with_IGNORECASE_exampl
@given(text())
def test_input_string_transform_factory_performs_casefold_with_IGNORECASE(x):
if NEWPY:
assert _input_string_transform_factory(ns.IGNORECASE)(x) == x.casefold()
assert _input_string_transform_factory(ns.IGNORECASE)(x) == normalize('NFD', x).casefold()
else:
assert _input_string_transform_factory(ns.IGNORECASE)(x) == x.lower()
assert _input_string_transform_factory(ns.IGNORECASE)(x) == normalize('NFD', x).lower()


def test_input_string_transform_factory_performs_swapcase_with_DUMB_examples():
Expand All @@ -59,7 +70,7 @@ def test_input_string_transform_factory_performs_swapcase_with_DUMB_examples():

@given(text())
def test_input_string_transform_factory_performs_swapcase_with_DUMB(x):
assert _input_string_transform_factory(ns._DUMB)(x) == x.swapcase()
assert _input_string_transform_factory(ns._DUMB)(x) == normalize('NFD', x).swapcase()


def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST_example():
Expand All @@ -69,18 +80,17 @@ def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST_ex

@given(text())
def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST(x):
x = 'feijGGAd'
assert _input_string_transform_factory(ns.LOWERCASEFIRST)(x) == x.swapcase()
assert _input_string_transform_factory(ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase()


def test_input_string_transform_factory_is_no_op_with_both_LOWERCASEFIRST_AND_DUMB_example():
x = 'feijGGAd'
assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) is x
assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) == x


@given(text())
def test_input_string_transform_factory_is_no_op_with_both_LOWERCASEFIRST_AND_DUMB(x):
assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) is x
assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) == normalize('NFD', x)


def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWERCASEFIRST_AND_IGNORECASE_example():
Expand All @@ -94,9 +104,9 @@ def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWE
@given(text())
def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWERCASEFIRST_AND_IGNORECASE(x):
if NEWPY:
assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().casefold()
assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase().casefold()
else:
assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().lower()
assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase().lower()


def test_input_string_transform_factory_removes_thousands_separator_with_LOCALE_example():
Expand Down
6 changes: 4 additions & 2 deletions test_natsort/test_natsorted.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,10 @@ def test_natsorted_returns_sorted_list_with_mixed_type_input_and_does_not_raise_


def test_natsorted_with_mixed_input_returns_sorted_results_without_error():
a = ['0', 'Á', '2', 'Z']
assert natsorted(a) == ['0', '2', 'Á', 'Z']
a = ['2', 'ä', 'b', 1.5, 3]
assert natsorted(a) == [1.5, '2', 3, 'b', 'ä']
assert natsorted(a) == [1.5, '2', 3, 'ä', 'b']


def test_natsorted_with_nan_input_returns_sorted_results_with_nan_last_with_NANLAST():
Expand Down Expand Up @@ -240,7 +242,7 @@ def test_natsorted_with_LOCALE_and_de_setting_returns_results_sorted_by_de_langu
def test_natsorted_with_LOCALE_and_mixed_input_returns_sorted_results_without_error():
load_locale('en_US')
a = ['0', 'Á', '2', 'Z']
assert natsorted(a) == ['0', '2', 'Z', 'Á']
assert natsorted(a, alg=ns.LOCALE) == ['0', '2', 'Á', 'Z']
a = ['2', 'ä', 'b', 1.5, 3]
assert natsorted(a, alg=ns.LOCALE) == [1.5, '2', 3, 'ä', 'b']
locale.setlocale(locale.LC_ALL, str(''))
Expand Down
1 change: 1 addition & 0 deletions test_natsort/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ def test_ns_enum_values_have_are_as_expected():
assert ns.CAPITALFIRST == ns.C
assert ns.UNGROUPLETTERS == ns.CAPITALFIRST
assert ns.NANLAST == ns.NL
assert ns.COMPATIBILITYNORMALIZE == ns.CN

# Convenience
assert ns.LOCALE == ns.LOCALEALPHA | ns.LOCALENUM
Expand Down

0 comments on commit 3a75ddb

Please sign in to comment.