diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e8b272287..5d2ae8845c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,7 +26,7 @@ Changes * [#3141](https://github.com/RaRe-Technologies/gensim/pull/3141): Update link for online LDA paper, by [@dymil](https://github.com/dymil) * [#3148](https://github.com/RaRe-Technologies/gensim/pull/3148): Fix broken link in documentation, by [@rohit901](https://github.com/rohit901) * [#3155](https://github.com/RaRe-Technologies/gensim/pull/3155): Correct parameter name in documentation of fasttext.py, by [@bizzyvinci](https://github.com/bizzyvinci) - +* [#2964](https://github.com/RaRe-Technologies/gensim/pull/2964): Document that preprocessing.strip_punctuation is limited to ASCII, by [@sciatro](https://github.com/sciatro) ## 4.0.1, 2021-04-01 Bugfix release to address issues with Wheels on Windows: diff --git a/gensim/parsing/preprocessing.py b/gensim/parsing/preprocessing.py index 777ca46e8e..5fd45d2421 100644 --- a/gensim/parsing/preprocessing.py +++ b/gensim/parsing/preprocessing.py @@ -94,7 +94,7 @@ def remove_stopwords(s): def strip_punctuation(s): - """Replace punctuation characters with spaces in `s` using :const:`~gensim.parsing.preprocessing.RE_PUNCT`. + """Replace ASCII punctuation characters with spaces in `s` using :const:`~gensim.parsing.preprocessing.RE_PUNCT`. Parameters ---------- @@ -115,6 +115,7 @@ def strip_punctuation(s): """ s = utils.to_unicode(s) + # For Unicode enhancement options see https://github.com/RaRe-Technologies/gensim/issues/2962 return RE_PUNCT.sub(" ", s)