diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 07cb70630c..de04b36c62 100755
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -41,23 +41,23 @@
TOKEN_MAX_LEN = 15
-RE_P0 = re.compile('<!--.*?-->', re.DOTALL | re.UNICODE) # comments
-RE_P1 = re.compile('<ref([> ].*?)(</ref>|/>)', re.DOTALL | re.UNICODE) # footnotes
-RE_P2 = re.compile("(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$", re.UNICODE) # links to languages
-RE_P3 = re.compile("{{([^}{]*)}}", re.DOTALL | re.UNICODE) # template
-RE_P4 = re.compile("{{([^}]*)}}", re.DOTALL | re.UNICODE) # template
-RE_P5 = re.compile('\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) # remove URL, keep description
-RE_P6 = re.compile("\[([^][]*)\|([^][]*)\]", re.DOTALL | re.UNICODE) # simplify links, keep description
-RE_P7 = re.compile('\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of images
-RE_P8 = re.compile('\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of files
-RE_P9 = re.compile('<nowiki([> ].*?)(</nowiki>|/>)', re.DOTALL | re.UNICODE) # outside links
-RE_P10 = re.compile('<math([> ].*?)(</math>|/>)', re.DOTALL | re.UNICODE) # math content
-RE_P11 = re.compile('<(.*?)>', re.DOTALL | re.UNICODE) # all other tags
-RE_P12 = re.compile('\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting
-RE_P13 = re.compile('\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting
-RE_P14 = re.compile('\[\[Category:[^][]*\]\]', re.UNICODE) # categories
+RE_P0 = re.compile(r'<!--.*?-->', re.DOTALL | re.UNICODE) # comments
+RE_P1 = re.compile(r'<ref([> ].*?)(</ref>|/>)', re.DOTALL | re.UNICODE) # footnotes
+RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE) # links to languages
+RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE) # template
+RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE) # template
+RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) # remove URL, keep description
+RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE) # simplify links, keep description
+RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of images
+RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of files
+RE_P9 = re.compile(r'<nowiki([> ].*?)(</nowiki>|/>)', re.DOTALL | re.UNICODE) # outside links
+RE_P10 = re.compile(r'<math([> ].*?)(</math>|/>)', re.DOTALL | re.UNICODE) # math content
+RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE) # all other tags
+RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting
+RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting
+RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE) # categories
# Remove File and Image template
-RE_P15 = re.compile('\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
+RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
# MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that
# ought to be ignored
@@ -81,7 +81,7 @@ def filter_wiki(raw):
def remove_markup(text):
- text = re.sub(RE_P2, "", text) # remove the last list (=languages)
+ text = re.sub(RE_P2, '', text) # remove the last list (=languages)
# the wiki markup is recursive (markup inside markup etc)
# instead of writing a recursive grammar, here we deal with that by removing
# markup in a loop, starting with inner-most expressions and working outwards,
@@ -91,11 +91,11 @@ def remove_markup(text):
iters = 0
while True:
old, iters = text, iters + 1
- text = re.sub(RE_P0, "", text) # remove comments
+ text = re.sub(RE_P0, '', text) # remove comments
text = re.sub(RE_P1, '', text) # remove footnotes
- text = re.sub(RE_P9, "", text) # remove outside links
- text = re.sub(RE_P10, "", text) # remove math content
- text = re.sub(RE_P11, "", text) # remove all remaining tags
+ text = re.sub(RE_P9, '', text) # remove outside links
+ text = re.sub(RE_P10, '', text) # remove math content
+ text = re.sub(RE_P11, '', text) # remove all remaining tags
text = re.sub(RE_P14, '', text) # remove categories
text = re.sub(RE_P5, '\\3', text) # remove urls, keep description
text = re.sub(RE_P6, '\\2', text) # simplify links, keep description only
diff --git a/gensim/examples/dmlcz/sources.py b/gensim/examples/dmlcz/sources.py
index 4193da0820..8124101acd 100644
--- a/gensim/examples/dmlcz/sources.py
+++ b/gensim/examples/dmlcz/sources.py
@@ -28,7 +28,7 @@
if sys.version_info[0] >= 3:
unicode = str
-PAT_TAG = re.compile('<(.*?)>(.*)')
+PAT_TAG = re.compile(r'<(.*?)>(.*)')
logger = logging.getLogger('gensim.corpora.sources')
diff --git a/gensim/parsing/preprocessing.py b/gensim/parsing/preprocessing.py
index ab25361f60..6d9fa59079 100644
--- a/gensim/parsing/preprocessing.py
+++ b/gensim/parsing/preprocessing.py
@@ -44,7 +44,7 @@ def remove_stopwords(s):
return " ".join(w for w in s.split() if w not in STOPWORDS)
-RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
+RE_PUNCT = re.compile(r'([%s])+' % re.escape(string.punctuation), re.UNICODE)
def strip_punctuation(s):
diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py
index fa6a56b887..5f33bbcea9 100644
--- a/gensim/summarization/textcleaner.py
+++ b/gensim/summarization/textcleaner.py
@@ -21,13 +21,13 @@
HAS_PATTERN = False
-SEPARATOR = r"@"
-RE_SENTENCE = re.compile('(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)
-AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)\s(\w)", re.UNICODE)
-AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)\s(\w)", re.UNICODE)
-AB_ACRONYM_LETTERS = re.compile("([a-zA-Z])\.([a-zA-Z])\.", re.UNICODE)
-UNDO_AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)" + SEPARATOR + "(\w)", re.UNICODE)
-UNDO_AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)" + SEPARATOR + "(\w)", re.UNICODE)
+SEPARATOR = r'@'
+RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)
+AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
+AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE)
+AB_ACRONYM_LETTERS = re.compile(r'([a-zA-Z])\.([a-zA-Z])\.', re.UNICODE)
+UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + SEPARATOR + r'(\w)', re.UNICODE)
+UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + SEPARATOR + r'(\w)', re.UNICODE)
def split_sentences(text):
diff --git a/gensim/utils.py b/gensim/utils.py
index fc3ca51906..74e623f0b3 100644
--- a/gensim/utils.py
+++ b/gensim/utils.py
@@ -79,7 +79,7 @@ def smart_open(fname, mode='rb'):
return open(fname, mode)
-PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE)
+PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE)
RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE)
@@ -1039,7 +1039,7 @@ def has_pattern():
return False
-def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False,
+def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False,
stopwords=frozenset(), min_length=2, max_length=15):
"""
This function is only available when the optional 'pattern' package is installed.