Fix deprecation warnings for regex string literals. Fix #1646 (#1649)

* Fix deprecation warnings for regex string literals. Fix #1646

  Add the raw-string prefix (`r'...'`) to all regex string literals so that Python 3 stops emitting deprecation warnings about invalid escape sequences.

* Fix two more occurrences of unescaped regex strings
piskvorky · Oct 26, 2017 · a068cbe · a068cbe
1 parent 00192a8
commit a068cbe
Show file tree

Hide file tree

Showing 5 changed files with 32 additions and 32 deletions.
diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
@@ -41,23 +41,23 @@
 TOKEN_MAX_LEN = 15
 
 
-RE_P0 = re.compile('<!--.*?-->', re.DOTALL | re.UNICODE)  # comments
-RE_P1 = re.compile('<ref([> ].*?)(</ref>|/>)', re.DOTALL | re.UNICODE)  # footnotes
-RE_P2 = re.compile("(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$", re.UNICODE)  # links to languages
-RE_P3 = re.compile("{{([^}{]*)}}", re.DOTALL | re.UNICODE)  # template
-RE_P4 = re.compile("{{([^}]*)}}", re.DOTALL | re.UNICODE)  # template
-RE_P5 = re.compile('\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE)  # remove URL, keep description
-RE_P6 = re.compile("\[([^][]*)\|([^][]*)\]", re.DOTALL | re.UNICODE)  # simplify links, keep description
-RE_P7 = re.compile('\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)  # keep description of images
-RE_P8 = re.compile('\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)  # keep description of files
-RE_P9 = re.compile('<nowiki([> ].*?)(</nowiki>|/>)', re.DOTALL | re.UNICODE)  # outside links
-RE_P10 = re.compile('<math([> ].*?)(</math>|/>)', re.DOTALL | re.UNICODE)  # math content
-RE_P11 = re.compile('<(.*?)>', re.DOTALL | re.UNICODE)  # all other tags
-RE_P12 = re.compile('\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE)  # table formatting
-RE_P13 = re.compile('\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE)  # table cell formatting
-RE_P14 = re.compile('\[\[Category:[^][]*\]\]', re.UNICODE)  # categories
+RE_P0 = re.compile(r'<!--.*?-->', re.DOTALL | re.UNICODE)  # comments
+RE_P1 = re.compile(r'<ref([> ].*?)(</ref>|/>)', re.DOTALL | re.UNICODE)  # footnotes
+RE_P2 = re.compile(r'(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$', re.UNICODE)  # links to languages
+RE_P3 = re.compile(r'{{([^}{]*)}}', re.DOTALL | re.UNICODE)  # template
+RE_P4 = re.compile(r'{{([^}]*)}}', re.DOTALL | re.UNICODE)  # template
+RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE)  # remove URL, keep description
+RE_P6 = re.compile(r'\[([^][]*)\|([^][]*)\]', re.DOTALL | re.UNICODE)  # simplify links, keep description
+RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)  # keep description of images
+RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)  # keep description of files
+RE_P9 = re.compile(r'<nowiki([> ].*?)(</nowiki>|/>)', re.DOTALL | re.UNICODE)  # outside links
+RE_P10 = re.compile(r'<math([> ].*?)(</math>|/>)', re.DOTALL | re.UNICODE)  # math content
+RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE)  # all other tags
+RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE)  # table formatting
+RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE)  # table cell formatting
+RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE)  # categories
 # Remove File and Image template
-RE_P15 = re.compile('\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
+RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
 
 # MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that
 # ought to be ignored
@@ -81,7 +81,7 @@ def filter_wiki(raw):
 
 
 def remove_markup(text):
-    text = re.sub(RE_P2, "", text)  # remove the last list (=languages)
+    text = re.sub(RE_P2, '', text)  # remove the last list (=languages)
     # the wiki markup is recursive (markup inside markup etc)
     # instead of writing a recursive grammar, here we deal with that by removing
     # markup in a loop, starting with inner-most expressions and working outwards,
@@ -91,11 +91,11 @@ def remove_markup(text):
     iters = 0
     while True:
         old, iters = text, iters + 1
-        text = re.sub(RE_P0, "", text)  # remove comments
+        text = re.sub(RE_P0, '', text)  # remove comments
         text = re.sub(RE_P1, '', text)  # remove footnotes
-        text = re.sub(RE_P9, "", text)  # remove outside links
-        text = re.sub(RE_P10, "", text)  # remove math content
-        text = re.sub(RE_P11, "", text)  # remove all remaining tags
+        text = re.sub(RE_P9, '', text)  # remove outside links
+        text = re.sub(RE_P10, '', text)  # remove math content
+        text = re.sub(RE_P11, '', text)  # remove all remaining tags
         text = re.sub(RE_P14, '', text)  # remove categories
         text = re.sub(RE_P5, '\\3', text)  # remove urls, keep description
         text = re.sub(RE_P6, '\\2', text)  # simplify links, keep description only

diff --git a/gensim/examples/dmlcz/sources.py b/gensim/examples/dmlcz/sources.py
@@ -28,7 +28,7 @@
 if sys.version_info[0] >= 3:
     unicode = str
 
-PAT_TAG = re.compile('<(.*?)>(.*)</.*?>')
+PAT_TAG = re.compile(r'<(.*?)>(.*)</.*?>')
 logger = logging.getLogger('gensim.corpora.sources')
 
 

diff --git a/gensim/parsing/preprocessing.py b/gensim/parsing/preprocessing.py
@@ -44,7 +44,7 @@ def remove_stopwords(s):
     return " ".join(w for w in s.split() if w not in STOPWORDS)
 
 
-RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
+RE_PUNCT = re.compile(r'([%s])+' % re.escape(string.punctuation), re.UNICODE)
 
 
 def strip_punctuation(s):

diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py
@@ -21,13 +21,13 @@
     HAS_PATTERN = False
 
 
-SEPARATOR = r"@"
-RE_SENTENCE = re.compile('(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)  # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)
-AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)\s(\w)", re.UNICODE)
-AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)\s(\w)", re.UNICODE)
-AB_ACRONYM_LETTERS = re.compile("([a-zA-Z])\.([a-zA-Z])\.", re.UNICODE)
-UNDO_AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)" + SEPARATOR + "(\w)", re.UNICODE)
-UNDO_AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)" + SEPARATOR + "(\w)", re.UNICODE)
+SEPARATOR = r'@'
+RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)  # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)
+AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
+AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE)
+AB_ACRONYM_LETTERS = re.compile(r'([a-zA-Z])\.([a-zA-Z])\.', re.UNICODE)
+UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + SEPARATOR + r'(\w)', re.UNICODE)
+UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + SEPARATOR + r'(\w)', re.UNICODE)
 
 
 def split_sentences(text):

diff --git a/gensim/utils.py b/gensim/utils.py
@@ -79,7 +79,7 @@ def smart_open(fname, mode='rb'):
         return open(fname, mode)
 
 
-PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE)
+PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE)
 RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE)
 
 
@@ -1039,7 +1039,7 @@ def has_pattern():
         return False
 
 
-def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False,
+def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False,
         stopwords=frozenset(), min_length=2, max_length=15):
     """
     This function is only available when the optional 'pattern' package is installed.