-
Notifications
You must be signed in to change notification settings - Fork 4.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Clean up nltk-based language utils and tests.
- Loading branch information
1 parent
f991d47
commit 2c60a3a
Showing
6 changed files
with
91 additions
and
84 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
class Tokenizer(object):
    """
    Utility class for splitting text strings into word tokens.
    """

    def __init__(self):
        from nltk.data import find
        from nltk import download

        # Fetch the punkt tokenizer models only when they are not
        # already available locally.
        try:
            find('punkt.zip')
        except LookupError:
            download('punkt')

    def get_tokens(self, text, language='english', exclude_stop_words=True):
        """
        Split a string into a list of lower-cased word tokens.

        Common stop words (e.g. "is", "the", "a") for the given
        language are filtered out unless 'exclude_stop_words'
        is set to False.
        """
        from chatterbot.utils.stop_words import StopWordsManager
        from nltk import word_tokenize

        stop_word_manager = StopWordsManager()
        word_list = word_tokenize(text.lower())

        # Strip stop words from the token list when requested.
        if exclude_stop_words:
            word_list = stop_word_manager.remove_stopwords(language, word_list)

        return word_list
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters