Switch from NLTK to Spacy for text indexing #1684

Merged 6 commits on Apr 6, 2019
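In short, this PR drops the NLTK-based PosHypernymTagger, along with the download-and-initialize machinery its corpora required, and indexes text with a spaCy-based PosLemmaTagger instead. The new tagger itself is not visible in the hunks below; as a rough sketch only (assuming spaCy 2.x, a lazily loaded model, and a plain string model name rather than the chatterbot.languages constant the real code uses), the approach looks something like this:

```python
import spacy


class PosLemmaTagger(object):
    """
    Sketch of a lemma-based tagger: pair each word's lemma with the
    part-of-speech tag of the word before it, e.g. 'What a beautiful
    swamp' -> 'DET:beautiful ADJ:swamp'. The hypernym version this PR
    removes produced 'DT:beautiful JJ:wetland' for the same input.
    """

    def __init__(self, language=None):
        self.language = language or 'en'
        # Load the spaCy model lazily so importing this module stays cheap.
        self.nlp = None

    def get_bigram_pair_string(self, text):
        if self.nlp is None:
            self.nlp = spacy.load(self.language)

        tokens = [token for token in self.nlp(text) if not token.is_punct]

        if len(tokens) == 1:
            return tokens[0].text.lower()

        return ' '.join(
            '{}:{}'.format(previous.pos_, current.lemma_.lower())
            for previous, current in zip(tokens[:-1], tokens[1:])
        )
```

Because spaCy bundles its tagger and lemmatizer with the model, no runtime corpus downloads are needed, which is what lets the initialization hooks below disappear.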
18 changes: 0 additions & 18 deletions chatterbot/__main__.py
@@ -1,30 +1,12 @@
 import importlib
 import sys
-import os


 def get_chatterbot_version():
     chatterbot = importlib.import_module('chatterbot')
     return chatterbot.__version__


-def get_nltk_data_directories():
-    import nltk.data
-
-    data_directories = []
-
-    # Find each data directory in the NLTK path that has content
-    for path in nltk.data.path:
-        if os.path.exists(path):
-            if os.listdir(path):
-                data_directories.append(path)
-
-    return os.linesep.join(data_directories)
-
-
 if __name__ == '__main__':
     if '--version' in sys.argv:
         print(get_chatterbot_version())
-
-    if 'list_nltk_data' in sys.argv:
-        print(get_nltk_data_directories())
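With the NLTK data directories gone, the `list_nltk_data` command has nothing left to report. Under spaCy, the rough equivalent check (illustrative only; the shortcut model name is an assumption, not something this PR adds) is simply attempting to load the model:

```python
import spacy

try:
    # 'en' is a spaCy 2.x shortcut link; the name here is illustrative.
    nlp = spacy.load('en')
except OSError:
    # spaCy raises OSError when the model is missing; it can be
    # installed with: python -m spacy download en
    raise SystemExit('spaCy English model is not installed')
```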
25 changes: 0 additions & 25 deletions chatterbot/chatterbot.py
@@ -54,31 +54,6 @@ def __init__(self, name, **kwargs):
         # Allow the bot to save input it receives so that it can learn
         self.read_only = kwargs.get('read_only', False)

-        if kwargs.get('initialize', True):
-            self.initialize()
-
-    def get_initialization_functions(self):
-        initialization_functions = set()
-
-        initialization_functions.update(utils.get_initialization_functions(
-            self, 'storage.tagger'
-        ))
-
-        for search_algorithm in self.search_algorithms.values():
-            search_algorithm_functions = utils.get_initialization_functions(
-                search_algorithm, 'compare_statements'
-            )
-            initialization_functions.update(search_algorithm_functions)
-
-        return initialization_functions
-
-    def initialize(self):
-        """
-        Do any work that needs to be done before the chatbot can process responses.
-        """
-        for function in self.get_initialization_functions():
-            function()
-
     def get_response(self, statement=None, **kwargs):
         """
         Return the bot's response based on the input.
6 changes: 4 additions & 2 deletions chatterbot/storage/storage_adapter.py
@@ -1,6 +1,6 @@
 import logging
 from chatterbot import languages
-from chatterbot.tagging import PosHypernymTagger
+from chatterbot.tagging import PosLemmaTagger


 class StorageAdapter(object):
@@ -12,10 +12,12 @@ class StorageAdapter(object):
     def __init__(self, *args, **kwargs):
         """
         Initialize common attributes shared by all storage adapters.
+
+        :param str tagger_language: The language that the tagger uses to remove stopwords.
         """
         self.logger = kwargs.get('logger', logging.getLogger(__name__))

-        self.tagger = PosHypernymTagger(language=kwargs.get(
+        self.tagger = PosLemmaTagger(language=kwargs.get(
             'tagger_language', languages.ENG
         ))

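For storage adapter callers the tagger swap is transparent: the same `tagger_language` keyword and the same `get_bigram_pair_string` interface apply. A hedged usage sketch (assuming `PosLemmaTagger` keeps the interface of the class it replaces):

```python
from chatterbot import languages
from chatterbot.tagging import PosLemmaTagger

tagger = PosLemmaTagger(language=languages.ENG)

# The returned pair string is what the storage adapter indexes
# alongside each statement for later lookup.
print(tagger.get_bigram_pair_string('What a beautiful swamp'))
```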
145 changes: 0 additions & 145 deletions chatterbot/tagging.py
@@ -1,10 +1,5 @@
 import string
 from chatterbot import languages
-from chatterbot import utils
-from chatterbot.tokenizers import get_sentence_tokenizer
-from nltk import pos_tag
-from nltk.corpus import wordnet, stopwords
-from nltk.corpus.reader.wordnet import WordNetError
 import spacy


@@ -56,143 +51,3 @@ def get_bigram_pair_string(self, text):
         ]

         return ' '.join(bigram_pairs)
-
-
-class PosHypernymTagger(object):
-    """
-    For each non-stopword in a string, return a string where each word is a
-    hypernym preceded by the part of speech of the word before it.
-    """
-
-    def __init__(self, language=None):
-        self.language = language or languages.ENG
-
-        self.sentence_tokenizer = None
-
-        self.stopwords = None
-
-        self.initialization_functions = [
-            utils.download_nltk_stopwords,
-            utils.download_nltk_wordnet,
-            utils.download_nltk_averaged_perceptron_tagger
-        ]
-
-    def get_stopwords(self):
-        """
-        Get the list of stopwords from the NLTK corpus.
-        """
-        if self.stopwords is None:
-            self.stopwords = stopwords.words(self.language.ENGLISH_NAME.lower())
-
-        return self.stopwords
-
-    def tokenize_sentence(self, sentence):
-        """
-        Tokenize the provided sentence.
-        """
-        if self.sentence_tokenizer is None:
-            self.sentence_tokenizer = get_sentence_tokenizer(self.language)
-
-        return self.sentence_tokenizer.tokenize(sentence)
-
-    def stem_words(self, words):
-        """
-        Return the first character of the word in place of a part-of-speech tag.
-        """
-        return [
-            (word, word.lower()[0], ) for word in words
-        ]
-
-    def get_pos_tags(self, words):
-        try:
-            # pos_tag supports eng and rus
-            tags = pos_tag(words, lang=self.language.ISO_639)
-        except NotImplementedError:
-            tags = self.stem_words(words)
-        except LookupError:
-            tags = self.stem_words(words)
-
-        return tags
-
-    def get_hypernyms(self, pos_tags):
-        """
-        Return the hypernyms for each word in a list of POS tagged words.
-        """
-        results = []
-
-        for word, pos in pos_tags:
-            try:
-                synsets = wordnet.synsets(word, utils.treebank_to_wordnet(pos), lang=self.language.ISO_639)
-            except WordNetError:
-                synsets = None
-            except LookupError:
-                # Don't return any synsets if the language is not supported
-                synsets = None
-
-            if synsets:
-                synset = synsets[0]
-                hypernyms = synset.hypernyms()
-
-                if hypernyms:
-                    results.append(hypernyms[0].name().split('.')[0])
-                else:
-                    results.append(word)
-            else:
-                results.append(word)
-
-        return results
-
-    def get_bigram_pair_string(self, text):
-        """
-        For example:
-        What a beautiful swamp
-
-        becomes:
-
-        DT:beautiful JJ:wetland
-        """
-        WORD_INDEX = 0
-        POS_INDEX = 1
-
-        pos_tags = []
-
-        for sentence in self.tokenize_sentence(text.strip()):
-
-            # Remove punctuation
-            if sentence and sentence[-1] in string.punctuation:
-                sentence_with_punctuation_removed = sentence[:-1]
-
-                if sentence_with_punctuation_removed:
-                    sentence = sentence_with_punctuation_removed
-
-            words = sentence.split()
-
-            pos_tags.extend(self.get_pos_tags(words))
-
-        hypernyms = self.get_hypernyms(pos_tags)
-
-        high_quality_bigrams = []
-        all_bigrams = []
-
-        word_count = len(pos_tags)
-
-        if word_count == 1:
-            all_bigrams.append(
-                pos_tags[0][WORD_INDEX].lower()
-            )
-
-        for index in range(1, word_count):
-            word = pos_tags[index][WORD_INDEX].lower()
-            previous_word_pos = pos_tags[index - 1][POS_INDEX]
-            if word not in self.get_stopwords() and len(word) > 1:
-                bigram = previous_word_pos + ':' + hypernyms[index].lower()
-                high_quality_bigrams.append(bigram)
-                all_bigrams.append(bigram)
-            else:
-                bigram = previous_word_pos + ':' + word
-                all_bigrams.append(bigram)
-
-        if high_quality_bigrams:
-            all_bigrams = high_quality_bigrams
-
-        return ' '.join(all_bigrams)
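For comparison, the class deleted above leaned on three NLTK resources (stopwords, WordNet, and the averaged perceptron tagger) that had to be downloaded before first use. The core NLTK calls it wrapped, shown standalone (standard NLTK 3.x API; requires the corpora to be downloaded):

```python
from nltk import pos_tag
from nltk.corpus import wordnet

# Treebank part-of-speech tags, e.g. [('What', 'WP'), ('a', 'DT'), ...]
print(pos_tag(['What', 'a', 'beautiful', 'swamp']))

# First hypernym of the first synset: this is how 'swamp' generalized
# to 'wetland' in the deleted docstring's example.
synsets = wordnet.synsets('swamp', pos=wordnet.NOUN)
print(synsets[0].hypernyms()[0].name().split('.')[0])  # wetland
```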
62 changes: 0 additions & 62 deletions chatterbot/tokenizers.py

This file was deleted.
