Switch from NLTK to Spacy for text indexing #1684

Merged 6 commits on Apr 6, 2019
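In short, this PR drops the NLTK-based PosHypernymTagger, along with the download-and-initialize machinery its corpora required, and indexes text with a spaCy-based PosLemmaTagger instead. The new tagger itself is not visible in the hunks below; as a rough sketch only (assuming spaCy 2.x, a lazily loaded model, and a plain string model name rather than the chatterbot.languages constant the real code uses), the approach looks something like this:

```python
import spacy


class PosLemmaTagger(object):
    """
    Sketch of a lemma-based tagger: pair each word's lemma with the
    part-of-speech tag of the word before it, e.g. 'What a beautiful
    swamp' -> 'DET:beautiful ADJ:swamp'. The hypernym version this PR
    removes produced 'DT:beautiful JJ:wetland' for the same input.
    """

    def __init__(self, language=None):
        self.language = language or 'en'
        # Load the spaCy model lazily so importing this module stays cheap.
        self.nlp = None

    def get_bigram_pair_string(self, text):
        if self.nlp is None:
            self.nlp = spacy.load(self.language)

        tokens = [token for token in self.nlp(text) if not token.is_punct]

        if len(tokens) == 1:
            return tokens[0].text.lower()

        return ' '.join(
            '{}:{}'.format(previous.pos_, current.lemma_.lower())
            for previous, current in zip(tokens[:-1], tokens[1:])
        )
```

Because spaCy bundles its tagger and lemmatizer with the model, no runtime corpus downloads are needed, which is what lets the initialization hooks below disappear.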
18 changes: 0 additions & 18 deletions chatterbot/__main__.py
@@ -1,30 +1,12 @@
 import importlib
 import sys
-import os


 def get_chatterbot_version():
     chatterbot = importlib.import_module('chatterbot')
     return chatterbot.__version__


-def get_nltk_data_directories():
-    import nltk.data
-
-    data_directories = []
-
-    # Find each data directory in the NLTK path that has content
-    for path in nltk.data.path:
-        if os.path.exists(path):
-            if os.listdir(path):
-                data_directories.append(path)
-
-    return os.linesep.join(data_directories)
-
-
 if __name__ == '__main__':
     if '--version' in sys.argv:
         print(get_chatterbot_version())
-
-    if 'list_nltk_data' in sys.argv:
-        print(get_nltk_data_directories())
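With the NLTK data directories gone, the `list_nltk_data` command has nothing left to report. Under spaCy, the rough equivalent check (illustrative only; the shortcut model name is an assumption, not something this PR adds) is simply attempting to load the model:

```python
import spacy

try:
    # 'en' is a spaCy 2.x shortcut link; the name here is illustrative.
    nlp = spacy.load('en')
except OSError:
    # spaCy raises OSError when the model is missing; it can be
    # installed with: python -m spacy download en
    raise SystemExit('spaCy English model is not installed')
```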
25 changes: 0 additions & 25 deletions chatterbot/chatterbot.py
@@ -54,31 +54,6 @@ def __init__(self, name, **kwargs):
         # Allow the bot to save input it receives so that it can learn
         self.read_only = kwargs.get('read_only', False)

-        if kwargs.get('initialize', True):
-            self.initialize()
-
-    def get_initialization_functions(self):
-        initialization_functions = set()
-
-        initialization_functions.update(utils.get_initialization_functions(
-            self, 'storage.tagger'
-        ))
-
-        for search_algorithm in self.search_algorithms.values():
-            search_algorithm_functions = utils.get_initialization_functions(
-                search_algorithm, 'compare_statements'
-            )
-            initialization_functions.update(search_algorithm_functions)
-
-        return initialization_functions
-
-    def initialize(self):
-        """
-        Do any work that needs to be done before the chatbot can process responses.
-        """
-        for function in self.get_initialization_functions():
-            function()
-
     def get_response(self, statement=None, **kwargs):
         """
         Return the bot's response based on the input.
6 changes: 4 additions & 2 deletions chatterbot/storage/storage_adapter.py
@@ -1,6 +1,6 @@
 import logging
 from chatterbot import languages
-from chatterbot.tagging import PosHypernymTagger
+from chatterbot.tagging import PosLemmaTagger


 class StorageAdapter(object):
@@ -12,10 +12,12 @@ class StorageAdapter(object):
     def __init__(self, *args, **kwargs):
         """
         Initialize common attributes shared by all storage adapters.
+
+        :param str tagger_language: The language that the tagger uses to remove stopwords.
         """
         self.logger = kwargs.get('logger', logging.getLogger(__name__))

-        self.tagger = PosHypernymTagger(language=kwargs.get(
+        self.tagger = PosLemmaTagger(language=kwargs.get(
             'tagger_language', languages.ENG
         ))

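For storage adapter callers the tagger swap is transparent: the same `tagger_language` keyword and the same `get_bigram_pair_string` interface apply. A hedged usage sketch (assuming `PosLemmaTagger` keeps the interface of the class it replaces):

```python
from chatterbot import languages
from chatterbot.tagging import PosLemmaTagger

tagger = PosLemmaTagger(language=languages.ENG)

# The returned pair string is what the storage adapter indexes
# alongside each statement for later lookup.
print(tagger.get_bigram_pair_string('What a beautiful swamp'))
```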
145 changes: 0 additions & 145 deletions chatterbot/tagging.py
@@ -1,10 +1,5 @@
 import string
 from chatterbot import languages
-from chatterbot import utils
-from chatterbot.tokenizers import get_sentence_tokenizer
-from nltk import pos_tag
-from nltk.corpus import wordnet, stopwords
-from nltk.corpus.reader.wordnet import WordNetError
 import spacy


@@ -56,143 +51,3 @@ def get_bigram_pair_string(self, text):
         ]

         return ' '.join(bigram_pairs)
-
-
-class PosHypernymTagger(object):
-    """
-    For each non-stopword in a string, return a string where each word is a
-    hypernym preceded by the part of speech of the word before it.
-    """
-
-    def __init__(self, language=None):
-        self.language = language or languages.ENG
-
-        self.sentence_tokenizer = None
-
-        self.stopwords = None
-
-        self.initialization_functions = [
-            utils.download_nltk_stopwords,
-            utils.download_nltk_wordnet,
-            utils.download_nltk_averaged_perceptron_tagger
-        ]
-
-    def get_stopwords(self):
-        """
-        Get the list of stopwords from the NLTK corpus.
-        """
-        if self.stopwords is None:
-            self.stopwords = stopwords.words(self.language.ENGLISH_NAME.lower())
-
-        return self.stopwords
-
-    def tokenize_sentence(self, sentence):
-        """
-        Tokenize the provided sentence.
-        """
-        if self.sentence_tokenizer is None:
-            self.sentence_tokenizer = get_sentence_tokenizer(self.language)
-
-        return self.sentence_tokenizer.tokenize(sentence)
-
-    def stem_words(self, words):
-        """
-        Return the first character of the word in place of a part-of-speech tag.
-        """
-        return [
-            (word, word.lower()[0], ) for word in words
-        ]
-
-    def get_pos_tags(self, words):
-        try:
-            # pos_tag supports eng and rus
-            tags = pos_tag(words, lang=self.language.ISO_639)
-        except NotImplementedError:
-            tags = self.stem_words(words)
-        except LookupError:
-            tags = self.stem_words(words)
-
-        return tags
-
-    def get_hypernyms(self, pos_tags):
-        """
-        Return the hypernyms for each word in a list of POS tagged words.
-        """
-        results = []
-
-        for word, pos in pos_tags:
-            try:
-                synsets = wordnet.synsets(word, utils.treebank_to_wordnet(pos), lang=self.language.ISO_639)
-            except WordNetError:
-                synsets = None
-            except LookupError:
-                # Don't return any synsets if the language is not supported
-                synsets = None
-
-            if synsets:
-                synset = synsets[0]
-                hypernyms = synset.hypernyms()
-
-                if hypernyms:
-                    results.append(hypernyms[0].name().split('.')[0])
-                else:
-                    results.append(word)
-            else:
-                results.append(word)
-
-        return results
-
-    def get_bigram_pair_string(self, text):
-        """
-        For example:
-        What a beautiful swamp
-
-        becomes:
-
-        DT:beautiful JJ:wetland
-        """
-        WORD_INDEX = 0
-        POS_INDEX = 1
-
-        pos_tags = []
-
-        for sentence in self.tokenize_sentence(text.strip()):
-
-            # Remove punctuation
-            if sentence and sentence[-1] in string.punctuation:
-                sentence_with_punctuation_removed = sentence[:-1]
-
-                if sentence_with_punctuation_removed:
-                    sentence = sentence_with_punctuation_removed
-
-            words = sentence.split()
-
-            pos_tags.extend(self.get_pos_tags(words))
-
-        hypernyms = self.get_hypernyms(pos_tags)
-
-        high_quality_bigrams = []
-        all_bigrams = []
-
-        word_count = len(pos_tags)
-
-        if word_count == 1:
-            all_bigrams.append(
-                pos_tags[0][WORD_INDEX].lower()
-            )
-
-        for index in range(1, word_count):
-            word = pos_tags[index][WORD_INDEX].lower()
-            previous_word_pos = pos_tags[index - 1][POS_INDEX]
-            if word not in self.get_stopwords() and len(word) > 1:
-                bigram = previous_word_pos + ':' + hypernyms[index].lower()
-                high_quality_bigrams.append(bigram)
-                all_bigrams.append(bigram)
-            else:
-                bigram = previous_word_pos + ':' + word
-                all_bigrams.append(bigram)
-
-        if high_quality_bigrams:
-            all_bigrams = high_quality_bigrams
-
-        return ' '.join(all_bigrams)
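For comparison, the class deleted above leaned on three NLTK resources (stopwords, WordNet, and the averaged perceptron tagger) that had to be downloaded before first use. The core NLTK calls it wrapped, shown standalone (standard NLTK 3.x API; requires the corpora to be downloaded):

```python
from nltk import pos_tag
from nltk.corpus import wordnet

# Treebank part-of-speech tags, e.g. [('What', 'WP'), ('a', 'DT'), ...]
print(pos_tag(['What', 'a', 'beautiful', 'swamp']))

# First hypernym of the first synset: this is how 'swamp' generalized
# to 'wetland' in the deleted docstring's example.
synsets = wordnet.synsets('swamp', pos=wordnet.NOUN)
print(synsets[0].hypernyms()[0].name().split('.')[0])  # wetland
```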
62 changes: 0 additions & 62 deletions chatterbot/tokenizers.py

This file was deleted.
