Merge pull request #361 from gunthercox/nltk
Clean up nltk-based language utils and tests
gunthercox authored Oct 26, 2016
2 parents f991d47 + 2c60a3a commit cb2d34c
Showing 6 changed files with 91 additions and 84 deletions.
44 changes: 16 additions & 28 deletions chatterbot/conversation/comparisons.py
@@ -1,4 +1,13 @@
+"""
+This module contains various text-comparison algorithms
+designed to compare one statement to another.
+"""
+
 def levenshtein_distance(statement, other_statement):
+    """
+    Compare two statements based on the Levenshtein distance
+    (fuzzy string comparison) of each statement's text.
+    """
     from fuzzywuzzy import fuzz
 
     return fuzz.ratio(statement.text.lower(), other_statement.text.lower())
@@ -7,38 +16,18 @@ def levenshtein_distance(statement, other_statement):
 def synset_distance(statement, other_statement):
     """
     Calculate the similarity of two statements.
-    This is based on the total similarity between
-    each word in each sentence.
+    This is based on the total maximum synset similarity
+    between each word in each sentence.
     """
-    from chatterbot.utils.pos_tagger import POSTagger
-    from chatterbot.utils.stop_words import StopWordsManager
-    from chatterbot.utils.word_net import Wordnet
+    from chatterbot.utils.wordnet import Wordnet
+    from chatterbot.utils.tokenizer import Tokenizer
     import itertools
 
     wordnet = Wordnet()
-    tagger = POSTagger()
-    stopwords = StopWordsManager()
-
-    def get_tokens(text, exclude_stop_words=True):
-        """
-        Takes a string and converts it to a tuple
-        of each word. Skips common stop words such
-        as ("is, the, a, ...") if 'exclude_stop_words'
-        is True.
-        """
-        lower = text.lower()
-        tokens = tagger.tokenize(lower)
-
-        # Remove any stop words from the string
-        if exclude_stop_words:
-            excluded_words = stopwords.words('english')
-
-            tokens = set(tokens) - set(excluded_words)
-
-        return tokens
+    tokenizer = Tokenizer()
 
-    tokens1 = get_tokens(statement.text)
-    tokens2 = get_tokens(other_statement.text)
+    tokens1 = tokenizer.get_tokens(statement.text)
+    tokens2 = tokenizer.get_tokens(other_statement.text)
 
     total_similarity = 0
 
@@ -89,7 +78,6 @@ def jaccard_similarity(a, b, threshold=0.5):
     Given our threshold above, we would consider this to be a match
     """
     from nltk.corpus import wordnet
-    import nltk.corpus
     import nltk
     import string
 
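For context, a rough usage sketch of the refactored comparison functions. The Statement construction below is an assumption based on chatterbot's Statement class taking the statement text as its first argument; it is not part of this commit.

    from chatterbot.conversation import Statement
    from chatterbot.conversation.comparisons import (
        levenshtein_distance, synset_distance
    )

    a = Statement('What time is it?')
    b = Statement('Do you know what time it is?')

    # fuzz.ratio returns an integer percentage between 0 and 100
    print(levenshtein_distance(a, b))

    # synset_distance now tokenizes through the new Tokenizer class
    # before summing maximum synset similarities
    print(synset_distance(a, b))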
27 changes: 0 additions & 27 deletions chatterbot/utils/pos_tagger.py

This file was deleted.

21 changes: 8 additions & 13 deletions chatterbot/utils/stop_words.py
@@ -1,20 +1,18 @@
 from nltk.corpus import stopwords
 
 
-class StopWordsManager():
+class StopWordsManager(object):
     """
-    A custom-implementation of Stop words. Not many
-    features are supported at the moment, only:
-    1) remove_stopwords: Removes the stopwords of the
-       passed language from the tokens given
-    2) words: Returns a list of stopwords for a given
-       language
+    A stop words utility class.
     """
 
     def __init__(self):
         from nltk.data import find
         from nltk import download
 
         # Download the stopwords data only if it is not already downloaded
         try:
             find('stopwords.zip')
         except LookupError:
@@ -25,13 +23,10 @@ def remove_stopwords(self, language, tokens):
         Takes a language (i.e. 'english'), and a set of word tokens.
         Returns the tokenized text with any stopwords removed.
         """
-        stop_words = self.words(language)
-        tokens = set(tokens) - set(stop_words)
+        # Get the stopwords for the specified language
+        stop_words = stopwords.words(language)
 
-        return tokens
+        # Remove the stop words from the set of word tokens
+        tokens = set(tokens) - set(stop_words)
 
-    def words(self, language):
-        """
-        Returns the stopwords for the given language.
-        """
-        return stopwords.words(language)
+        return tokens
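A minimal sketch of the reworked method's behavior. Note that it returns a set rather than a list, because set subtraction is used internally:

    from chatterbot.utils.stop_words import StopWordsManager

    manager = StopWordsManager()

    # 'what', 'is', and 'it' are in NLTK's English stop word list,
    # so only 'time' survives, returned as a set
    print(manager.remove_stopwords('english', ['what', 'time', 'is', 'it']))
    # {'time'}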
32 changes: 32 additions & 0 deletions chatterbot/utils/tokenizer.py
@@ -0,0 +1,32 @@
+class Tokenizer(object):
+    """
+    A string tokenization utility class.
+    """
+
+    def __init__(self):
+        from nltk.data import find
+        from nltk import download
+
+        # Download the punkt data only if it is not already downloaded
+        try:
+            find('punkt.zip')
+        except LookupError:
+            download('punkt')
+
+    def get_tokens(self, text, language='english', exclude_stop_words=True):
+        """
+        Takes a string and converts it to a tuple of each word.
+        Skips common stop words such as ("is, the, a, ...")
+        if 'exclude_stop_words' is True.
+        """
+        from chatterbot.utils.stop_words import StopWordsManager
+        from nltk import word_tokenize
+
+        stopwords = StopWordsManager()
+        tokens = word_tokenize(text.lower())
+
+        # Remove all stop words from the list of word tokens
+        if exclude_stop_words:
+            tokens = stopwords.remove_stopwords(language, tokens)
+
+        return tokens
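One nuance worth noting, visible in the tests further down: despite the docstring's mention of a tuple, get_tokens returns an ordered list when exclude_stop_words is False, but a set once stop words are removed, since remove_stopwords performs set subtraction. A short sketch:

    from chatterbot.utils.tokenizer import Tokenizer

    tokenizer = Tokenizer()

    # word_tokenize preserves order and duplicates, so this is a list
    tokenizer.get_tokens('what time is it', exclude_stop_words=False)
    # ['what', 'time', 'is', 'it']

    # Stop word removal converts the tokens to a set, so ordering is lost
    tokenizer.get_tokens('what time is it')
    # {'time'}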
2 changes: 1 addition & 1 deletion chatterbot/utils/{word_net.py → wordnet.py}
@@ -1,7 +1,7 @@
 from nltk.corpus import wordnet
 
 
-class Wordnet():
+class Wordnet(object):
     """
     A custom-implementation of Wordnet. Not many
     features are supported at the moment, only:
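The class remains a thin wrapper around nltk.corpus.wordnet. As a sketch of the lookup the tests exercise, assuming the wrapper's synsets method simply delegates to NLTK:

    from chatterbot.utils.wordnet import Wordnet

    wordnet = Wordnet()

    # Each synset is one WordNet sense of the word 'test'
    for synset in wordnet.synsets('test'):
        print(synset.name())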
49 changes: 34 additions & 15 deletions tests/test_utils.py
@@ -1,43 +1,62 @@
 # -*- coding: utf-8 -*-
 from unittest import TestCase
 
-from chatterbot.utils.clean import clean_whitespace
+from chatterbot.utils.clean import clean
 from chatterbot.utils.module_loading import import_module
-from chatterbot.utils.pos_tagger import POSTagger
-from chatterbot.utils.stop_words import StopWordsManager
-from chatterbot.utils.word_net import Wordnet
 
 
 class UtilityTests(TestCase):
 
     def test_import_module(self):
-        datetime = import_module("datetime.datetime")
+        datetime = import_module('datetime.datetime')
         self.assertTrue(hasattr(datetime, 'now'))
 
 
-class LanguageUtilityTests(TestCase):
+class TokenizerTestCase(TestCase):
+
+    def setUp(self):
+        super(TokenizerTestCase, self).setUp()
+        from chatterbot.utils.tokenizer import Tokenizer
 
-    def test_pos_tagger_tokenize(self):
-        pos_tagger = POSTagger()
-        tokens = pos_tagger.tokenize("what time is it")
+        self.tokenizer = Tokenizer()
+
+    def test_get_tokens(self):
+        tokens = self.tokenizer.get_tokens('what time is it', exclude_stop_words=False)
         self.assertEqual(tokens, ['what', 'time', 'is', 'it'])
 
-    def test_remove_stop_words(self):
-        stopwords_manager = StopWordsManager()
+    def test_get_tokens_exclude_stop_words(self):
+        tokens = self.tokenizer.get_tokens('what time is it', exclude_stop_words=True)
+        self.assertEqual(tokens, {'time'})
+
+
+class StopWordsTestCase(TestCase):
+
+    def setUp(self):
+        super(StopWordsTestCase, self).setUp()
+        from chatterbot.utils.stop_words import StopWordsManager
+
+        self.stopwords_manager = StopWordsManager()
+
+    def test_remove_stop_words(self):
         tokens = ['this', 'is', 'a', 'test', 'string']
-        words = stopwords_manager.remove_stopwords('english', tokens)
+        words = self.stopwords_manager.remove_stopwords('english', tokens)
 
         # This example list of words should end up with only two elements
         self.assertEqual(len(words), 2)
         self.assertIn('test', list(words))
         self.assertIn('string', list(words))
 
-    def test_word_net(self):
-        wordnet = Wordnet()
-        synsets = wordnet.synsets('test')
+
+class WordnetTestCase(TestCase):
+
+    def setUp(self):
+        super(WordnetTestCase, self).setUp()
+        from chatterbot.utils.wordnet import Wordnet
+
+        self.wordnet = Wordnet()
+
+    def test_wordnet(self):
+        synsets = self.wordnet.synsets('test')
 
         self.assertEqual(
             0.06666666666666667,
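To run just the reorganized test cases, something like the standard unittest loader should work (assuming the NLTK data downloads succeed on first use):

    import unittest
    from tests.test_utils import (
        TokenizerTestCase, StopWordsTestCase, WordnetTestCase
    )

    # Build a suite from the three new test cases and run it
    loader = unittest.TestLoader()
    suite = unittest.TestSuite([
        loader.loadTestsFromTestCase(TokenizerTestCase),
        loader.loadTestsFromTestCase(StopWordsTestCase),
        loader.loadTestsFromTestCase(WordnetTestCase),
    ])
    unittest.TextTestRunner(verbosity=2).run(suite)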
