-
Notifications
You must be signed in to change notification settings - Fork 4.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Clean up nltk-based language utils and tests.
- Loading branch information
1 parent
f991d47
commit 2c60a3a
Showing
6 changed files
with
91 additions
and
84 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
class Tokenizer(object):
    """
    Utility class for splitting text strings into word tokens.
    """

    def __init__(self):
        from nltk.data import find
        from nltk import download

        # Fetch the punkt tokenizer models only when they are not
        # already available locally.
        try:
            find('punkt.zip')
        except LookupError:
            download('punkt')

    def get_tokens(self, text, language='english', exclude_stop_words=True):
        """
        Split a string into a list of lower-cased word tokens.

        Common stop words (e.g. "is", "the", "a") for the given
        language are filtered out unless 'exclude_stop_words'
        is set to False.
        """
        from chatterbot.utils.stop_words import StopWordsManager
        from nltk import word_tokenize

        stop_word_manager = StopWordsManager()
        word_list = word_tokenize(text.lower())

        # Strip stop words from the token list when requested.
        if exclude_stop_words:
            word_list = stop_word_manager.remove_stopwords(language, word_list)

        return word_list
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters