Skip to content

Commit

Permalink
add support for emoji
Browse files (browse the repository at this point in the history)
  • Loading branch information
Paul Durivage committed May 11, 2017
1 parent 90d08a2 commit 4896c8a
Showing 1 changed file with 34 additions and 18 deletions.
52 changes: 34 additions & 18 deletions afinn/afinn.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@

from os.path import dirname, join


LANGUAGE_TO_FILENAME = {
'da': 'AFINN-da-32.txt',
'en': 'AFINN-en-165.txt',
'emoticons': 'AFINN-emoticon-8.txt',
}
'emoji': 'AFINN-emoji.txt',
}


class AfinnException(Exception):
Expand Down Expand Up @@ -54,7 +54,8 @@ class Afinn(object):
"""

def __init__(self, language="en", emoticons=False, word_boundary=True):
def __init__(self, language="en", emoticons=False, emoji=False,
word_boundary=True):
"""Setup dictionary from data file.
The language parameter can be set to English (en) or Danish (da).
Expand All @@ -65,31 +66,46 @@ def __init__(self, language="en", emoticons=False, word_boundary=True):
Specify language dictionary.
emoticons : bool, optional
Includes emoticons in the token list
emoji : bool, optional
Includes emoji in the token list
word_boundary : bool, optional
Use word boundary match in the regular expression.
"""
filename = LANGUAGE_TO_FILENAME[language]
full_filename = self.full_filename(filename)
if emoticons:
# Words

if emoticons or emoji:
self._dict = self.read_word_file(full_filename)
regex_words = self.regex_from_tokens(
regexp_words = self.regex_from_tokens(
list(self._dict),
word_boundary=True, capture=False)

# Emoticons
filename_emoticons = LANGUAGE_TO_FILENAME['emoticons']
full_filename_emoticons = self.full_filename(filename_emoticons)
emoticons_and_score = self.read_word_file(full_filename_emoticons)
self._dict.update(emoticons_and_score)
regex_emoticons = self.regex_from_tokens(
list(emoticons_and_score), word_boundary=False,
capture=False)
word_boundary=True,
capture=False
)

words = []
if emoticons:
fname = LANGUAGE_TO_FILENAME['emoticons']
full_fname = self.full_filename(fname)
emoticon_score_dct = self.read_word_file(full_fname)
self._dict.update(emoticon_score_dct)
words.extend(list(emoticon_score_dct))

if emoji:
full_fname = self.full_filename(LANGUAGE_TO_FILENAME['emoji'])
emoji_score_dct = self.read_word_file(full_fname)
self._dict.update(emoji_score_dct)
words.extend(list(emoji_score_dct))

regexp_extra = self.regex_from_tokens(
words,
word_boundary=False,
capture=False
)

# Combined words and emoticon regular expression
regex = '(' + regex_words + '|' + regex_emoticons + ')'
self._setup_pattern_from_regex(regex)
regexp = '(%s|%s)' % (regexp_words, regexp_extra)
self._setup_pattern_from_regex(regexp)

else:
self.setup_from_file(full_filename, word_boundary=word_boundary)
Expand Down

0 comments on commit 4896c8a

Please sign in to comment.