diff --git a/afinn/afinn.py b/afinn/afinn.py index 35ed4a1..5359141 100644 --- a/afinn/afinn.py +++ b/afinn/afinn.py @@ -9,11 +9,11 @@ from os.path import dirname, join - LANGUAGE_TO_FILENAME = { 'da': 'AFINN-da-32.txt', 'en': 'AFINN-en-165.txt', 'emoticons': 'AFINN-emoticon-8.txt', + 'emoji': 'AFINN-emoji.txt', 'fr': 'AFINN-fr-165.txt', 'sv': 'AFINN-sv-165.txt', } @@ -56,7 +56,8 @@ class Afinn(object): """ - def __init__(self, language="en", emoticons=False, word_boundary=True): + def __init__(self, language="en", emoticons=False, emoji=False, + word_boundary=True): """Setup dictionary from data file. The language parameter can be set to English (en) or Danish (da). @@ -67,31 +68,46 @@ def __init__(self, language="en", emoticons=False, word_boundary=True): Specify language dictionary. emoticons : bool, optional Includes emoticons in the token list + emoji : bool, optional + Includes emoji in the token list word_boundary : bool, optional Use word boundary match in the regular expression. 
""" filename = LANGUAGE_TO_FILENAME[language] full_filename = self.full_filename(filename) - if emoticons: - # Words + + if emoticons or emoji: self._dict = self.read_word_file(full_filename) - regex_words = self.regex_from_tokens( + regexp_words = self.regex_from_tokens( list(self._dict), - word_boundary=True, capture=False) - - # Emoticons - filename_emoticons = LANGUAGE_TO_FILENAME['emoticons'] - full_filename_emoticons = self.full_filename(filename_emoticons) - emoticons_and_score = self.read_word_file(full_filename_emoticons) - self._dict.update(emoticons_and_score) - regex_emoticons = self.regex_from_tokens( - list(emoticons_and_score), word_boundary=False, - capture=False) + word_boundary=True, + capture=False + ) + + words = [] + if emoticons: + fname = LANGUAGE_TO_FILENAME['emoticons'] + full_fname = self.full_filename(fname) + emoticon_score_dct = self.read_word_file(full_fname) + self._dict.update(emoticon_score_dct) + words.extend(list(emoticon_score_dct)) + + if emoji: + full_fname = self.full_filename(LANGUAGE_TO_FILENAME['emoji']) + emoji_score_dct = self.read_word_file(full_fname) + self._dict.update(emoji_score_dct) + words.extend(list(emoji_score_dct)) + + regexp_extra = self.regex_from_tokens( + words, + word_boundary=False, + capture=False + ) # Combined words and emoticon regular expression - regex = '(' + regex_words + '|' + regex_emoticons + ')' - self._setup_pattern_from_regex(regex) + regexp = '(%s|%s)' % (regexp_words, regexp_extra) + self._setup_pattern_from_regex(regexp) else: self.setup_from_file(full_filename, word_boundary=word_boundary) diff --git a/afinn/data/AFINN-emoji.txt b/afinn/data/AFINN-emoji.txt new file mode 100644 index 0000000..6a00163 --- /dev/null +++ b/afinn/data/AFINN-emoji.txt @@ -0,0 +1,76 @@ +😀 3 +😬 -2 +😁 -2 +😂 2 +😃 3 +😄 3 +😅 3 +😆 3 +😇 1 +😉 1 +😊 2 +🙂 2 +🙃 1 +☺️ 2 +😋 1 +😌 1 +😍 3 +😘 2 +😗 1 +😙 1 +😚 1 +😜 1 +😝 1 +😛 1 +🤑 1 +🤓 1 +😎 1 +🤗 2 +😏 2 +😶 -1 +😐 -2 +😑 -2 +😒 -2 +🙄 -1 +🤔 -1 +😳 -1 +😞 -2 +😟 -2 +😠 -3 +😡 -3 
+😔 -2 +😕 -1 +🙁 -2 +☹️ -3 +😣 -2 +😖 -2 +😫 -3 +😩 -3 +😤 -2 +😮 -1 +😱 -1 +😨 -1 +😰 -1 +😯 -1 +😦 -1 +😧 -1 +😢 -2 +😥 -2 +😪 -2 +😓 -1 +😭 -3 +😵 -3 +😲 -3 +🤐 -1 +😴 -1 +👍 3 +👎 -3 +💋 1 +❤️ 3 +💛 3 +💚 3 +💙 3 +💜 3 +💔 -2 +💯 2 +🖕 -3 diff --git a/setup.py b/setup.py index ead6739..b7c6433 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name='afinn', packages=['afinn'], - version='0.2dev', + version='0.3', author='Finn Aarup Nielsen', author_email='faan@dtu.dk', description='AFINN sentiment analysis',