From 728844a8d0dfc26f6f226e16a421d1f2bfb5a310 Mon Sep 17 00:00:00 2001 From: Gunther Cox Date: Sun, 3 Dec 2017 16:45:11 -0500 Subject: [PATCH 1/4] Test training with emoji characters. --- tests/training_tests/test_list_training.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/training_tests/test_list_training.py b/tests/training_tests/test_list_training.py index 74bd46056..900330ef7 100644 --- a/tests/training_tests/test_list_training.py +++ b/tests/training_tests/test_list_training.py @@ -115,6 +115,22 @@ def test_training_with_unicode_characters(self): self.assertEqual(response, conversation[2]) + def test_training_with_emoji_characters(self): + """ + Ensure that the training method adds statements containing emojis. + """ + conversation = [ + u'Hi, how are you? 😃', + u'I am just dandy 👍', + u'Superb! 🎆' + ] + + self.chatbot.train(conversation) + + response = self.chatbot.get_response(conversation[1]) + + self.assertEqual(response, conversation[2]) + def test_similar_sentence_gets_same_response_multiple_times(self): """ Tests if the bot returns the same response for the same From 681414372dbed73f0aa7037f1c43cb037e70f565 Mon Sep 17 00:00:00 2001 From: Gunther Cox Date: Sun, 3 Dec 2017 16:57:47 -0500 Subject: [PATCH 2/4] Test training with an 8-bit bytestring. This will cause an error in SqlAlchemy in Python 2.7 ProgrammingError: (sqlite3.ProgrammingError) You must not use 8-bit bytestrings unless you use a text_factory that can interpret 8-bit bytestrings (like text_factory = str). It is highly recommended that you instead just switch your application to Unicode strings. [SQL: u'SELECT statement.id AS statement_id, statement.text AS statement_text, statement.extra_data AS statement_extra_data \nFROM statement \nWHERE statement.text = ?\n LIMIT ? OFFSET ?'] [parameters: ('\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x90\x97', 1, 0)] --- tests/training_tests/test_list_training.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/training_tests/test_list_training.py b/tests/training_tests/test_list_training.py index 900330ef7..dd63f2c07 100644 --- a/tests/training_tests/test_list_training.py +++ b/tests/training_tests/test_list_training.py @@ -131,6 +131,22 @@ def test_training_with_emoji_characters(self): self.assertEqual(response, conversation[2]) + def test_training_with_unicode_bytestring(self): + """ + Test training with an 8-bit bytestring. + """ + conversation = [ + 'Hi, how are you?', + '\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x90\x97', + 'Superb!' + ] + + self.chatbot.train(conversation) + + response = self.chatbot.get_response(conversation[1]) + + self.assertEqual(response, conversation[2]) + def test_similar_sentence_gets_same_response_multiple_times(self): """ Tests if the bot returns the same response for the same From 5a527b11e4b64a639f8b722b96721fa7728ea0bb Mon Sep 17 00:00:00 2001 From: Gunther Cox Date: Sun, 3 Dec 2017 22:01:01 -0500 Subject: [PATCH 3/4] Make sure unicode strings are saved as text --- chatterbot/conversation/statement.py | 8 ++++++++ chatterbot/input/input_adapter.py | 1 + 2 files changed, 9 insertions(+) diff --git a/chatterbot/conversation/statement.py b/chatterbot/conversation/statement.py index a1938dd9f..dd3e5ca7a 100644 --- a/chatterbot/conversation/statement.py +++ b/chatterbot/conversation/statement.py @@ -29,6 +29,7 @@ class Statement(StatementMixin): """ def __init__(self, text, **kwargs): + import sys # Try not to allow non-string types to be passed to statements try: @@ -36,6 +37,13 @@ def __init__(self, text, **kwargs): except UnicodeEncodeError: pass + # Prefer decoded utf8-strings in Python 2.7 + if sys.version_info[0] < 3: + try: + text = text.decode('utf-8') + except UnicodeEncodeError: + pass + self.text = text self.tags = kwargs.pop('tags', []) self.in_response_to = kwargs.pop('in_response_to', []) diff --git a/chatterbot/input/input_adapter.py b/chatterbot/input/input_adapter.py index 2e764d1a5..17b1dbe14 100644 --- a/chatterbot/input/input_adapter.py +++ b/chatterbot/input/input_adapter.py @@ -19,6 +19,7 @@ def process_input_statement(self, *args, **kwargs): Return an existing statement object (if one exists). """ input_statement = self.process_input(*args, **kwargs) + self.logger.info('Received input statement: {}'.format(input_statement.text)) existing_statement = self.chatbot.storage.find(input_statement.text) From 1f6a0fa37a6d26b42f9acafb226370967a91b8ce Mon Sep 17 00:00:00 2001 From: Gunther Cox Date: Sun, 3 Dec 2017 22:09:34 -0500 Subject: [PATCH 4/4] Force unicode conversion. --- chatterbot/ext/sqlalchemy_app/models.py | 11 ++++++----- chatterbot/ext/sqlalchemy_app/types.py | 21 +++++++++++++++++++++ 2 files changed, 27 insertions(+), 5 deletions(-) create mode 100644 chatterbot/ext/sqlalchemy_app/types.py diff --git a/chatterbot/ext/sqlalchemy_app/models.py b/chatterbot/ext/sqlalchemy_app/models.py index 121a6587a..8be09f86e 100644 --- a/chatterbot/ext/sqlalchemy_app/models.py +++ b/chatterbot/ext/sqlalchemy_app/models.py @@ -1,7 +1,8 @@ -from sqlalchemy import Table, Column, Integer, String, DateTime, ForeignKey, PickleType +from sqlalchemy import Table, Column, Integer, DateTime, ForeignKey, PickleType from sqlalchemy.orm import relationship from sqlalchemy.sql import func from sqlalchemy.ext.declarative import declared_attr, declarative_base +from chatterbot.ext.sqlalchemy_app.types import UnicodeString from chatterbot.conversation.statement import StatementMixin @@ -40,7 +41,7 @@ class Tag(Base): A tag that describes a statement. """ - name = Column(String) + name = Column(UnicodeString) class Statement(Base, StatementMixin): @@ -48,7 +49,7 @@ class Statement(Base, StatementMixin): A Statement represents a sentence or phrase. """ - text = Column(String, unique=True) + text = Column(UnicodeString, unique=True) tags = relationship( 'Tag', @@ -90,7 +91,7 @@ class Response(Base): Response, contains responses related to a given statement. """ - text = Column(String) + text = Column(UnicodeString) created_at = Column( DateTime(timezone=True), @@ -99,7 +100,7 @@ class Response(Base): occurrence = Column(Integer, default=1) - statement_text = Column(String, ForeignKey('statement.text')) + statement_text = Column(UnicodeString, ForeignKey('statement.text')) statement_table = relationship( 'Statement', diff --git a/chatterbot/ext/sqlalchemy_app/types.py b/chatterbot/ext/sqlalchemy_app/types.py new file mode 100644 index 000000000..b48f4f6e4 --- /dev/null +++ b/chatterbot/ext/sqlalchemy_app/types.py @@ -0,0 +1,21 @@ +from sqlalchemy.types import TypeDecorator, Unicode + + +class UnicodeString(TypeDecorator): + """ + Type for unicode strings. + """ + + impl = Unicode + + def process_bind_param(self, value, dialect): + """ + Coerce Python bytestrings to unicode before + saving them to the database. + """ + import sys + + if sys.version_info[0] < 3: + if isinstance(value, str): + value = value.decode('utf-8') + return value