diff --git a/docs/language-responses.rst b/docs/language-responses.rst index b6c14d5b8856..3bc86ec3cdac 100644 --- a/docs/language-responses.rst +++ b/docs/language-responses.rst @@ -14,3 +14,10 @@ Sentiment .. automodule:: gcloud.language.sentiment :members: :show-inheritance: + +Syntax +~~~~~~ + +.. automodule:: gcloud.language.syntax + :members: + :show-inheritance: diff --git a/docs/language-usage.rst b/docs/language-usage.rst index accaa9a2bf9c..dfaae7b87f16 100644 --- a/docs/language-usage.rst +++ b/docs/language-usage.rst @@ -216,7 +216,7 @@ machine learning and need in-depth text features to build upon. The method returns a named tuple with four entries: * ``sentences``: A :class:`list` of sentences in the text -* ``tokens``: A :class:`list` of :class:`~gcloud.language.token.Token` +* ``tokens``: A :class:`list` of :class:`~gcloud.language.syntax.Token` object (e.g. words, punctuation) * ``sentiment``: The :class:`~gcloud.language.sentiment.Sentiment` of the text (as returned by diff --git a/gcloud/language/document.py b/gcloud/language/document.py index db21cc7d0b57..e811dabb4f0d 100644 --- a/gcloud/language/document.py +++ b/gcloud/language/document.py @@ -17,14 +17,38 @@ A document is used to hold text to be analyzed and annotated. """ +import collections + from gcloud.language.entity import Entity from gcloud.language.sentiment import Sentiment +from gcloud.language.syntax import Sentence +from gcloud.language.syntax import Token DEFAULT_LANGUAGE = 'en-US' """Default document language, English.""" +Annotations = collections.namedtuple( + 'Annotations', + 'sentences tokens sentiment entities') +"""Annotations for a document. + +:type sentences: list +:param sentences: List of :class:`.Sentence` in a document. + +:type tokens: list +:param tokens: List of :class:`.Token` from a document. + +:type sentiment: :class:`Sentiment` +:param sentiment: The sentiment of a document. 
+ +:type entities: list +:param entities: List of :class:`~.language.entity.Entity` + found in a document. +""" + + +class Encoding(object): """Document text encoding types.""" @@ -163,3 +187,75 @@ def analyze_sentiment(self): api_response = self.client.connection.api_request( method='POST', path='analyzeSentiment', data=data) return Sentiment.from_api_repr(api_response['documentSentiment']) + + def annotate_text(self, include_syntax=True, include_entities=True, + include_sentiment=True): + """Advanced natural language API: document syntax and other features. + + Includes the full functionality of :meth:`analyze_entities` and + :meth:`analyze_sentiment`, enabled by the flags + ``include_entities`` and ``include_sentiment`` respectively. + + In addition ``include_syntax`` adds a new feature that analyzes + the document for semantic and syntactic information. + + .. note:: + + This API is intended for users who are familiar with machine + learning and need in-depth text features to build upon. + + .. _annotateText: https://cloud.google.com/natural-language/\ reference/rest/v1beta1/documents/annotateText + + See `annotateText`_. + + :type include_syntax: bool + :param include_syntax: (Optional) Flag to enable syntax analysis + of the current document. + + :type include_entities: bool + :param include_entities: (Optional) Flag to enable entity extraction + from the current document. + + :type include_sentiment: bool + :param include_sentiment: (Optional) Flag to enable sentiment + analysis of the current document. + + :rtype: :class:`Annotations` + :returns: A tuple of each of the four values returned from the API: + sentences, tokens, sentiment and entities. 
+ """ + features = {} + if include_syntax: + features['extractSyntax'] = True + if include_entities: + features['extractEntities'] = True + if include_sentiment: + features['extractDocumentSentiment'] = True + + data = { + 'document': self._to_dict(), + 'features': features, + 'encodingType': self.encoding, + } + api_response = self.client.connection.api_request( + method='POST', path='annotateText', data=data) + + sentences = [Sentence.from_api_repr(sentence) + for sentence in api_response['sentences']] + tokens = [Token.from_api_repr(token) + for token in api_response['tokens']] + sentiment_info = api_response.get('documentSentiment') + if sentiment_info is None: + sentiment = None + else: + sentiment = Sentiment.from_api_repr(sentiment_info) + entities = [Entity.from_api_repr(entity) + for entity in api_response['entities']] + annotations = Annotations( + sentences=sentences, + tokens=tokens, + sentiment=sentiment, + entities=entities, + ) + return annotations diff --git a/gcloud/language/sentiment.py b/gcloud/language/sentiment.py index ab6a1209d146..8e0dc1f01162 100644 --- a/gcloud/language/sentiment.py +++ b/gcloud/language/sentiment.py @@ -28,7 +28,6 @@ class Sentiment(object): See `Sentiment message`_ and `Sentiment basics`_. - :type polarity: float :param polarity: Polarity of the sentiment in the ``[-1.0, 1.0]`` range. Larger numbers represent more positive sentiments. @@ -45,7 +44,7 @@ def __init__(self, polarity, magnitude): @classmethod def from_api_repr(cls, payload): - """Convert an Sentiment from the JSON API into a :class:`Sentiment`. + """Convert a Sentiment from the JSON API into a :class:`Sentiment`. :param payload: dict :type payload: The value from the backend. diff --git a/gcloud/language/syntax.py b/gcloud/language/syntax.py new file mode 100644 index 000000000000..d1c1178e22bc --- /dev/null +++ b/gcloud/language/syntax.py @@ -0,0 +1,203 @@ +# Copyright 2016 Google Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Google Cloud Natural Language API helpers for tokenized text. + +The ``annotateText`` method, when used with the "syntax" feature, +breaks a document down into tokens and sentences. +""" + + +class PartOfSpeech(object): + """Part of speech of a :class:`Token`.""" + + UNKNOWN = 'UNKNOWN' + """Unknown part of speech.""" + + ADJECTIVE = 'ADJ' + """Part of speech: Adjective.""" + + ADPOSITION = 'ADP' + """Adposition (preposition and postposition).""" + + ADVERB = 'ADV' + """Adverb.""" + + CONJUNCTION = 'CONJ' + """Conjunction.""" + + DETERMINER = 'DET' + """Determiner.""" + + NOUN = 'NOUN' + """Noun (common and proper).""" + + CARDINAL_NUMBER = 'NUM' + """Cardinal number.""" + + PRONOUN = 'PRON' + """Pronoun.""" + + PARTICIPLE = 'PRT' + """Particle or other function word.""" + + PUNCTUATION = 'PUNCT' + """Punctuation.""" + + VERB = 'VERB' + """Verb (all tenses and modes).""" + + OTHER = 'X' + """Other: foreign words, typos, abbreviations.""" + + AFFIX = 'AFFIX' + """Affix.""" + + _REVERSE_MAP = { + 'UNKNOWN': 'UNKNOWN', + 'ADJ': 'ADJECTIVE', + 'ADP': 'ADPOSITION', + 'ADV': 'ADVERB', + 'CONJ': 'CONJUNCTION', + 'DET': 'DETERMINER', + 'NOUN': 'NOUN', + 'NUM': 'CARDINAL_NUMBER', + 'PRON': 'PRONOUN', + 'PRT': 'PARTICIPLE', + 'PUNCT': 'PUNCTUATION', + 'VERB': 'VERB', + 'X': 'OTHER', + 'AFFIX': 'AFFIX', + } + + @classmethod + def reverse(cls, tag): + """Reverses the API's enum name for the one on this class. 
+ + For example:: + + >>> PartOfSpeech.OTHER + 'X' + >>> PartOfSpeech.reverse('X') + 'OTHER' + + :rtype: str + :returns: The attribute name corresponding to the API part of + speech enum. + """ + return cls._REVERSE_MAP[tag] + + +class Token(object): + """A Google Cloud Natural Language API token object. + + .. _Token message: https://cloud.google.com/natural-language/reference\ /rest/v1beta1/documents/annotateText#Token + .. _Lemma: https://en.wikipedia.org/wiki/Lemma_(morphology) + .. _Label enum: https://cloud.google.com/natural-language/reference/\ rest/v1beta1/documents/annotateText#Label + + See `Token message`_. + + :type text_content: str + :param text_content: The text that the token is composed of. + + :type text_begin: int + :param text_begin: The beginning offset of the content in the original + document according to the encoding type specified + in the API request. + + :type part_of_speech: str + :param part_of_speech: The part of speech of the token. See + :class:`PartOfSpeech` for possible values. + + :type edge_index: int + :param edge_index: The head of this token in the dependency tree. This is + the index of the token which has an arc going to this + token. The index is the position of the token in the + array of tokens returned by the API method. If this + token is a root token, then the ``edge_index`` is + its own index. + + :type edge_label: str + :param edge_label: See `Label enum`_. + + :type lemma: str + :param lemma: The `Lemma`_ of the token. + """ + + def __init__(self, text_content, text_begin, part_of_speech, + edge_index, edge_label, lemma): + self.text_content = text_content + self.text_begin = text_begin + self.part_of_speech = part_of_speech + self.edge_index = edge_index + self.edge_label = edge_label + self.lemma = lemma + + @classmethod + def from_api_repr(cls, payload): + """Convert a token from the JSON API into a :class:`Token`. + + :type payload: dict + :param payload: The value from the backend. 
+ + :rtype: :class:`Token` + :returns: The token parsed from the API representation. + """ + text_span = payload['text'] + text_content = text_span['content'] + text_begin = text_span['beginOffset'] + part_of_speech = payload['partOfSpeech']['tag'] + edge = payload['dependencyEdge'] + edge_index = edge['headTokenIndex'] + edge_label = edge['label'] + lemma = payload['lemma'] + return cls(text_content, text_begin, part_of_speech, + edge_index, edge_label, lemma) + + +class Sentence(object): + """A Google Cloud Natural Language API sentence object. + + .. _Sentence message: https://cloud.google.com/natural-language/reference\ /rest/v1beta1/documents/annotateText#Sentence + + See `Sentence message`_. + + :type content: str + :param content: The text that the sentence is composed of. + + :type begin: int + :param begin: The beginning offset of the sentence in the original + document according to the encoding type specified + in the API request. + """ + + def __init__(self, content, begin): + self.content = content + self.begin = begin + + @classmethod + def from_api_repr(cls, payload): + """Convert a sentence from the JSON API into a :class:`Sentence`. + + :type payload: dict + :param payload: The value from the backend. + + :rtype: :class:`Sentence` + :returns: The sentence parsed from the API representation. + """ + text_span = payload['text'] + return cls(text_span['content'], text_span['beginOffset']) diff --git a/gcloud/language/test_document.py b/gcloud/language/test_document.py index e34528711a44..a99564fe0c0e 100644 --- a/gcloud/language/test_document.py +++ b/gcloud/language/test_document.py @@ -15,6 +15,86 @@ import unittest +ANNOTATE_NAME = 'Moon' +ANNOTATE_CONTENT = 'A cow jumped over the %s.' 
% (ANNOTATE_NAME,) +ANNOTATE_POLARITY = 1 +ANNOTATE_MAGNITUDE = 0.2 +ANNOTATE_SALIENCE = 0.11793101 +ANNOTATE_WIKI_URL = 'http://en.wikipedia.org/wiki/Natural_satellite' + + +def _make_token_json(name, part_of_speech, head, edge_label): + token_dict = { + 'text': { + 'content': name, + 'beginOffset': -1, + }, + 'partOfSpeech': {'tag': part_of_speech}, + 'dependencyEdge': { + 'headTokenIndex': head, + 'label': edge_label, + }, + 'lemma': name, + } + return token_dict + + +def _get_token_and_sentences(include_syntax): + from gcloud.language.syntax import PartOfSpeech + + if include_syntax: + token_info = [ + ('A', PartOfSpeech.DETERMINER, 1, 'DET'), + ('cow', PartOfSpeech.NOUN, 2, 'NSUBJ'), + ('jumped', PartOfSpeech.VERB, 2, 'ROOT'), + ('over', PartOfSpeech.ADPOSITION, 2, 'PREP'), + ('the', PartOfSpeech.DETERMINER, 5, 'DET'), + (ANNOTATE_NAME, PartOfSpeech.NOUN, 3, 'POBJ'), + ('.', PartOfSpeech.PUNCTUATION, 2, 'P'), + ] + sentences = [ + { + 'text': { + 'content': ANNOTATE_CONTENT, + 'beginOffset': -1, + }, + }, + ] + else: + token_info = [] + sentences = [] + + return token_info, sentences + + +def _get_entities(include_entities): + from gcloud.language.entity import EntityType + + if include_entities: + entities = [ + { + 'name': ANNOTATE_NAME, + 'type': EntityType.LOCATION, + 'metadata': { + 'wikipedia_url': ANNOTATE_WIKI_URL, + }, + 'salience': ANNOTATE_SALIENCE, + 'mentions': [ + { + 'text': { + 'content': ANNOTATE_NAME, + 'beginOffset': -1 + } + } + ] + }, + ] + else: + entities = [] + + return entities + + class TestDocument(unittest.TestCase): def _getTargetClass(self): @@ -95,8 +175,18 @@ def test__to_dict_with_no_content(self): 'type': klass.PLAIN_TEXT, }) - def test_analyze_entities(self): + def _verify_entity(self, entity, name, entity_type, wiki_url, salience): from gcloud.language.entity import Entity + + self.assertIsInstance(entity, Entity) + self.assertEqual(entity.name, name) + self.assertEqual(entity.entity_type, entity_type) + 
self.assertEqual(entity.wikipedia_url, wiki_url) + self.assertEqual(entity.metadata, {}) + self.assertEqual(entity.salience, salience) + self.assertEqual(entity.mentions, [name]) + + def test_analyze_entities(self): from gcloud.language.entity import EntityType name1 = 'R-O-C-K' @@ -136,7 +226,7 @@ def test_analyze_entities(self): ], }, ], - 'language': 'en', + 'language': 'en-US', } connection = _Connection(response) client = _Client(connection=connection) @@ -145,21 +235,11 @@ def test_analyze_entities(self): entities = document.analyze_entities() self.assertEqual(len(entities), 2) entity1 = entities[0] - self.assertIsInstance(entity1, Entity) - self.assertEqual(entity1.name, name1) - self.assertEqual(entity1.entity_type, EntityType.OTHER) - self.assertEqual(entity1.wikipedia_url, None) - self.assertEqual(entity1.metadata, {}) - self.assertEqual(entity1.salience, salience1) - self.assertEqual(entity1.mentions, [name1]) + self._verify_entity(entity1, name1, EntityType.OTHER, + None, salience1) entity2 = entities[1] - self.assertIsInstance(entity2, Entity) - self.assertEqual(entity2.name, name2) - self.assertEqual(entity2.entity_type, EntityType.LOCATION) - self.assertEqual(entity2.wikipedia_url, wiki2) - self.assertEqual(entity2.metadata, {}) - self.assertEqual(entity2.salience, salience2) - self.assertEqual(entity2.mentions, [name2]) + self._verify_entity(entity2, name2, EntityType.LOCATION, + wiki2, salience2) # Verify the request. 
self.assertEqual(len(connection._requested), 1) @@ -167,9 +247,14 @@ def test_analyze_entities(self): self.assertEqual(req['path'], 'analyzeEntities') self.assertEqual(req['method'], 'POST') - def test_analyze_sentiment(self): + def _verify_sentiment(self, sentiment, polarity, magnitude): from gcloud.language.sentiment import Sentiment + self.assertIsInstance(sentiment, Sentiment) + self.assertEqual(sentiment.polarity, polarity) + self.assertEqual(sentiment.magnitude, magnitude) + + def test_analyze_sentiment(self): content = 'All the pretty horses.' polarity = 1 magnitude = 0.6 @@ -178,16 +263,14 @@ def test_analyze_sentiment(self): 'polarity': polarity, 'magnitude': magnitude, }, - 'language': 'en', + 'language': 'en-US', } connection = _Connection(response) client = _Client(connection=connection) document = self._makeOne(client, content) sentiment = document.analyze_sentiment() - self.assertIsInstance(sentiment, Sentiment) - self.assertEqual(sentiment.polarity, polarity) - self.assertEqual(sentiment.magnitude, magnitude) + self._verify_sentiment(sentiment, polarity, magnitude) # Verify the request. 
self.assertEqual(len(connection._requested), 1) @@ -195,6 +278,102 @@ def test_analyze_sentiment(self): self.assertEqual(req['path'], 'analyzeSentiment') self.assertEqual(req['method'], 'POST') + def _verify_sentences(self, include_syntax, annotations): + from gcloud.language.syntax import Sentence + + if include_syntax: + self.assertEqual(len(annotations.sentences), 1) + sentence = annotations.sentences[0] + self.assertIsInstance(sentence, Sentence) + self.assertEqual(sentence.content, ANNOTATE_CONTENT) + self.assertEqual(sentence.begin, -1) + else: + self.assertEqual(annotations.sentences, []) + + def _verify_tokens(self, annotations, token_info): + from gcloud.language.syntax import Token + + self.assertEqual(len(annotations.tokens), len(token_info)) + for token, info in zip(annotations.tokens, token_info): + self.assertIsInstance(token, Token) + self.assertEqual(token.text_content, info[0]) + self.assertEqual(token.text_begin, -1) + self.assertEqual(token.part_of_speech, info[1]) + self.assertEqual(token.edge_index, info[2]) + self.assertEqual(token.edge_label, info[3]) + self.assertEqual(token.lemma, info[0]) + + def _annotate_text_helper(self, include_sentiment, + include_entities, include_syntax): + from gcloud.language.document import Annotations + from gcloud.language.entity import EntityType + + token_info, sentences = _get_token_and_sentences(include_syntax) + entities = _get_entities(include_entities) + tokens = [_make_token_json(*info) for info in token_info] + response = { + 'sentences': sentences, + 'tokens': tokens, + 'entities': entities, + 'language': 'en-US', + } + if include_sentiment: + response['documentSentiment'] = { + 'polarity': ANNOTATE_POLARITY, + 'magnitude': ANNOTATE_MAGNITUDE, + } + + connection = _Connection(response) + client = _Client(connection=connection) + document = self._makeOne(client, ANNOTATE_CONTENT) + + annotations = document.annotate_text( + include_syntax=include_syntax, include_entities=include_entities, + 
include_sentiment=include_sentiment) + self.assertIsInstance(annotations, Annotations) + # Sentences + self._verify_sentences(include_syntax, annotations) + # Token + self._verify_tokens(annotations, token_info) + # Sentiment + if include_sentiment: + self._verify_sentiment(annotations.sentiment, + ANNOTATE_POLARITY, ANNOTATE_MAGNITUDE) + else: + self.assertIsNone(annotations.sentiment) + # Entity + if include_entities: + self.assertEqual(len(annotations.entities), 1) + entity = annotations.entities[0] + self._verify_entity(entity, ANNOTATE_NAME, EntityType.LOCATION, + ANNOTATE_WIKI_URL, ANNOTATE_SALIENCE) + else: + self.assertEqual(annotations.entities, []) + + # Verify the request. + self.assertEqual(len(connection._requested), 1) + req = connection._requested[0] + self.assertEqual(req['path'], 'annotateText') + self.assertEqual(req['method'], 'POST') + features = req['data']['features'] + self.assertEqual(features.get('extractDocumentSentiment', False), + include_sentiment) + self.assertEqual(features.get('extractEntities', False), + include_entities) + self.assertEqual(features.get('extractSyntax', False), include_syntax) + + def test_annotate_text(self): + self._annotate_text_helper(True, True, True) + + def test_annotate_text_sentiment_only(self): + self._annotate_text_helper(True, False, False) + + def test_annotate_text_entities_only(self): + self._annotate_text_helper(False, True, False) + + def test_annotate_text_syntax_only(self): + self._annotate_text_helper(False, False, True) + class _Connection(object): diff --git a/gcloud/language/test_syntax.py b/gcloud/language/test_syntax.py new file mode 100644 index 000000000000..5524b2166a2a --- /dev/null +++ b/gcloud/language/test_syntax.py @@ -0,0 +1,124 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + + +class TestPartOfSpeech(unittest.TestCase): + + def _getTargetClass(self): + from gcloud.language.syntax import PartOfSpeech + return PartOfSpeech + + def test_reverse(self): + klass = self._getTargetClass() + for attr in dir(klass): + if attr.startswith('_'): + continue + if attr.islower(): + continue + value = getattr(klass, attr) + result = klass.reverse(value) + self.assertEqual(result, attr) + + +class TestToken(unittest.TestCase): + + def _getTargetClass(self): + from gcloud.language.syntax import Token + return Token + + def _makeOne(self, *args, **kw): + return self._getTargetClass()(*args, **kw) + + def test_constructor(self): + from gcloud.language.syntax import PartOfSpeech + + text_content = 'All' + text_begin = -1 + part_of_speech = PartOfSpeech.DETERMINER + edge_index = 3 + edge_label = 'PREDET' + lemma = text_content + token = self._makeOne(text_content, text_begin, part_of_speech, + edge_index, edge_label, lemma) + self.assertEqual(token.text_content, text_content) + self.assertEqual(token.text_begin, text_begin) + self.assertEqual(token.part_of_speech, part_of_speech) + self.assertEqual(token.edge_index, edge_index) + self.assertEqual(token.edge_label, edge_label) + self.assertEqual(token.lemma, lemma) + + def test_from_api_repr(self): + from gcloud.language.syntax import PartOfSpeech + + klass = self._getTargetClass() + text_content = 'pretty' + text_begin = -1 + part_of_speech = PartOfSpeech.ADJECTIVE + edge_index = 3 + edge_label = 'AMOD' + lemma = text_content + payload = { + 'text': { + 'content': text_content, + 
'beginOffset': text_begin, + }, + 'partOfSpeech': { + 'tag': part_of_speech, + }, + 'dependencyEdge': { + 'headTokenIndex': edge_index, + 'label': edge_label, + }, + 'lemma': lemma, + } + token = klass.from_api_repr(payload) + self.assertEqual(token.text_content, text_content) + self.assertEqual(token.text_begin, text_begin) + self.assertEqual(token.part_of_speech, part_of_speech) + self.assertEqual(token.edge_index, edge_index) + self.assertEqual(token.edge_label, edge_label) + self.assertEqual(token.lemma, lemma) + + +class TestSentence(unittest.TestCase): + + def _getTargetClass(self): + from gcloud.language.syntax import Sentence + return Sentence + + def _makeOne(self, *args, **kw): + return self._getTargetClass()(*args, **kw) + + def test_constructor(self): + content = "All the king's horses." + begin = 11 + sentence = self._makeOne(content, begin) + self.assertEqual(sentence.content, content) + self.assertEqual(sentence.begin, begin) + + def test_from_api_repr(self): + klass = self._getTargetClass() + content = 'All the pretty horses.' + begin = -1 + payload = { + 'text': { + 'content': content, + 'beginOffset': begin, + }, + } + sentence = klass.from_api_repr(payload) + self.assertEqual(sentence.content, content) + self.assertEqual(sentence.begin, begin)