Adding annotate_text() method in language and helper classes needed for it #2211

Merged
merged 6 commits on Aug 29, 2016
7 changes: 7 additions & 0 deletions docs/language-responses.rst
@@ -14,3 +14,10 @@ Sentiment
.. automodule:: gcloud.language.sentiment
:members:
:show-inheritance:

Syntax
~~~~~~

.. automodule:: gcloud.language.syntax
:members:
:show-inheritance:
2 changes: 1 addition & 1 deletion docs/language-usage.rst
@@ -216,7 +216,7 @@ machine learning and need in-depth text features to build upon.
The method returns a named tuple with four entries:

* ``sentences``: A :class:`list` of sentences in the text
* ``tokens``: A :class:`list` of :class:`~gcloud.language.token.Token`
* ``tokens``: A :class:`list` of :class:`~gcloud.language.syntax.Token`
objects (e.g. words, punctuation)
* ``sentiment``: The :class:`~gcloud.language.sentiment.Sentiment` of
the text (as returned by
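As a hedged illustration (assuming a ``document`` built with the library's
document helpers, as elsewhere in this guide), each entry can be read
directly off the returned tuple::

    >>> annotations = document.annotate_text()
    >>> annotations.sentences   # list of Sentence objects
    >>> annotations.tokens      # list of Token objects
    >>> annotations.sentiment   # Sentiment (or None if not requested)
    >>> annotations.entities    # list of Entity objects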
96 changes: 96 additions & 0 deletions gcloud/language/document.py
@@ -17,14 +17,38 @@
A document is used to hold text to be analyzed and annotated.
"""

import collections

from gcloud.language.entity import Entity
from gcloud.language.sentiment import Sentiment
from gcloud.language.syntax import Sentence
from gcloud.language.syntax import Token


DEFAULT_LANGUAGE = 'en-US'
"""Default document language, English."""


Annotations = collections.namedtuple(
'Annotations',
'sentences tokens sentiment entities')
"""Annotations for a document.

:type sentences: list
:param sentences: List of :class:`.Sentence` in a document.

:type tokens: list
:param tokens: List of :class:`.Token` from a document.

:type sentiment: :class:`Sentiment`
:param sentiment: The sentiment of a document.

:type entities: list
:param entities: List of :class:`~.language.entity.Entity`
found in a document.
"""


class Encoding(object):
"""Document text encoding types."""

@@ -163,3 +187,75 @@ def analyze_sentiment(self):
api_response = self.client.connection.api_request(
method='POST', path='analyzeSentiment', data=data)
return Sentiment.from_api_repr(api_response['documentSentiment'])

def annotate_text(self, include_syntax=True, include_entities=True,
include_sentiment=True):
"""Advanced natural language API: document syntax and other features.

Includes the full functionality of :meth:`analyze_entities` and
:meth:`analyze_sentiment`, enabled by the flags
``include_entities`` and ``include_sentiment`` respectively.

In addition, ``include_syntax`` adds a new feature that analyzes
the document for semantic and syntactic information.

.. note::

This API is intended for users who are familiar with machine
learning and need in-depth text features to build upon.

.. _annotateText: https://cloud.google.com/natural-language/\
reference/rest/v1beta1/documents/annotateText

See `annotateText`_.

:type include_syntax: bool
:param include_syntax: (Optional) Flag to enable syntax analysis
of the current document.

:type include_entities: bool
:param include_entities: (Optional) Flag to enable entity extraction
from the current document.

:type include_sentiment: bool
:param include_sentiment: (Optional) Flag to enable sentiment
analysis of the current document.

:rtype: :class:`Annotations`
:returns: A tuple of each of the four values returned from the API:
sentences, tokens, sentiment and entities.
"""
features = {}
if include_syntax:
features['extractSyntax'] = True
if include_entities:
features['extractEntities'] = True
if include_sentiment:
features['extractDocumentSentiment'] = True

data = {
'document': self._to_dict(),
'features': features,
'encodingType': self.encoding,
}
api_response = self.client.connection.api_request(
method='POST', path='annotateText', data=data)

sentences = [Sentence.from_api_repr(sentence)
for sentence in api_response['sentences']]
tokens = [Token.from_api_repr(token)
for token in api_response['tokens']]
sentiment_info = api_response.get('documentSentiment')
if sentiment_info is None:
sentiment = None
else:
sentiment = Sentiment.from_api_repr(sentiment_info)
entities = [Entity.from_api_repr(entity)
for entity in api_response['entities']]
annotations = Annotations(
sentences=sentences,
tokens=tokens,
sentiment=sentiment,
entities=entities,
)
return annotations
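
A minimal, hedged usage sketch of the new method follows. ``language.Client``
and ``document_from_text`` are assumed from the library's existing client
surface and may differ; the flag names and returned fields match this diff.

    from gcloud import language

    # Assumed client/document helpers; adjust to the library's actual surface.
    client = language.Client()
    document = client.document_from_text(
        'The quick brown fox jumps over the lazy dog.')

    # All three flags default to True, so the request enables
    # extractSyntax, extractEntities and extractDocumentSentiment.
    annotations = document.annotate_text()

    for sentence in annotations.sentences:
        print(sentence.content, sentence.begin)
    for token in annotations.tokens:
        print(token.text_content, token.part_of_speech, token.lemma)
    print(annotations.sentiment.polarity, annotations.sentiment.magnitude)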
3 changes: 1 addition & 2 deletions gcloud/language/sentiment.py
@@ -28,7 +28,6 @@ class Sentiment(object):

See `Sentiment message`_ and `Sentiment basics`_.


:type polarity: float
:param polarity: Polarity of the sentiment in the ``[-1.0, 1.0]`` range.
Larger numbers represent more positive sentiments.
@@ -45,7 +44,7 @@ def __init__(self, polarity, magnitude):

@classmethod
def from_api_repr(cls, payload):
"""Convert an Sentiment from the JSON API into a :class:`Sentiment`.
"""Convert a Sentiment from the JSON API into a :class:`Sentiment`.

:type payload: dict
:param payload: The value from the backend.
203 changes: 203 additions & 0 deletions gcloud/language/syntax.py
@@ -0,0 +1,203 @@
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Google Cloud Natural Language API helpers for tokenized text.

The ``annotateText`` method, when used with the "syntax" feature,
breaks a document down into tokens and sentences.
"""


class PartOfSpeech(object):
"""Part of speech of a :class:`Token`."""

UNKNOWN = 'UNKNOWN'
"""Unknown part of speech."""

ADJECTIVE = 'ADJ'
"""Part of speech: Adjective."""

ADPOSITION = 'ADP'
"""Adposition (preposition and postposition)."""

ADVERB = 'ADV'
"""Adverb."""

CONJUNCTION = 'CONJ'
"""Conjunction."""

DETERMINER = 'DET'
"""Determiner."""

NOUN = 'NOUN'
"""Noun (common and proper)."""

CARDINAL_NUMBER = 'NUM'
"""Cardinal number."""

PRONOUN = 'PRON'
"""Pronoun."""

PARTICIPLE = 'PRT'
"""Particle or other function word."""

PUNCTUATION = 'PUNCT'
"""Punctuation."""

VERB = 'VERB'
"""Verb (all tenses and modes)."""

OTHER = 'X'
"""Other: foreign words, typos, abbreviations."""

AFFIX = 'AFFIX'
"""Affix."""

_REVERSE_MAP = {
'UNKNOWN': 'UNKNOWN',
'ADJ': 'ADJECTIVE',
'ADP': 'ADPOSITION',
'ADV': 'ADVERB',
'CONJ': 'CONJUNCTION',
'DET': 'DETERMINER',
'NOUN': 'NOUN',
'NUM': 'CARDINAL_NUMBER',
'PRON': 'PRONOUN',
'PRT': 'PARTICIPLE',
'PUNCT': 'PUNCTUATION',
'VERB': 'VERB',
'X': 'OTHER',
'AFFIX': 'AFFIX',
}

@classmethod
def reverse(cls, tag):
"""Reverses the API's enum name for the one on this class.

For example::

>>> PartOfSpeech.OTHER
'X'
>>> PartOfSpeech.reverse('X')
'OTHER'

:type tag: str
:param tag: The API part-of-speech tag to reverse, e.g. ``'X'``.

:rtype: str
:returns: The attribute name corresponding to the API part of
speech enum.
"""
return cls._REVERSE_MAP[tag]


class Token(object):
"""A Google Cloud Natural Language API token object.

.. _Token message: https://cloud.google.com/natural-language/reference\
/rest/v1beta1/documents/annotateText#Token
.. _Lemma: https://en.wikipedia.org/wiki/Lemma_(morphology)
.. _Label enum: https://cloud.google.com/natural-language/reference/\
rest/v1beta1/documents/annotateText#Label

See `Token message`_.

:type text_content: str
:param text_content: The text that the token is composed of.

:type text_begin: int
:param text_begin: The beginning offset of the content in the original
document according to the encoding type specified
in the API request.

:type part_of_speech: str
:param part_of_speech: The part of speech of the token. See
:class:`PartOfSpeech` for possible values.

:type edge_index: int
:param edge_index: The head of this token in the dependency tree. This is
the index of the token which has an arc going to this
token. The index is the position of the token in the
array of tokens returned by the API method. If this
token is a root token, then the ``edge_index`` is
its own index.

:type edge_label: str
:param edge_label: See `Label enum`_.

:type lemma: str
:param lemma: The `Lemma`_ of the token.
"""

def __init__(self, text_content, text_begin, part_of_speech,
edge_index, edge_label, lemma):
self.text_content = text_content
self.text_begin = text_begin
self.part_of_speech = part_of_speech
self.edge_index = edge_index
self.edge_label = edge_label
self.lemma = lemma

@classmethod
def from_api_repr(cls, payload):
"""Convert a token from the JSON API into a :class:`Sentiment`.

:type payload: dict
:param payload: The value from the backend.

:rtype: :class:`Token`
:returns: The token parsed from the API representation.
"""
text_span = payload['text']
text_content = text_span['content']
text_begin = text_span['beginOffset']
part_of_speech = payload['partOfSpeech']['tag']
edge = payload['dependencyEdge']
edge_index = edge['headTokenIndex']
edge_label = edge['label']
lemma = payload['lemma']
return cls(text_content, text_begin, part_of_speech,
edge_index, edge_label, lemma)


class Sentence(object):
"""A Google Cloud Natural Language API sentence object.

.. _Sentence message: https://cloud.google.com/natural-language/reference\
/rest/v1beta1/documents/annotateText#Sentence

See `Sentence message`_.

:type content: str
:param content: The text that the sentence is composed of.

:type begin: int
:param begin: The beginning offset of the sentence in the original
document according to the encoding type specified
in the API request.
"""

def __init__(self, content, begin):
self.content = content
self.begin = begin

@classmethod
def from_api_repr(cls, payload):
"""Convert a sentence from the JSON API into a :class:`Sentiment`.

:type payload: dict
:param payload: The value from the backend.

:rtype: :class:`Sentence`
:returns: The sentence parsed from the API representation.
"""
text_span = payload['text']
return cls(text_span['content'], text_span['beginOffset'])
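
The parsing helpers above need no network call, so they can be exercised
directly with hand-built payloads that mirror the ``annotateText`` response
shape (the field values below are illustrative only):

    from gcloud.language.syntax import PartOfSpeech, Sentence, Token

    token = Token.from_api_repr({
        'text': {'content': 'fox', 'beginOffset': 16},
        'partOfSpeech': {'tag': 'NOUN'},
        'dependencyEdge': {'headTokenIndex': 4, 'label': 'NSUBJ'},
        'lemma': 'fox',
    })
    print(token.text_content)                          # fox
    print(PartOfSpeech.reverse(token.part_of_speech))  # NOUN

    sentence = Sentence.from_api_repr(
        {'text': {'content': 'The quick brown fox jumps.', 'beginOffset': 0}})
    print(sentence.content, sentence.begin)             # sentence text, offset 0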