From 3be037d80e5cddacbcfa9681aeea78057e4ac67e Mon Sep 17 00:00:00 2001 From: Abigail Hartman Date: Fri, 5 Feb 2021 10:34:58 -0800 Subject: [PATCH] [text analytics] Exposed the length property in classes impacted by `string_index_type` (#16538) --- .../azure-ai-textanalytics/CHANGELOG.md | 4 +- .../azure/ai/textanalytics/_models.py | 112 ++++++++++++++---- .../azure-ai-textanalytics/tests/test_repr.py | 18 ++- 3 files changed, 101 insertions(+), 33 deletions(-) diff --git a/sdk/textanalytics/azure-ai-textanalytics/CHANGELOG.md b/sdk/textanalytics/azure-ai-textanalytics/CHANGELOG.md index cdd936629bc2..0a988192e7a0 100644 --- a/sdk/textanalytics/azure-ai-textanalytics/CHANGELOG.md +++ b/sdk/textanalytics/azure-ai-textanalytics/CHANGELOG.md @@ -13,7 +13,9 @@ each action result. **New Features** - No longer need to specify `api_version=TextAnalyticsApiVersion.V3_1_PREVIEW_3` when calling `begin_analyze` and `begin_analyze_healthcare`. `begin_analyze_healthcare` is still in gated preview though. - +- Added a new parameter `string_index_type` to the service client methods `begin_analyze_healthcare`, `analyze_sentiment`, `recognize_entities`, `recognize_pii_entities`, and `recognize_linked_entities`. +- Added property `length` to `CategorizedEntity`, `SentenceSentiment`, `LinkedEntityMatch`, `AspectSentiment`, `OpinionSentiment`, `PiiEntity` and +`HealthcareEntity`. ## 5.1.0b4 (2021-01-12) diff --git a/sdk/textanalytics/azure-ai-textanalytics/azure/ai/textanalytics/_models.py b/sdk/textanalytics/azure-ai-textanalytics/azure/ai/textanalytics/_models.py index 16cb29e67cdc..8a7bcb1ed5af 100644 --- a/sdk/textanalytics/azure-ai-textanalytics/azure/ai/textanalytics/_models.py +++ b/sdk/textanalytics/azure-ai-textanalytics/azure/ai/textanalytics/_models.py @@ -289,8 +289,13 @@ class CategorizedEntity(DictMixin): :vartype category: str :ivar subcategory: Entity subcategory, such as Age/Year/TimeRange etc :vartype subcategory: str + :ivar int length: The entity text length. This value depends on the value of the + `string_index_type` parameter set in the original request, which is UnicodeCodePoints + by default. Only returned for API versions v3.1-preview and up. :ivar int offset: The entity text offset from the start of the document. - Returned in unicode code points. Only returned for API versions v3.1-preview and up. + The value depends on the value of the `string_index_type` parameter + set in the original request, which is UnicodeCodePoints by default. Only returned for + API versions v3.1-preview and up. :ivar confidence_score: Confidence score between 0 and 1 of the extracted entity. :vartype confidence_score: float @@ -302,30 +307,35 @@ def __init__(self, **kwargs): self.text = kwargs.get('text', None) self.category = kwargs.get('category', None) self.subcategory = kwargs.get('subcategory', None) + self.length = kwargs.get('length', None) self.offset = kwargs.get('offset', None) self.confidence_score = kwargs.get('confidence_score', None) @classmethod def _from_generated(cls, entity): offset = entity.offset + length = entity.length if isinstance(entity, _v3_0_models.Entity): # we do not return offset for v3.0 since # the correct encoding was not introduced for v3.0 offset = None + length = None return cls( text=entity.text, category=entity.category, subcategory=entity.subcategory, + length=length, offset=offset, confidence_score=entity.confidence_score, ) def __repr__(self): return "CategorizedEntity(text={}, category={}, subcategory={}, "\ - "offset={}, confidence_score={})".format( + "length={}, offset={}, confidence_score={})".format( self.text, self.category, self.subcategory, + self.length, self.offset, self.confidence_score )[:1024] @@ -340,8 +350,12 @@ class PiiEntity(DictMixin): Identification/Social Security Number/Phone Number, etc. :ivar str subcategory: Entity subcategory, such as Credit Card/EU Phone number/ABA Routing Numbers, etc. + :ivar int length: The PII entity text length. This value depends on the value + of the `string_index_type` parameter specified in the original request, which + is UnicodeCodePoints by default. :ivar int offset: The PII entity text offset from the start of the document. - Returned in unicode code points. + This value depends on the value of the `string_index_type` parameter specified + in the original request, which is UnicodeCodePoints by default. :ivar float confidence_score: Confidence score between 0 and 1 of the extracted entity. """ @@ -350,6 +364,7 @@ def __init__(self, **kwargs): self.text = kwargs.get('text', None) self.category = kwargs.get('category', None) self.subcategory = kwargs.get('subcategory', None) + self.length = kwargs.get('length', None) self.offset = kwargs.get('offset', None) self.confidence_score = kwargs.get('confidence_score', None) @@ -359,17 +374,19 @@ def _from_generated(cls, entity): text=entity.text, category=entity.category, subcategory=entity.subcategory, + length=entity.length, offset=entity.offset, confidence_score=entity.confidence_score, ) def __repr__(self): return ( - "PiiEntity(text={}, category={}, subcategory={}, offset={}, "\ - "confidence_score={})".format( + "PiiEntity(text={}, category={}, subcategory={}, length={}, "\ + "offset={}, confidence_score={})".format( self.text, self.category, self.subcategory, + self.length, self.offset, self.confidence_score )[:1024] @@ -379,20 +396,26 @@ def __repr__(self): class HealthcareEntity(DictMixin): """HealthcareEntity contains information about a Healthcare entity found in text. - :ivar str text: Entity text as appears in the request. - :ivar str category: Entity category, such as Dosage or MedicationName, etc. - :ivar str subcategory: Entity subcategory. # TODO: add subcategory examples - :ivar int offset: The Healthcare entity text offset from the start of the document. - :ivar float confidence_score: Confidence score between 0 and 1 of the extracted + :ivar str text: Entity text as appears in the request. + :ivar str category: Entity category, such as Dosage or MedicationName, etc. + :ivar str subcategory: Entity subcategory. # TODO: add subcategory examples + :ivar int length: The entity text length. This value depends on the value + of the `string_index_type` parameter specified in the original request, which is + UnicodeCodePoints by default. + :ivar int offset: The entity text offset from the start of the document. + This value depends on the value of the `string_index_type` parameter specified + in the original request, which is UnicodeCodePoints by default. + :ivar float confidence_score: Confidence score between 0 and 1 of the extracted entity. - :ivar links: A collection of entity references in known data sources. - :vartype links: list[~azure.ai.textanalytics.HealthcareEntityLink] + :ivar links: A collection of entity references in known data sources. + :vartype links: list[~azure.ai.textanalytics.HealthcareEntityLink] """ def __init__(self, **kwargs): self.text = kwargs.get("text", None) self.category = kwargs.get("category", None) self.subcategory = kwargs.get("subcategory", None) + self.length = kwargs.get("length", None) self.offset = kwargs.get("offset", None) self.confidence_score = kwargs.get("confidence_score", None) self.links = kwargs.get("links", []) @@ -403,6 +426,7 @@ def _from_generated(cls, healthcare_entity): text=healthcare_entity.text, category=healthcare_entity.category, subcategory=healthcare_entity.subcategory, + length=healthcare_entity.length, offset=healthcare_entity.offset, confidence_score=healthcare_entity.confidence_score, links=[ @@ -411,11 +435,12 @@ def _from_generated(cls, healthcare_entity): ) def __repr__(self): - return "HealthcareEntity(text={}, category={}, subcategory={}, offset={}, confidence_score={},\ - links={})".format( + return "HealthcareEntity(text={}, category={}, subcategory={}, length={}, offset={}, "\ + "confidence_score={}, links={})".format( self.text, self.category, self.subcategory, + self.length, self.offset, self.confidence_score, repr(self.links) @@ -841,8 +866,13 @@ class LinkedEntityMatch(DictMixin): returned. :vartype confidence_score: float :ivar text: Entity text as appears in the request. + :ivar int length: The linked entity match text length. This value depends on the value of the + `string_index_type` parameter set in the original request, which is UnicodeCodePoints by default. + Only returned for API versions v3.1-preview and up. :ivar int offset: The linked entity match text offset from the start of the document. - Returned in unicode code points. Only returned for API versions v3.1-preview and up. + The value depends on the value of the `string_index_type` parameter + set in the original request, which is UnicodeCodePoints by default. + Only returned for API versions v3.1-preview and up. :vartype text: str .. versionadded:: v3.1-preview The *offset* property. @@ -851,24 +881,28 @@ class LinkedEntityMatch(DictMixin): def __init__(self, **kwargs): self.confidence_score = kwargs.get("confidence_score", None) self.text = kwargs.get("text", None) + self.length = kwargs.get("length", None) self.offset = kwargs.get("offset", None) @classmethod def _from_generated(cls, match): offset = match.offset + length = match.length if isinstance(match, _v3_0_models.Match): # we do not return offset for v3.0 since # the correct encoding was not introduced for v3.0 offset = None + length = None return cls( confidence_score=match.confidence_score, text=match.text, + length=length, offset=offset, ) def __repr__(self): - return "LinkedEntityMatch(confidence_score={}, text={}, offset={})".format( - self.confidence_score, self.text, self.offset + return "LinkedEntityMatch(confidence_score={}, text={}, length={}, offset={})".format( + self.confidence_score, self.text, self.length, self.offset )[:1024] @@ -954,8 +988,13 @@ class SentenceSentiment(DictMixin): and 1 for the sentence for all labels. :vartype confidence_scores: ~azure.ai.textanalytics.SentimentConfidenceScores - :ivar int offset: The sentence offset from the start of the document. Returned - in unicode code points. Only returned for API versions v3.1-preview and up. + :ivar int length: The sentence text length. This value depends on the value of the + `string_index_type` parameter set in the original request, which is UnicodeCodePoints + by default. Only returned for API versions v3.1-preview and up. + :ivar int offset: The sentence text offset from the start of the document. + The value depends on the value of the `string_index_type` parameter + set in the original request, which is UnicodeCodePoints by default. Only returned for + API versions v3.1-preview and up. :ivar mined_opinions: The list of opinions mined from this sentence. For example in the sentence "The food is good, but the service is bad", we would mine the two opinions "food is good" and "service is bad". Only returned @@ -971,16 +1010,19 @@ def __init__(self, **kwargs): self.text = kwargs.get("text", None) self.sentiment = kwargs.get("sentiment", None) self.confidence_scores = kwargs.get("confidence_scores", None) + self.length = kwargs.get("length", None) self.offset = kwargs.get("offset", None) self.mined_opinions = kwargs.get("mined_opinions", None) @classmethod def _from_generated(cls, sentence, results, sentiment): offset = sentence.offset + length = sentence.length if isinstance(sentence, _v3_0_models.SentenceSentiment): # we do not return offset for v3.0 since # the correct encoding was not introduced for v3.0 offset = None + length = None if hasattr(sentence, "aspects"): mined_opinions = ( [MinedOpinion._from_generated(aspect, results, sentiment) for aspect in sentence.aspects] # pylint: disable=protected-access @@ -992,16 +1034,18 @@ def _from_generated(cls, sentence, results, sentiment): text=sentence.text, sentiment=sentence.sentiment, confidence_scores=SentimentConfidenceScores._from_generated(sentence.confidence_scores), # pylint: disable=protected-access + length=length, offset=offset, mined_opinions=mined_opinions ) def __repr__(self): return "SentenceSentiment(text={}, sentiment={}, confidence_scores={}, "\ - "offset={}, mined_opinions={})".format( + "length={}, offset={}, mined_opinions={})".format( self.text, self.sentiment, repr(self.confidence_scores), + self.length, self.offset, repr(self.mined_opinions) )[:1024] @@ -1068,14 +1112,19 @@ class AspectSentiment(DictMixin): for 'neutral' will always be 0 :vartype confidence_scores: ~azure.ai.textanalytics.SentimentConfidenceScores - :ivar int offset: The aspect offset from the start of the document. Returned - in unicode code points. + :ivar int length: The aspect text length. This value depends on the value of the + `string_index_type` parameter set in the original request, which is UnicodeCodePoints + by default. + :ivar int offset: The aspect text offset from the start of the document. + The value depends on the value of the `string_index_type` parameter + set in the original request, which is UnicodeCodePoints by default. """ def __init__(self, **kwargs): self.text = kwargs.get("text", None) self.sentiment = kwargs.get("sentiment", None) self.confidence_scores = kwargs.get("confidence_scores", None) + self.length = kwargs.get("length", None) self.offset = kwargs.get("offset", None) @classmethod @@ -1084,14 +1133,17 @@ def _from_generated(cls, aspect): text=aspect.text, sentiment=aspect.sentiment, confidence_scores=SentimentConfidenceScores._from_generated(aspect.confidence_scores), # pylint: disable=protected-access + length=aspect.length, offset=aspect.offset, ) def __repr__(self): - return "AspectSentiment(text={}, sentiment={}, confidence_scores={}, offset={})".format( + return "AspectSentiment(text={}, sentiment={}, confidence_scores={}, "\ + "length={}, offset={})".format( self.text, self.sentiment, repr(self.confidence_scores), + self.length, self.offset, )[:1024] @@ -1110,8 +1162,12 @@ class OpinionSentiment(DictMixin): for 'neutral' will always be 0 :vartype confidence_scores: ~azure.ai.textanalytics.SentimentConfidenceScores - :ivar int offset: The opinion offset from the start of the document. Returned - in unicode code points. + :ivar int length: The opinion text length. This value depends on the value of the + `string_index_type` parameter set in the original request, which is UnicodeCodePoints + by default. + :ivar int offset: The opinion text offset from the start of the document. + The value depends on the value of the `string_index_type` parameter + set in the original request, which is UnicodeCodePoints by default. :ivar bool is_negated: Whether the opinion is negated. For example, in "The food is not good", the opinion "good" is negated. """ @@ -1120,6 +1176,7 @@ def __init__(self, **kwargs): self.text = kwargs.get("text", None) self.sentiment = kwargs.get("sentiment", None) self.confidence_scores = kwargs.get("confidence_scores", None) + self.length = kwargs.get("length", None) self.offset = kwargs.get("offset", None) self.is_negated = kwargs.get("is_negated", None) @@ -1129,16 +1186,19 @@ def _from_generated(cls, opinion): text=opinion.text, sentiment=opinion.sentiment, confidence_scores=SentimentConfidenceScores._from_generated(opinion.confidence_scores), # pylint: disable=protected-access + length=opinion.length, offset=opinion.offset, is_negated=opinion.is_negated ) def __repr__(self): return ( - "OpinionSentiment(text={}, sentiment={}, confidence_scores={}, offset={}, is_negated={})".format( + "OpinionSentiment(text={}, sentiment={}, confidence_scores={}, length={}, offset={}, "\ + "is_negated={})".format( self.text, self.sentiment, repr(self.confidence_scores), + self.length, self.offset, self.is_negated )[:1024] diff --git a/sdk/textanalytics/azure-ai-textanalytics/tests/test_repr.py b/sdk/textanalytics/azure-ai-textanalytics/tests/test_repr.py index 3c1e580ff644..843753d06702 100644 --- a/sdk/textanalytics/azure-ai-textanalytics/tests/test_repr.py +++ b/sdk/textanalytics/azure-ai-textanalytics/tests/test_repr.py @@ -70,12 +70,13 @@ def categorized_entity(): text="Bill Gates", category="Person", subcategory="Age", + length=10, offset=0, confidence_score=0.899 ) model_repr = ( "CategorizedEntity(text=Bill Gates, category=Person, subcategory=Age, " - "offset=0, confidence_score=0.899)" + "length=10, offset=0, confidence_score=0.899)" ) assert repr(model) == model_repr return model, model_repr @@ -87,10 +88,11 @@ def pii_entity(): text="859-98-0987", category="SSN", subcategory=None, + length=11, offset=0, confidence_score=0.899 ) - model_repr = "PiiEntity(text=859-98-0987, category=SSN, subcategory=None, offset=0, confidence_score=0.899)" + model_repr = "PiiEntity(text=859-98-0987, category=SSN, subcategory=None, length=11, offset=0, confidence_score=0.899)" assert repr(model) == model_repr return model, model_repr @@ -101,8 +103,9 @@ def linked_entity_match(): confidence_score=0.999, text="Bill Gates", offset=0, + length=10 ) - model_repr = "LinkedEntityMatch(confidence_score=0.999, text=Bill Gates, offset=0)" + model_repr = "LinkedEntityMatch(confidence_score=0.999, text=Bill Gates, length=10, offset=0)" assert repr(model) == model_repr return model, model_repr @@ -156,9 +159,10 @@ def aspect_sentiment(aspect_opinion_confidence_score): text="aspect", sentiment="positive", confidence_scores=aspect_opinion_confidence_score[0], + length=6, offset=10, ) - model_repr = "AspectSentiment(text=aspect, sentiment=positive, confidence_scores={}, offset=10)".format( + model_repr = "AspectSentiment(text=aspect, sentiment=positive, confidence_scores={}, length=6, offset=10)".format( aspect_opinion_confidence_score[1] ) assert repr(model) == model_repr @@ -170,10 +174,11 @@ def opinion_sentiment(aspect_opinion_confidence_score): text="opinion", sentiment="positive", confidence_scores=aspect_opinion_confidence_score[0], + length=7, offset=3, is_negated=False ) - model_repr = "OpinionSentiment(text=opinion, sentiment=positive, confidence_scores={}, offset=3, is_negated=False)".format( + model_repr = "OpinionSentiment(text=opinion, sentiment=positive, confidence_scores={}, length=7, offset=3, is_negated=False)".format( aspect_opinion_confidence_score[1] ) assert repr(model) == model_repr @@ -195,12 +200,13 @@ def sentence_sentiment(sentiment_confidence_scores, mined_opinion): text="This is a sentence.", sentiment="neutral", confidence_scores=sentiment_confidence_scores[0], + length=19, offset=0, mined_opinions=[mined_opinion[0]] ) model_repr = ( "SentenceSentiment(text=This is a sentence., sentiment=neutral, confidence_scores={}, "\ - "offset=0, mined_opinions=[{}])".format( + "length=19, offset=0, mined_opinions=[{}])".format( sentiment_confidence_scores[1], mined_opinion[1] ) )