From 2899deafeffb4a15098a1daa695124e7b6e4eb90 Mon Sep 17 00:00:00 2001
From: Kartik Choudhary
Date: Sun, 15 Oct 2023 18:53:17 -0400
Subject: [PATCH 1/6] Added info about required packages

---
 ...d-question-answering-model-debugging.ipynb | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb b/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb
index 3b663cfc61..4af484b9f1 100644
--- a/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb
+++ b/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb
@@ -42,6 +42,31 @@
     "The following section examines the code necessary to create datasets and a model. It then generates insights using the `responsibleai` API that can be visually analyzed."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "6174bcad",
+   "metadata": {},
+   "source": [
+    "### Prepare\n",
+    "\n",
+    "To run this notebook, we need to install the following packages:\n",
+    "\n",
+    "```requirements.txt\n",
+    "raiutils\n",
+    "raiwidgets\n",
+    "datasets\n",
+    "transformers\n",
+    "responsibleai_text\n",
+    "torch\n",
+    "```\n",
+    "\n",
+    "Run the following command to download the spaCy pipeline:\n",
+    "\n",
+    "```bash\n",
+    "python -m spacy download en_core_web_sm\n",
+    "```"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "40739025",

From 80b0c3454bb8b34998da9e0bdb71af3c079bf829 Mon Sep 17 00:00:00 2001
From: Kartik Choudhary
Date: Sun, 15 Oct 2023 18:53:55 -0400
Subject: [PATCH 2/6] Update responsibleaidashboard-question-answering-model-debugging.ipynb

---
 ...ponsibleaidashboard-question-answering-model-debugging.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb b/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb
index 4af484b9f1..d804c9bed8 100644
--- a/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb
+++ b/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb
@@ -51,7 +51,7 @@
     "\n",
     "To run this notebook, we need to install the following packages:\n",
     "\n",
-    "```requirements.txt\n",
+    "```\n",
     "raiutils\n",
     "raiwidgets\n",
     "datasets\n",
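The two patches above pin down the notebook's requirements. Before applying the rest of the series, the environment can be sanity-checked with a sketch along these lines — a minimal, illustrative check that is not part of the patches; it assumes each importable module name matches the pip package name listed above:

```python
# Minimal environment check for the packages listed in PATCH 1/6.
# Assumption: importable module names match the pip package names above.
import importlib.util

required = ['raiutils', 'raiwidgets', 'datasets', 'transformers',
            'responsibleai_text', 'torch', 'spacy']
missing = [name for name in required
           if importlib.util.find_spec(name) is None]

if missing:
    print('Missing packages:', ', '.join(missing))
else:
    import spacy
    # spacy.load raises OSError if en_core_web_sm has not been downloaded.
    spacy.load('en_core_web_sm')
    print('Environment looks ready.')
```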
datasets.load_dataset(\"squad\", split=\"train\")\n", "dataset" ] }, @@ -155,17 +146,9 @@ "metadata": {}, "outputs": [], "source": [ - "data = pd.DataFrame({'context': context, 'questions': questions, 'answers': answers})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6f87e9c", - "metadata": {}, - "outputs": [], - "source": [ - "data" + "data = pd.DataFrame({'context': context, 'questions': questions, 'answers': answers})\n", + "data = data.sample(frac=1.0, random_state=42).reset_index(drop=True)\n", + "data.head()" ] }, { @@ -184,18 +167,42 @@ "outputs": [], "source": [ "# load the question-answering model\n", - "pmodel = pipeline('question-answering')" + "pipeline_model = pipeline('question-answering')\n", + "test_size = 5\n", + "\n", + "train_data = data\n", + "test_data = data[:test_size]" + ] + }, + { + "cell_type": "markdown", + "id": "7cf8327b", + "metadata": {}, + "source": [ + "See an example of the model's predictions" ] }, { "cell_type": "code", "execution_count": null, - "id": "04801887", + "id": "ce087699", "metadata": {}, "outputs": [], "source": [ - "train_data = data\n", - "test_data = data[:5]" + "def get_answer(dataset, idx):\n", + " model_output = pipeline_model(question=dataset['questions'][idx], \n", + " context=dataset['context'][idx])\n", + " pred = model_output['answer']\n", + " return pred\n", + "\n", + "def check_answer(dataset, idx):\n", + " pred = get_answer(dataset, idx)\n", + " print('Question : ', dataset['questions'][idx])\n", + " print('Answer : ', dataset['answers'][idx])\n", + " print('Predicted : ', pred)\n", + " print('Correct : ', pred == dataset['answers'][idx])\n", + "\n", + "check_answer(test_data, 0)\n" ] }, { From 6c19f0fd3187d18631324a699d6ef9ecb24e9916 Mon Sep 17 00:00:00 2001 From: Kartik Choudhary Date: Sun, 15 Oct 2023 19:01:09 -0400 Subject: [PATCH 4/6] Update responsibleaidashboard-question-answering-model-debugging.ipynb --- ...onsibleaidashboard-question-answering-model-debugging.ipynb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb b/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb index 2d8de2ffcd..dbcb6b8dc9 100644 --- a/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb +++ b/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb @@ -241,8 +241,7 @@ "metadata": {}, "outputs": [], "source": [ - "rai_insights = RAITextInsights(pmodel, test_data,\n", - " \"answers\",\n", + "rai_insights = RAITextInsights(pipeline_model, test_data, \"answers\",\n", " task_type=ModelTask.QUESTION_ANSWERING)" ] }, From b2e954014ca5f412a89087aa5bc5585bd3bd9c0b Mon Sep 17 00:00:00 2001 From: Kartik Choudhary Date: Tue, 30 Jan 2024 10:49:08 -0500 Subject: [PATCH 5/6] add helper method for genai metrics Signed-off-by: Kartik Choudhary --- .../utils/genai_metrics/metrics.py | 14 +++ .../tests/test_genai_metrics.py | 94 ++++++++----------- 2 files changed, 51 insertions(+), 57 deletions(-) diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py index e0e4934d76..784c38a7c1 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py @@ -3,6 +3,7 @@ """Compute AI-assisted metrics 
for generative text models.""" +import numpy as np import logging from pathlib import Path @@ -30,3 +31,16 @@ def get_genai_metric(metric_name, **metric_kwargs): metric = evaluate.load( str(curr_file_dir.joinpath(f'scripts/{metric_name}.py'))) return metric.compute(**metric_kwargs) + + +def get_genai_metric_mean(metric_name, **metric_kwargs): + """Get the mean of the metric from the genai library. + + :param metric_name: The name of the metric. + :type metric_name: str + :param metric_kwargs: The keyword arguments to pass to the metric. + :type metric_kwargs: dict + :return: The mean of the metric. + :rtype: float + """ + return np.mean(get_genai_metric(metric_name, **metric_kwargs)['scores']) diff --git a/responsibleai_text/tests/test_genai_metrics.py b/responsibleai_text/tests/test_genai_metrics.py index 5285d6c623..8cf530e5ad 100644 --- a/responsibleai_text/tests/test_genai_metrics.py +++ b/responsibleai_text/tests/test_genai_metrics.py @@ -1,7 +1,8 @@ # Copyright (c) Microsoft Corporation # Licensed under the MIT License. -from responsibleai_text.utils.genai_metrics.metrics import get_genai_metric +from responsibleai_text.utils.genai_metrics.metrics import ( + get_genai_metric, get_genai_metric_mean) PREDICTIONS = ['This is a prediction'] REFERENCES = ['This is a reference'] @@ -15,69 +16,48 @@ def predict(self, inp): class TestGenAIMetrics: - def test_coherence(self): - metric = get_genai_metric('coherence', - predictions=PREDICTIONS, - references=REFERENCES, + def assert_metrics(self, metric_name, + expected, input_len, + **metric_kwargs): + metric = get_genai_metric(metric_name, **metric_kwargs, wrapper_model=DummyModelWrapper()) - assert metric['scores'] == [1] + assert metric['scores'] == [expected] - metric = get_genai_metric('coherence', - predictions=PREDICTIONS * 5, - references=REFERENCES * 5, - wrapper_model=DummyModelWrapper()) - assert metric['scores'] == [1] * 5 + metric_mean = get_genai_metric_mean(metric_name, **metric_kwargs, + wrapper_model=DummyModelWrapper()) + assert metric_mean == expected - def test_equivalence(self): - metric = get_genai_metric('equivalence', - predictions=PREDICTIONS, - references=REFERENCES, - answers=ANSWERS, - wrapper_model=DummyModelWrapper()) - assert metric['scores'] == [1] + kwargs_multi = {k: v * input_len for k, v in metric_kwargs.items()} + metric_multi = get_genai_metric(metric_name, **kwargs_multi, + wrapper_model=DummyModelWrapper()) + assert metric_multi['scores'] == [expected] * input_len - metric = get_genai_metric('equivalence', - predictions=PREDICTIONS * 5, - references=REFERENCES * 5, - answers=ANSWERS * 5, - wrapper_model=DummyModelWrapper()) - assert metric['scores'] == [1] * 5 + metric_mean_multi = get_genai_metric_mean( + metric_name, **kwargs_multi, wrapper_model=DummyModelWrapper()) + assert metric_mean_multi == expected - def test_fluency(self): - metric = get_genai_metric('fluency', - predictions=PREDICTIONS, - references=REFERENCES, - wrapper_model=DummyModelWrapper()) - assert metric['scores'] == [1] + def test_coherence(self): + self.assert_metrics('coherence', 1, 5, + predictions=PREDICTIONS, + references=REFERENCES) - metric = get_genai_metric('fluency', - predictions=PREDICTIONS * 5, - references=REFERENCES * 5, - wrapper_model=DummyModelWrapper()) - assert metric['scores'] == [1] * 5 + def test_equivalence(self): + self.assert_metrics('equivalence', 1, 5, + predictions=PREDICTIONS, + references=REFERENCES, + answers=ANSWERS) - def test_groundedness(self): - metric = get_genai_metric('groundedness', - 
diff --git a/responsibleai_text/tests/test_genai_metrics.py b/responsibleai_text/tests/test_genai_metrics.py
index 5285d6c623..8cf530e5ad 100644
--- a/responsibleai_text/tests/test_genai_metrics.py
+++ b/responsibleai_text/tests/test_genai_metrics.py
@@ -1,7 +1,8 @@
 # Copyright (c) Microsoft Corporation
 # Licensed under the MIT License.
 
-from responsibleai_text.utils.genai_metrics.metrics import get_genai_metric
+from responsibleai_text.utils.genai_metrics.metrics import (
+    get_genai_metric, get_genai_metric_mean)
 
 PREDICTIONS = ['This is a prediction']
 REFERENCES = ['This is a reference']
@@ -15,69 +16,48 @@ def predict(self, inp):
         return ['1'] * len(inp)
 
 
 class TestGenAIMetrics:
-    def test_coherence(self):
-        metric = get_genai_metric('coherence',
-                                  predictions=PREDICTIONS,
-                                  references=REFERENCES,
+    def assert_metrics(self, metric_name,
+                       expected, input_len,
+                       **metric_kwargs):
+        metric = get_genai_metric(metric_name, **metric_kwargs,
                                   wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1]
+        assert metric['scores'] == [expected]
 
-        metric = get_genai_metric('coherence',
-                                  predictions=PREDICTIONS * 5,
-                                  references=REFERENCES * 5,
-                                  wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1] * 5
+        metric_mean = get_genai_metric_mean(metric_name, **metric_kwargs,
+                                            wrapper_model=DummyModelWrapper())
+        assert metric_mean == expected
 
-    def test_equivalence(self):
-        metric = get_genai_metric('equivalence',
-                                  predictions=PREDICTIONS,
-                                  references=REFERENCES,
-                                  answers=ANSWERS,
-                                  wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1]
+        kwargs_multi = {k: v * input_len for k, v in metric_kwargs.items()}
+        metric_multi = get_genai_metric(metric_name, **kwargs_multi,
+                                        wrapper_model=DummyModelWrapper())
+        assert metric_multi['scores'] == [expected] * input_len
 
-        metric = get_genai_metric('equivalence',
-                                  predictions=PREDICTIONS * 5,
-                                  references=REFERENCES * 5,
-                                  answers=ANSWERS * 5,
-                                  wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1] * 5
+        metric_mean_multi = get_genai_metric_mean(
+            metric_name, **kwargs_multi, wrapper_model=DummyModelWrapper())
+        assert metric_mean_multi == expected
 
-    def test_fluency(self):
-        metric = get_genai_metric('fluency',
-                                  predictions=PREDICTIONS,
-                                  references=REFERENCES,
-                                  wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1]
+    def test_coherence(self):
+        self.assert_metrics('coherence', 1, 5,
+                            predictions=PREDICTIONS,
+                            references=REFERENCES)
 
-        metric = get_genai_metric('fluency',
-                                  predictions=PREDICTIONS * 5,
-                                  references=REFERENCES * 5,
-                                  wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1] * 5
+    def test_equivalence(self):
+        self.assert_metrics('equivalence', 1, 5,
+                            predictions=PREDICTIONS,
+                            references=REFERENCES,
+                            answers=ANSWERS)
 
-    def test_groundedness(self):
-        metric = get_genai_metric('groundedness',
-                                  predictions=PREDICTIONS,
-                                  references=REFERENCES,
-                                  wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1]
+    def test_fluency(self):
+        self.assert_metrics('fluency', 1, 5,
+                            predictions=PREDICTIONS,
+                            references=REFERENCES)
 
-        metric = get_genai_metric('groundedness',
-                                  predictions=PREDICTIONS * 5,
-                                  references=REFERENCES * 5,
-                                  wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1] * 5
+    def test_groundedness(self):
+        self.assert_metrics('groundedness', 1, 5,
+                            predictions=PREDICTIONS,
+                            references=REFERENCES)
 
     def test_relevance(self):
-        metric = get_genai_metric('relevance',
-                                  predictions=PREDICTIONS,
-                                  references=REFERENCES,
-                                  wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1]
-
-        metric = get_genai_metric('relevance',
-                                  predictions=PREDICTIONS * 5,
-                                  references=REFERENCES * 5,
-                                  wrapper_model=DummyModelWrapper())
-        assert metric['scores'] == [1] * 5
+        self.assert_metrics('relevance', 1, 5,
+                            predictions=PREDICTIONS,
+                            references=REFERENCES)

From eeefe71cff27f18dec7e05bf05ce5d66a959b610 Mon Sep 17 00:00:00 2001
From: Kartik Choudhary
Date: Tue, 30 Jan 2024 10:50:20 -0500
Subject: [PATCH 6/6] Fix import order in metrics.py

Signed-off-by: Kartik Choudhary
---
 .../responsibleai_text/utils/genai_metrics/metrics.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py
index 784c38a7c1..214c435c02 100644
--- a/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py
+++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py
@@ -3,10 +3,11 @@
 """Compute AI-assisted metrics
 for generative text models."""
 
-import numpy as np
 import logging
 from pathlib import Path
 
+import numpy as np
+
 module_logger = logging.getLogger(__name__)
 module_logger.setLevel(logging.INFO)
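As a closing aside on the test refactor in PATCH 5/6: the `assert_metrics` helper could also be expressed as a pytest parametrization. A hypothetical variant for comparison — `PREDICTIONS`, `REFERENCES`, `ANSWERS`, `DummyModelWrapper`, and `get_genai_metric_mean` are the names defined above, and this sketch is not part of the series:

```python
# Hypothetical pytest-parametrized variant of the five metric tests.
import pytest

CASES = [
    ('coherence', {}),
    ('equivalence', {'answers': ANSWERS}),
    ('fluency', {}),
    ('groundedness', {}),
    ('relevance', {}),
]


@pytest.mark.parametrize('metric_name, extra', CASES)
def test_metric_mean(metric_name, extra):
    mean = get_genai_metric_mean(metric_name,
                                 predictions=PREDICTIONS,
                                 references=REFERENCES,
                                 wrapper_model=DummyModelWrapper(),
                                 **extra)
    assert mean == 1
```

Parametrization trades the shared multi-input assertions of `assert_metrics` for per-case test reporting; either shape exercises the same helper.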