[pf-evals] Fix the evaluate API relative data path not working due to underlying working directory change (#3603)

# Description

This change fixes the `evaluate` API so that a relative data path keeps working when multiple evaluators are used. When the `ProxyClient` batch run is used, the data path is now resolved to an absolute path with `os.path.abspath` before being passed to `pf.run`, because the underlying batch run changes the working directory and breaks relative paths. A regression test covering a relative data path with two evaluators is added, and the CHANGELOG is updated.

# All Promptflow Contribution checklist:
- [ ] **The pull request does not introduce [breaking changes].**
- [ ] **CHANGELOG is updated for new features, bug fixes or other
significant changes.**
- [ ] **I have read the [contribution
guidelines](https://github.com/microsoft/promptflow/blob/main/CONTRIBUTING.md).**
- [ ] **I confirm that all new dependencies are compatible with the MIT
license.**
- [ ] **Create an issue and link to the pull request to get dedicated
review from promptflow team. Learn more: [suggested
workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [ ] Title of the pull request is clear and informative.
- [ ] There are a small number of commits, each of which has an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [ ] Pull request includes test coverage for the included changes.
ninghu authored Aug 5, 2024
1 parent 24df289 commit cc796bb
Showing 3 changed files with 47 additions and 2 deletions.
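For context, a minimal sketch (not part of the diff) of the call pattern this commit fixes: a data path given relative to the current working directory, evaluated with more than one evaluator. The import paths follow the package layout visible in the diff, and the model configuration is a placeholder; adjust both to your setup.

```python
import os

from promptflow.evals.evaluate import evaluate
from promptflow.evals.evaluators import FluencyEvaluator, GroundednessEvaluator

# Placeholder model configuration; real values depend on your Azure OpenAI setup.
model_config = {
    "azure_endpoint": "https://<your-endpoint>.openai.azure.com",
    "api_key": os.environ["AZURE_OPENAI_API_KEY"],
    "azure_deployment": "<your-deployment>",
}

# A data path relative to the current working directory; with multiple evaluators
# this previously failed because the batch run changed the working directory.
result = evaluate(
    data="data/evaluate_test_data.jsonl",
    evaluators={
        "grounded": GroundednessEvaluator(model_config),
        "fluency": FluencyEvaluator(model_config),
    },
)
```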
1 change: 1 addition & 0 deletions src/promptflow-evals/CHANGELOG.md
@@ -8,6 +8,7 @@

 ### Bugs Fixed
 - Large simulation was causing a jinja exception, this has been fixed.
+- Fixed the issue where the relative data path was not working with the evaluate API when using multiple evaluators.

 ### Improvements
 - Converted built-in evaluators to async-based implementation, leveraging async batch run for performance improvement. Introduced `PF_EVALS_BATCH_USE_ASYNC` environment variable to enable/disable async batch run, with the default set to False.
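The improvement note above mentions the `PF_EVALS_BATCH_USE_ASYNC` toggle. A minimal sketch of opting into the async batch run, assuming the flag is read from the process environment before `evaluate` is invoked and that `"true"` is an accepted value:

```python
import os

# Opt into the async batch run described in the changelog (default is False).
# The accepted string values are an assumption here; check the package docs.
os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "true"
```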
13 changes: 11 additions & 2 deletions src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import inspect
+import os
 import re
 from typing import Any, Callable, Dict, Optional, Set, Tuple

@@ -442,7 +443,15 @@ def _evaluate( # pylint: disable=too-many-locals
     # Batch Run
     evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
-    batch_run_client = ProxyClient(pf_client) if use_pf_client else CodeClient()
+    if use_pf_client:
+        batch_run_client = ProxyClient(pf_client)
+
+        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
+        # multiple evaluators. If the path is already absolute, abspath will return the original path.
+        data = os.path.abspath(data)
+    else:
+        batch_run_client = CodeClient()
+        data = input_data_df

     with BatchRunContext(batch_run_client):
         for evaluator_name, evaluator in evaluators.items():
@@ -452,7 +461,7 @@
                 run=target_run,
                 evaluator_name=evaluator_name,
                 column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
-                data=input_data_df if isinstance(batch_run_client, CodeClient) else data,
+                data=data,
                 stream=True,
                 name=kwargs.get("_run_name"),
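A small illustration of why the `os.path.abspath` call above is safe to apply unconditionally: it resolves a relative path against the current working directory and returns an already-absolute path unchanged, so pinning the absolute path before the batch run keeps the data location valid even if the working directory changes later. The paths below are illustrative.

```python
import os

# Relative paths are resolved against the current working directory at call time...
relative = "data/evaluate_test_data.jsonl"
resolved = os.path.abspath(relative)
print(resolved)  # e.g. /home/user/project/data/evaluate_test_data.jsonl

# ...while an already-absolute path comes back unchanged (idempotent).
assert os.path.abspath(resolved) == resolved

# Once resolved, the absolute path keeps pointing at the same file even if the
# process later changes its working directory, which is what broke relative paths.
```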
35 changes: 35 additions & 0 deletions src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
@@ -125,6 +125,41 @@ def test_evaluate_with_groundedness_evaluator(self, model_config, data_file):
         assert row_result_df["outputs.f1_score.f1_score"][2] == 1
         assert result["studio_url"] is None

+    def test_evaluate_with_relative_data_path(self, model_config):
+        original_working_dir = os.getcwd()
+
+        try:
+            working_dir = os.path.dirname(__file__)
+            os.chdir(working_dir)
+
+            data_file = "data/evaluate_test_data.jsonl"
+            input_data = pd.read_json(data_file, lines=True)
+
+            groundedness_eval = GroundednessEvaluator(model_config)
+            fluency_eval = FluencyEvaluator(model_config)
+
+            # Run the evaluation
+            result = evaluate(
+                data=data_file,
+                evaluators={"grounded": groundedness_eval, "fluency": fluency_eval},
+            )
+
+            row_result_df = pd.DataFrame(result["rows"])
+            metrics = result["metrics"]
+
+            # Validate the results
+            assert result is not None
+            assert result["rows"] is not None
+            assert row_result_df.shape[0] == len(input_data)
+
+            assert "outputs.grounded.gpt_groundedness" in row_result_df.columns.to_list()
+            assert "outputs.fluency.gpt_fluency" in row_result_df.columns.to_list()
+
+            assert "grounded.gpt_groundedness" in metrics.keys()
+            assert "fluency.gpt_fluency" in metrics.keys()
+        finally:
+            os.chdir(original_working_dir)
+
     @pytest.mark.azuretest
     def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file):
         input_data = pd.read_json(data_file, lines=True)
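The test above guards against the root cause named in the commit title: once something in the call chain changes the process working directory, a path that was given relative to the original directory no longer resolves. A standalone sketch of that failure mode (file and directory names are illustrative, and it assumes the script is not already running from the temp directory):

```python
import os
import tempfile

# Create a file addressed by a path relative to the starting directory.
start_dir = os.getcwd()
relative_path = "example.jsonl"
with open(relative_path, "w") as f:
    f.write('{"question": "hi", "answer": "hello"}\n')

assert os.path.exists(relative_path)  # resolves while the cwd is unchanged

# Simulate the batch run switching the working directory.
os.chdir(tempfile.gettempdir())
try:
    assert not os.path.exists(relative_path)  # the relative path no longer resolves
    # An absolute path captured up front would still resolve:
    assert os.path.exists(os.path.join(start_dir, relative_path))
finally:
    os.chdir(start_dir)
    os.remove(relative_path)
```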
