
Squashed commit of the following:
commit 827d31e
Author: Harutaka Kawamura <[email protected]>
Date:   Fri Oct 27 12:17:31 2023 +0900

    Fix lint error on master (mlflow#10181)

    Signed-off-by: harupy <[email protected]>

commit 15e5b94
Author: Daniel Lok <[email protected]>
Date:   Fri Oct 27 10:57:25 2023 +0800

    Add param to enable/disable saving ONNX model as external data (mlflow#10152)

    Signed-off-by: Daniel Lok <[email protected]>

commit a8ef779
Author: Harutaka Kawamura <[email protected]>
Date:   Fri Oct 27 11:34:29 2023 +0900

    Fix lint job condition (mlflow#10179)

    Signed-off-by: harupy <[email protected]>

commit 8c63df6
Author: Prithvi Kannan <[email protected]>
Date:   Thu Oct 26 18:33:02 2023 -0700

    Remove other timeouts (mlflow#10172)

commit f10fbd3
Author: Prithvi Kannan <[email protected]>
Date:   Thu Oct 26 18:30:58 2023 -0700

    Surface openai error (mlflow#10176)

commit d9b8d17
Author: Ann Zhang <[email protected]>
Date:   Thu Oct 26 18:00:35 2023 -0700

    Fix typo (mlflow#10175)

    Signed-off-by: Ann Zhang <[email protected]>

Signed-off-by: Bryan Qiu <[email protected]>
bbqiu committed Oct 27, 2023
1 parent 452ae24 commit 6d06cbf
Showing 7 changed files with 23 additions and 38 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/master.yml
@@ -32,7 +32,7 @@ jobs:
lint:
runs-on: ubuntu-latest
timeout-minutes: 30
if: github.event_name != 'pull_request' && github.event.pull_request.draft == false
if: github.event_name != 'pull_request' || github.event.pull_request.draft == false
steps:
- uses: actions/checkout@v3
- uses: ./.github/actions/untracked
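For context on the condition change above (a hedged reading, not stated in the commit itself): with `&&`, the first clause is false for every `pull_request` event, so the lint job never ran on pull requests at all; with `||`, the job runs on all non-PR events and on pull requests that are not drafts. A minimal Python model of the two expressions, ignoring GitHub Actions' own type-coercion rules and assuming `is_draft` is None for non-PR events:

    def old_condition(event_name, is_draft):
        # github.event_name != 'pull_request' && github.event.pull_request.draft == false
        return event_name != "pull_request" and is_draft == False  # noqa: E712

    def new_condition(event_name, is_draft):
        # github.event_name != 'pull_request' || github.event.pull_request.draft == false
        return event_name != "pull_request" or is_draft == False  # noqa: E712

    assert old_condition("pull_request", False) is False  # bug: ready-for-review PRs skipped
    assert new_condition("pull_request", False) is True   # ready-for-review PRs run lint
    assert new_condition("pull_request", True) is False   # draft PRs are still skipped
    assert new_condition("push", None) is True            # pushes to master run lint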
6 changes: 2 additions & 4 deletions mlflow/metrics/genai/genai_metric.py
@@ -96,7 +96,6 @@ def make_genai_metric(
aggregations: Optional[List[str]] = ["mean", "variance", "p90"], # noqa: B006
greater_is_better: bool = True,
max_workers: int = 10,
judge_request_timeout: int = 60,
) -> EvaluationMetric:
"""
Create a genai metric used to evaluate LLM using LLM as a judge in MLflow.
@@ -123,8 +122,6 @@ def make_genai_metric(
:param greater_is_better: (Optional) Whether the metric is better when it is greater.
:param max_workers: (Optional) The maximum number of workers to use for judge scoring.
Defaults to 10 workers.
:param judge_request_timeout: (Optional) The timeout in seconds for each judge scoring request.
Defaults to 60 seconds.
:return: A metric object.
@@ -281,6 +278,7 @@ def score_model_on_one_payload(
if e.error_code in [
ErrorCode.Name(BAD_REQUEST),
ErrorCode.Name(UNAUTHENTICATED),
ErrorCode.Name(INVALID_PARAMETER_VALUE),
]:
raise MlflowException(e)
return None, f"Failed to score model on payload. Error: {e!s}"
@@ -304,7 +302,7 @@ def score_model_on_one_payload(
for indx, (input, output) in enumerate(zip(inputs, outputs))
}

for future in as_completed(futures, timeout=judge_request_timeout):
for future in as_completed(futures):
indx = futures[future]
score, justification = future.result()
scores[indx] = score
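A hedged usage sketch of `make_genai_metric` after this change (argument values are illustrative, not taken from the commit): the `judge_request_timeout` parameter is gone, and judge requests submitted to the thread pool are simply awaited via `as_completed` with no overall deadline.

    from mlflow.metrics.genai import EvaluationExample, make_genai_metric

    # Illustrative example shown to the judge model.
    example = EvaluationExample(
        input="What is MLflow?",
        output="MLflow is an open source platform for managing the ML lifecycle.",
        score=4,
        justification="The answer is accurate and directly addresses the question.",
    )

    answer_quality = make_genai_metric(
        name="answer_quality",
        definition="How accurate and relevant the answer is to the question.",
        grading_prompt="Score from 1 to 5 based on accuracy and relevance.",
        examples=[example],
        model="openai:/gpt-4",
        aggregations=["mean", "variance", "p90"],
        greater_is_better=True,
        max_workers=10,  # still supported; only judge_request_timeout was removed
    )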
20 changes: 0 additions & 20 deletions mlflow/metrics/genai/metric_definitions.py
@@ -15,7 +15,6 @@ def answer_similarity(
model: Optional[str] = None,
metric_version: Optional[str] = None,
examples: Optional[List[EvaluationExample]] = None,
judge_request_timeout=60,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the answer similarity of an LLM
@@ -36,8 +35,6 @@ def answer_similarity(
:param examples: (Optional) Provide a list of examples to help the judge model evaluate the
answer similarity. It is highly recommended to add examples to be used as a reference to
evaluate the new results.
:param judge_request_timeout: (Optional) The timeout in seconds for the judge API request.
Defaults to 60 seconds.
:return: A metric object
"""
if metric_version is None:
@@ -73,7 +70,6 @@ def answer_similarity(
parameters=answer_similarity_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
judge_request_timeout=judge_request_timeout,
)


@@ -82,7 +78,6 @@ def answer_correctness(
model: Optional[str] = None,
metric_version: Optional[str] = None,
examples: Optional[List[EvaluationExample]] = None,
judge_request_timeout=60,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the answer correctness of an LLM
@@ -103,8 +98,6 @@ def answer_correctness(
:param examples: (Optional) Provide a list of examples to help the judge model evaluate the
answer correctness. It is highly recommended to add examples to be used as a reference to
evaluate the new results.
:param judge_request_timeout: (Optional) The timeout in seconds for the judge API request.
Defaults to 60 seconds.
:return: A metric object
"""
if metric_version is None:
@@ -140,7 +133,6 @@ def answer_correctness(
parameters=answer_correctness_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
judge_request_timeout=judge_request_timeout,
)


@@ -149,7 +141,6 @@ def faithfulness(
model: Optional[str] = None,
metric_version: Optional[str] = _get_latest_metric_version(),
examples: Optional[List[EvaluationExample]] = None,
judge_request_timeout=60,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the faithfullness of an LLM using the
@@ -170,8 +161,6 @@ def faithfulness(
:param examples: (Optional) Provide a list of examples to help the judge model evaluate the
faithfulness. It is highly recommended to add examples to be used as a reference to evaluate
the new results.
:param judge_request_timeout: (Optional) The timeout in seconds for the judge API request.
Defaults to 60 seconds.
:return: A metric object
"""
class_name = f"mlflow.metrics.genai.prompts.{metric_version}.FaithfulnessMetric"
@@ -205,7 +194,6 @@ def faithfulness(
parameters=faithfulness_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
judge_request_timeout=judge_request_timeout,
)


@@ -214,7 +202,6 @@ def answer_relevance(
model: Optional[str] = None,
metric_version: Optional[str] = _get_latest_metric_version(),
examples: Optional[List[EvaluationExample]] = None,
judge_request_timeout=60,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the answer relevance of an LLM
@@ -231,8 +218,6 @@ def answer_relevance(
:param examples: (Optional) Provide a list of examples to help the judge model evaluate the
answer relevance. It is highly recommended to add examples to be used as a reference to
evaluate the new results.
:param judge_request_timeout: (Optional) The timeout in seconds for the judge API request.
Defaults to 60 seconds.
:return: A metric object
"""
class_name = f"mlflow.metrics.genai.prompts.{metric_version}.AnswerRelevanceMetric"
@@ -265,15 +250,13 @@ def answer_relevance(
parameters=answer_relevance_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
judge_request_timeout=judge_request_timeout,
)


def relevance(
model: Optional[str] = None,
metric_version: Optional[str] = None,
examples: Optional[List[EvaluationExample]] = None,
judge_request_timeout=60,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the evaluate the relevance of an
@@ -294,8 +277,6 @@ def relevance(
:param examples: (Optional) Provide a list of examples to help the judge model evaluate the
relevance. It is highly recommended to add examples to be used as a reference to evaluate
the new results.
:param judge_request_timeout: (Optional) The timeout in seconds for the judge API request.
Defaults to 60 seconds.
:return: A metric object
"""
if metric_version is None:
Expand Down Expand Up @@ -331,5 +312,4 @@ def relevance(
parameters=relevance_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
judge_request_timeout=judge_request_timeout,
)
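Caller-side impact, sketched under the assumption of the pre-change signatures: code that passed `judge_request_timeout` to any of the built-in judge metrics must drop that keyword; everything else is unchanged.

    from mlflow.metrics.genai import answer_similarity, faithfulness

    # Before: answer_similarity(model="openai:/gpt-4", judge_request_timeout=120)
    # After: the keyword no longer exists, so it is simply omitted.
    similarity_metric = answer_similarity(model="openai:/gpt-4")
    faithfulness_metric = faithfulness(model="openai:/gpt-4")

    # The resulting metric objects can be passed to mlflow.evaluate(extra_metrics=[...]).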
2 changes: 1 addition & 1 deletion mlflow/models/evaluation/base.py
@@ -1766,7 +1766,7 @@ def pred_sample(eval_df, _builtin_metrics, _artifacts_dir):
# If data is a pandas dataframe, predictions must be specified
if predictions is None:
raise MlflowException(
message="The model output must be specified in the predicitons "
message="The model output must be specified in the predictions "
"parameter when model=None.",
error_code=INVALID_PARAMETER_VALUE,
)
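The guarded code path above applies to static-dataset evaluation. A minimal sketch, assuming a pandas DataFrame that already contains the model output in a `model_output` column (column names are illustrative):

    import mlflow
    import pandas as pd

    data = pd.DataFrame(
        {
            "inputs": ["What is MLflow?"],
            "ground_truth": ["An open source platform for the ML lifecycle."],
            "model_output": ["MLflow manages the end-to-end ML lifecycle."],
        }
    )

    # With no model supplied, the predictions parameter must name the column
    # holding the model output; leaving it out raises the MlflowException above.
    results = mlflow.evaluate(
        data=data,
        targets="ground_truth",
        predictions="model_output",
        model_type="question-answering",
    )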
7 changes: 6 additions & 1 deletion mlflow/onnx/__init__.py
@@ -92,6 +92,7 @@ def save_model(
onnx_execution_providers=None,
onnx_session_options=None,
metadata=None,
save_as_external_data=True,
):
"""
Save an ONNX model to a path on the local file system.
@@ -139,6 +140,7 @@ def save_model(
See onnxruntime API for further descriptions:
https://onnxruntime.ai/docs/api/python/api_summary.html#sessionoptions
:param metadata: Custom metadata dictionary passed to the model and stored in the MLmodel file.
:param save_as_external_data: Save tensors to external file(s).
.. Note:: Experimental: This parameter may change or be removed in a future
release without warning.
@@ -167,7 +169,7 @@ def save_model(

# Save onnx-model
if Version(onnx.__version__) >= Version("1.9.0"):
onnx.save_model(onnx_model, model_data_path, save_as_external_data=True)
onnx.save_model(onnx_model, model_data_path, save_as_external_data=save_as_external_data)
else:
onnx.save_model(onnx_model, model_data_path)

@@ -446,6 +448,7 @@ def log_model(
onnx_execution_providers=None,
onnx_session_options=None,
metadata=None,
save_as_external_data=True,
):
"""
Log an ONNX model as an MLflow artifact for the current run.
@@ -498,6 +501,7 @@ def log_model(
See onnxruntime API for further descriptions:
https://onnxruntime.ai/docs/api/python/api_summary.html#sessionoptions
:param metadata: Custom metadata dictionary passed to the model and stored in the MLmodel file.
:param save_as_external_data: Save tensors to external file(s).
.. Note:: Experimental: This parameter may change or be removed in a future
release without warning.
@@ -519,4 +523,5 @@ def log_model(
onnx_execution_providers=onnx_execution_providers,
onnx_session_options=onnx_session_options,
metadata=metadata,
save_as_external_data=save_as_external_data,
)
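A hedged usage sketch of the new flag: keeping all tensors inside the single `.onnx` file rather than writing external data files (the file name and run setup below are illustrative).

    import mlflow
    import onnx

    onnx_model = onnx.load("model.onnx")  # assumes an existing ONNX model file

    with mlflow.start_run():
        model_info = mlflow.onnx.log_model(
            onnx_model,
            "model",
            save_as_external_data=False,  # default remains True, matching prior behavior
        )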
2 changes: 1 addition & 1 deletion tests/evaluate/test_evaluation.py
@@ -1517,7 +1517,7 @@ def test_evaluate_with_static_dataset_error_handling_pandas_dataframe():
with pytest.raises(
MlflowException,
match="The model output must be specified in the "
"predicitons parameter when model=None.",
"predictions parameter when model=None.",
):
mlflow.evaluate(
data=X.assign(y=y, model_output=y),
22 changes: 12 additions & 10 deletions tests/onnx/test_onnx_model_export.py
@@ -253,16 +253,11 @@ def test_model_save_load(onnx_model, model_path):


def test_model_save_load_nonexternal_data(onnx_model, model_path):
original_save_model = onnx.save_model
if Version(onnx.__version__) >= Version("1.9.0"):
onnx.convert_model_to_external_data = mock.Mock()

def onnx_save_nonexternal(
model, path, save_as_external_data
): # pylint: disable=unused-argument
original_save_model(model, path, save_as_external_data=False)

with mock.patch("onnx.save_model", wraps=onnx_save_nonexternal):
mlflow.onnx.save_model(onnx_model, model_path)
mlflow.onnx.save_model(onnx_model, model_path, save_as_external_data=False)
onnx.convert_model_to_external_data.assert_not_called()

# Loading ONNX model
onnx.checker.check_model = mock.Mock()
@@ -346,10 +341,17 @@ def test_model_save_load_multiple_inputs(onnx_model_multiple_inputs_float64, mod
assert onnx.checker.check_model.called


@pytest.mark.parametrize("save_as_external_data", [True, False])
def test_model_save_load_evaluate_pyfunc_format_multiple_inputs(
onnx_model_multiple_inputs_float64, data_multiple_inputs, predicted_multiple_inputs, model_path
onnx_model_multiple_inputs_float64,
data_multiple_inputs,
predicted_multiple_inputs,
model_path,
save_as_external_data,
):
mlflow.onnx.save_model(onnx_model_multiple_inputs_float64, model_path)
mlflow.onnx.save_model(
onnx_model_multiple_inputs_float64, model_path, save_as_external_data=save_as_external_data
)

# Loading pyfunc model
pyfunc_loaded = mlflow.pyfunc.load_model(model_path)
