diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml
index a38597b7000c4..ebec120b1c0ff 100644
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@@ -32,7 +32,7 @@ jobs:
   lint:
     runs-on: ubuntu-latest
     timeout-minutes: 30
-    if: github.event_name != 'pull_request' && github.event.pull_request.draft == false
+    if: github.event_name != 'pull_request' || github.event.pull_request.draft == false
     steps:
       - uses: actions/checkout@v3
       - uses: ./.github/actions/untracked
diff --git a/mlflow/metrics/genai/genai_metric.py b/mlflow/metrics/genai/genai_metric.py
index 257f1f30cf29f..72ee2e94a40b7 100644
--- a/mlflow/metrics/genai/genai_metric.py
+++ b/mlflow/metrics/genai/genai_metric.py
@@ -96,7 +96,6 @@ def make_genai_metric(
     aggregations: Optional[List[str]] = ["mean", "variance", "p90"],  # noqa: B006
     greater_is_better: bool = True,
     max_workers: int = 10,
-    judge_request_timeout: int = 60,
 ) -> EvaluationMetric:
     """
     Create a genai metric used to evaluate LLM using LLM as a judge in MLflow.
@@ -123,8 +122,6 @@ def make_genai_metric(
     :param greater_is_better: (Optional) Whether the metric is better when it is greater.
     :param max_workers: (Optional) The maximum number of workers to use for judge scoring.
         Defaults to 10 workers.
-    :param judge_request_timeout: (Optional) The timeout in seconds for each judge scoring request.
-        Defaults to 60 seconds.
 
     :return: A metric object.
 
@@ -281,6 +278,7 @@ def score_model_on_one_payload(
             if e.error_code in [
                 ErrorCode.Name(BAD_REQUEST),
                 ErrorCode.Name(UNAUTHENTICATED),
+                ErrorCode.Name(INVALID_PARAMETER_VALUE),
             ]:
                 raise MlflowException(e)
             return None, f"Failed to score model on payload. Error: {e!s}"
@@ -304,7 +302,7 @@ def score_model_on_one_payload(
                 for indx, (input, output) in enumerate(zip(inputs, outputs))
             }
 
-            for future in as_completed(futures, timeout=judge_request_timeout):
+            for future in as_completed(futures):
                 indx = futures[future]
                 score, justification = future.result()
                 scores[indx] = score
diff --git a/mlflow/metrics/genai/metric_definitions.py b/mlflow/metrics/genai/metric_definitions.py
index 5d63f6be5f8c8..51370dc4f8360 100644
--- a/mlflow/metrics/genai/metric_definitions.py
+++ b/mlflow/metrics/genai/metric_definitions.py
@@ -15,7 +15,6 @@ def answer_similarity(
     model: Optional[str] = None,
     metric_version: Optional[str] = None,
     examples: Optional[List[EvaluationExample]] = None,
-    judge_request_timeout=60,
 ) -> EvaluationMetric:
     """
     This function will create a genai metric used to evaluate the answer similarity of an LLM
@@ -36,8 +35,6 @@ def answer_similarity(
     :param examples: (Optional) Provide a list of examples to help the judge model evaluate the
         answer similarity. It is highly recommended to add examples to be used as a reference to
         evaluate the new results.
-    :param judge_request_timeout: (Optional) The timeout in seconds for the judge API request.
-        Defaults to 60 seconds.
     :return: A metric object
     """
     if metric_version is None:
@@ -73,7 +70,6 @@ def answer_similarity(
         parameters=answer_similarity_class_module.parameters,
         aggregations=["mean", "variance", "p90"],
         greater_is_better=True,
-        judge_request_timeout=judge_request_timeout,
     )
 
 
@@ -82,7 +78,6 @@ def answer_correctness(
     model: Optional[str] = None,
     metric_version: Optional[str] = None,
     examples: Optional[List[EvaluationExample]] = None,
-    judge_request_timeout=60,
 ) -> EvaluationMetric:
     """
     This function will create a genai metric used to evaluate the answer correctness of an LLM
@@ -103,8 +98,6 @@ def answer_correctness(
     :param examples: (Optional) Provide a list of examples to help the judge model evaluate the
         answer correctness. It is highly recommended to add examples to be used as a reference to
         evaluate the new results.
-    :param judge_request_timeout: (Optional) The timeout in seconds for the judge API request.
-        Defaults to 60 seconds.
     :return: A metric object
     """
     if metric_version is None:
@@ -140,7 +133,6 @@ def answer_correctness(
         parameters=answer_correctness_class_module.parameters,
         aggregations=["mean", "variance", "p90"],
         greater_is_better=True,
-        judge_request_timeout=judge_request_timeout,
     )
 
 
@@ -149,7 +141,6 @@ def faithfulness(
     model: Optional[str] = None,
     metric_version: Optional[str] = _get_latest_metric_version(),
     examples: Optional[List[EvaluationExample]] = None,
-    judge_request_timeout=60,
 ) -> EvaluationMetric:
     """
     This function will create a genai metric used to evaluate the faithfullness of an LLM using the
@@ -170,8 +161,6 @@ def faithfulness(
     :param examples: (Optional) Provide a list of examples to help the judge model evaluate the
         faithfulness. It is highly recommended to add examples to be used as a reference to
         evaluate the new results.
-    :param judge_request_timeout: (Optional) The timeout in seconds for the judge API request.
-        Defaults to 60 seconds.
     :return: A metric object
     """
     class_name = f"mlflow.metrics.genai.prompts.{metric_version}.FaithfulnessMetric"
@@ -205,7 +194,6 @@ def faithfulness(
         parameters=faithfulness_class_module.parameters,
         aggregations=["mean", "variance", "p90"],
         greater_is_better=True,
-        judge_request_timeout=judge_request_timeout,
     )
 
 
@@ -214,7 +202,6 @@ def answer_relevance(
     model: Optional[str] = None,
     metric_version: Optional[str] = _get_latest_metric_version(),
     examples: Optional[List[EvaluationExample]] = None,
-    judge_request_timeout=60,
 ) -> EvaluationMetric:
     """
     This function will create a genai metric used to evaluate the answer relevance of an LLM
@@ -231,8 +218,6 @@ def answer_relevance(
     :param examples: (Optional) Provide a list of examples to help the judge model evaluate the
         answer relevance. It is highly recommended to add examples to be used as a reference to
         evaluate the new results.
-    :param judge_request_timeout: (Optional) The timeout in seconds for the judge API request.
-        Defaults to 60 seconds.
     :return: A metric object
     """
     class_name = f"mlflow.metrics.genai.prompts.{metric_version}.AnswerRelevanceMetric"
@@ -265,7 +250,6 @@ def answer_relevance(
         parameters=answer_relevance_class_module.parameters,
         aggregations=["mean", "variance", "p90"],
         greater_is_better=True,
-        judge_request_timeout=judge_request_timeout,
     )
 
 
@@ -273,7 +257,6 @@ def relevance(
     model: Optional[str] = None,
     metric_version: Optional[str] = None,
     examples: Optional[List[EvaluationExample]] = None,
-    judge_request_timeout=60,
 ) -> EvaluationMetric:
     """
     This function will create a genai metric used to evaluate the evaluate the relevance of an
@@ -294,8 +277,6 @@ def relevance(
     :param examples: (Optional) Provide a list of examples to help the judge model evaluate the
         relevance. It is highly recommended to add examples to be used as a reference to evaluate
         the new results.
-    :param judge_request_timeout: (Optional) The timeout in seconds for the judge API request.
-        Defaults to 60 seconds.
     :return: A metric object
     """
     if metric_version is None:
@@ -331,5 +312,4 @@ def relevance(
         parameters=relevance_class_module.parameters,
         aggregations=["mean", "variance", "p90"],
         greater_is_better=True,
-        judge_request_timeout=judge_request_timeout,
     )
diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py
index b416be1507fe1..d761312f36d6f 100644
--- a/mlflow/models/evaluation/base.py
+++ b/mlflow/models/evaluation/base.py
@@ -1766,7 +1766,7 @@ def pred_sample(eval_df, _builtin_metrics, _artifacts_dir):
         # If data is a pandas dataframe, predictions must be specified
         if predictions is None:
             raise MlflowException(
-                message="The model output must be specified in the predicitons "
+                message="The model output must be specified in the predictions "
                 "parameter when model=None.",
                 error_code=INVALID_PARAMETER_VALUE,
             )
diff --git a/mlflow/onnx/__init__.py b/mlflow/onnx/__init__.py
index 997dda2ac1516..91779113c278c 100644
--- a/mlflow/onnx/__init__.py
+++ b/mlflow/onnx/__init__.py
@@ -92,6 +92,7 @@ def save_model(
     onnx_execution_providers=None,
     onnx_session_options=None,
     metadata=None,
+    save_as_external_data=True,
 ):
     """
     Save an ONNX model to a path on the local file system.
@@ -139,6 +140,7 @@ def save_model(
                         See onnxruntime API for further descriptions:
                         https://onnxruntime.ai/docs/api/python/api_summary.html#sessionoptions
     :param metadata: Custom metadata dictionary passed to the model and stored in the MLmodel file.
+    :param save_as_external_data: Save tensors to external file(s).
 
                      .. Note:: Experimental: This parameter may change or be removed in a future
                                              release without warning.
@@ -167,7 +169,7 @@ def save_model(
 
     # Save onnx-model
     if Version(onnx.__version__) >= Version("1.9.0"):
-        onnx.save_model(onnx_model, model_data_path, save_as_external_data=True)
+        onnx.save_model(onnx_model, model_data_path, save_as_external_data=save_as_external_data)
     else:
         onnx.save_model(onnx_model, model_data_path)
 
@@ -446,6 +448,7 @@ def log_model(
     onnx_execution_providers=None,
     onnx_session_options=None,
     metadata=None,
+    save_as_external_data=True,
 ):
     """
     Log an ONNX model as an MLflow artifact for the current run.
@@ -498,6 +501,7 @@ def log_model(
                        See onnxruntime API for further descriptions:
                        https://onnxruntime.ai/docs/api/python/api_summary.html#sessionoptions
     :param metadata: Custom metadata dictionary passed to the model and stored in the MLmodel file.
+    :param save_as_external_data: Save tensors to external file(s).
 
                      .. Note:: Experimental: This parameter may change or be removed in a future
                                              release without warning.
@@ -519,4 +523,5 @@ def log_model(
         onnx_execution_providers=onnx_execution_providers,
         onnx_session_options=onnx_session_options,
         metadata=metadata,
+        save_as_external_data=save_as_external_data,
     )
diff --git a/tests/evaluate/test_evaluation.py b/tests/evaluate/test_evaluation.py
index 01d4d603bef97..cd2f7cae2c3bc 100644
--- a/tests/evaluate/test_evaluation.py
+++ b/tests/evaluate/test_evaluation.py
@@ -1517,7 +1517,7 @@ def test_evaluate_with_static_dataset_error_handling_pandas_dataframe():
     with pytest.raises(
         MlflowException,
         match="The model output must be specified in the "
-        "predicitons parameter when model=None.",
+        "predictions parameter when model=None.",
     ):
         mlflow.evaluate(
            data=X.assign(y=y, model_output=y),
diff --git a/tests/onnx/test_onnx_model_export.py b/tests/onnx/test_onnx_model_export.py
index 6f8a5dcf15702..0a9a154b60dc7 100644
--- a/tests/onnx/test_onnx_model_export.py
+++ b/tests/onnx/test_onnx_model_export.py
@@ -253,16 +253,11 @@ def test_model_save_load(onnx_model, model_path):
 
 
 def test_model_save_load_nonexternal_data(onnx_model, model_path):
-    original_save_model = onnx.save_model
     if Version(onnx.__version__) >= Version("1.9.0"):
+        onnx.convert_model_to_external_data = mock.Mock()
-
-        def onnx_save_nonexternal(
-            model, path, save_as_external_data
-        ):  # pylint: disable=unused-argument
-            original_save_model(model, path, save_as_external_data=False)
-
-        with mock.patch("onnx.save_model", wraps=onnx_save_nonexternal):
-            mlflow.onnx.save_model(onnx_model, model_path)
+
+    mlflow.onnx.save_model(onnx_model, model_path, save_as_external_data=False)
+    onnx.convert_model_to_external_data.assert_not_called()
 
     # Loading ONNX model
     onnx.checker.check_model = mock.Mock()
@@ -346,10 +341,17 @@ def test_model_save_load_multiple_inputs(onnx_model_multiple_inputs_float64, mod
     assert onnx.checker.check_model.called
 
 
+@pytest.mark.parametrize("save_as_external_data", [True, False])
 def test_model_save_load_evaluate_pyfunc_format_multiple_inputs(
-    onnx_model_multiple_inputs_float64, data_multiple_inputs, predicted_multiple_inputs, model_path
+    onnx_model_multiple_inputs_float64,
+    data_multiple_inputs,
+    predicted_multiple_inputs,
+    model_path,
+    save_as_external_data,
 ):
-    mlflow.onnx.save_model(onnx_model_multiple_inputs_float64, model_path)
+    mlflow.onnx.save_model(
+        onnx_model_multiple_inputs_float64, model_path, save_as_external_data=save_as_external_data
+    )
 
     # Loading pyfunc model
     pyfunc_loaded = mlflow.pyfunc.load_model(model_path)
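
Reviewer note: a minimal usage sketch (not part of the diff) of the new save_as_external_data flag added to mlflow.onnx.save_model and mlflow.onnx.log_model. The model file and output paths below are placeholder assumptions; the default of True keeps the previous behavior of writing tensors to external data files on onnx>=1.9.0, while False keeps everything inside a single .onnx file.

import mlflow
import mlflow.onnx
import onnx

# Placeholder model; any in-memory onnx.ModelProto works here.
onnx_model = onnx.load("model.onnx")

# Keep all tensors inside the single .onnx file instead of external data files.
mlflow.onnx.save_model(onnx_model, "saved_onnx_model", save_as_external_data=False)

# Same flag on log_model; omitting it (default True) preserves the old behavior.
with mlflow.start_run():
    mlflow.onnx.log_model(onnx_model, "model", save_as_external_data=False)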