
Squashed commit of the following:
commit 827d31e
Author: Harutaka Kawamura <[email protected]>
Date:   Fri Oct 27 12:17:31 2023 +0900

    Fix lint error on master (mlflow#10181)

    Signed-off-by: harupy <[email protected]>

commit 15e5b94
Author: Daniel Lok <[email protected]>
Date:   Fri Oct 27 10:57:25 2023 +0800

    Add param to enable/disable saving ONNX model as external data (mlflow#10152)

    Signed-off-by: Daniel Lok <[email protected]>

commit a8ef779
Author: Harutaka Kawamura <[email protected]>
Date:   Fri Oct 27 11:34:29 2023 +0900

    Fix lint job condition (mlflow#10179)

    Signed-off-by: harupy <[email protected]>

commit 8c63df6
Author: Prithvi Kannan <[email protected]>
Date:   Thu Oct 26 18:33:02 2023 -0700

    Remove other timeouts (mlflow#10172)

commit f10fbd3
Author: Prithvi Kannan <[email protected]>
Date:   Thu Oct 26 18:30:58 2023 -0700

    Surface openai error (mlflow#10176)

commit d9b8d17
Author: Ann Zhang <[email protected]>
Date:   Thu Oct 26 18:00:35 2023 -0700

    Fix typo (mlflow#10175)

    Signed-off-by: Ann Zhang <[email protected]>

Signed-off-by: Bryan Qiu <[email protected]>
bbqiu committed Oct 27, 2023
1 parent 452ae24 commit 6d06cbf
Showing 7 changed files with 23 additions and 38 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/master.yml
@@ -32,7 +32,7 @@ jobs:
lint:
runs-on: ubuntu-latest
timeout-minutes: 30
if: github.event_name != 'pull_request' && github.event.pull_request.draft == false
if: github.event_name != 'pull_request' || github.event.pull_request.draft == false
steps:
- uses: actions/checkout@v3
- uses: ./.github/actions/untracked
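For context on the condition change above (a hedged reading, not stated in the commit itself): with `&&`, the first clause is false for every `pull_request` event, so the lint job never ran on pull requests at all; with `||`, the job runs on all non-PR events and on pull requests that are not drafts. A minimal Python model of the two expressions, ignoring GitHub Actions' own type-coercion rules and assuming `is_draft` is None for non-PR events:

    def old_condition(event_name, is_draft):
        # github.event_name != 'pull_request' && github.event.pull_request.draft == false
        return event_name != "pull_request" and is_draft == False  # noqa: E712

    def new_condition(event_name, is_draft):
        # github.event_name != 'pull_request' || github.event.pull_request.draft == false
        return event_name != "pull_request" or is_draft == False  # noqa: E712

    assert old_condition("pull_request", False) is False  # bug: ready-for-review PRs skipped
    assert new_condition("pull_request", False) is True   # ready-for-review PRs run lint
    assert new_condition("pull_request", True) is False   # draft PRs are still skipped
    assert new_condition("push", None) is True            # pushes to master run lint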
6 changes: 2 additions & 4 deletions mlflow/metrics/genai/genai_metric.py
@@ -96,7 +96,6 @@ def make_genai_metric(
aggregations: Optional[List[str]] = ["mean", "variance", "p90"], # noqa: B006
greater_is_better: bool = True,
max_workers: int = 10,
judge_request_timeout: int = 60,
) -> EvaluationMetric:
"""
Create a genai metric used to evaluate LLM using LLM as a judge in MLflow.
@@ -123,8 +122,6 @@ def make_genai_metric(
:param greater_is_better: (Optional) Whether the metric is better when it is greater.
:param max_workers: (Optional) The maximum number of workers to use for judge scoring.
Defaults to 10 workers.
:param judge_request_timeout: (Optional) The timeout in seconds for each judge scoring request.
Defaults to 60 seconds.
:return: A metric object.
@@ -281,6 +278,7 @@ def score_model_on_one_payload(
if e.error_code in [
ErrorCode.Name(BAD_REQUEST),
ErrorCode.Name(UNAUTHENTICATED),
ErrorCode.Name(INVALID_PARAMETER_VALUE),
]:
raise MlflowException(e)
return None, f"Failed to score model on payload. Error: {e!s}"
@@ -304,7 +302,7 @@ def score_model_on_one_payload(
for indx, (input, output) in enumerate(zip(inputs, outputs))
}

for future in as_completed(futures, timeout=judge_request_timeout):
for future in as_completed(futures):
indx = futures[future]
score, justification = future.result()
scores[indx] = score
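A hedged usage sketch of `make_genai_metric` after this change (argument values are illustrative, not taken from the commit): the `judge_request_timeout` parameter is gone, and judge requests submitted to the thread pool are simply awaited via `as_completed` with no overall deadline.

    from mlflow.metrics.genai import EvaluationExample, make_genai_metric

    # Illustrative example shown to the judge model.
    example = EvaluationExample(
        input="What is MLflow?",
        output="MLflow is an open source platform for managing the ML lifecycle.",
        score=4,
        justification="The answer is accurate and directly addresses the question.",
    )

    answer_quality = make_genai_metric(
        name="answer_quality",
        definition="How accurate and relevant the answer is to the question.",
        grading_prompt="Score from 1 to 5 based on accuracy and relevance.",
        examples=[example],
        model="openai:/gpt-4",
        aggregations=["mean", "variance", "p90"],
        greater_is_better=True,
        max_workers=10,  # still supported; only judge_request_timeout was removed
    )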
20 changes: 0 additions & 20 deletions mlflow/metrics/genai/metric_definitions.py
@@ -15,7 +15,6 @@ def answer_similarity(
model: Optional[str] = None,
metric_version: Optional[str] = None,
examples: Optional[List[EvaluationExample]] = None,
judge_request_timeout=60,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the answer similarity of an LLM
@@ -36,8 +35,6 @@ def answer_similarity(
:param examples: (Optional) Provide a list of examples to help the judge model evaluate the
answer similarity. It is highly recommended to add examples to be used as a reference to
evaluate the new results.
:param judge_request_timeout: (Optional) The timeout in seconds for the judge API request.
Defaults to 60 seconds.
:return: A metric object
"""
if metric_version is None:
@@ -73,7 +70,6 @@ def answer_similarity(
parameters=answer_similarity_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
judge_request_timeout=judge_request_timeout,
)


@@ -82,7 +78,6 @@ def answer_correctness(
model: Optional[str] = None,
metric_version: Optional[str] = None,
examples: Optional[List[EvaluationExample]] = None,
judge_request_timeout=60,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the answer correctness of an LLM
@@ -103,8 +98,6 @@ def answer_correctness(
:param examples: (Optional) Provide a list of examples to help the judge model evaluate the
answer correctness. It is highly recommended to add examples to be used as a reference to
evaluate the new results.
:param judge_request_timeout: (Optional) The timeout in seconds for the judge API request.
Defaults to 60 seconds.
:return: A metric object
"""
if metric_version is None:
@@ -140,7 +133,6 @@ def answer_correctness(
parameters=answer_correctness_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
judge_request_timeout=judge_request_timeout,
)


@@ -149,7 +141,6 @@ def faithfulness(
model: Optional[str] = None,
metric_version: Optional[str] = _get_latest_metric_version(),
examples: Optional[List[EvaluationExample]] = None,
judge_request_timeout=60,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the faithfullness of an LLM using the
@@ -170,8 +161,6 @@ def faithfulness(
:param examples: (Optional) Provide a list of examples to help the judge model evaluate the
faithfulness. It is highly recommended to add examples to be used as a reference to evaluate
the new results.
:param judge_request_timeout: (Optional) The timeout in seconds for the judge API request.
Defaults to 60 seconds.
:return: A metric object
"""
class_name = f"mlflow.metrics.genai.prompts.{metric_version}.FaithfulnessMetric"
@@ -205,7 +194,6 @@ def faithfulness(
parameters=faithfulness_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
judge_request_timeout=judge_request_timeout,
)


@@ -214,7 +202,6 @@ def answer_relevance(
model: Optional[str] = None,
metric_version: Optional[str] = _get_latest_metric_version(),
examples: Optional[List[EvaluationExample]] = None,
judge_request_timeout=60,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the answer relevance of an LLM
@@ -231,8 +218,6 @@ def answer_relevance(
:param examples: (Optional) Provide a list of examples to help the judge model evaluate the
answer relevance. It is highly recommended to add examples to be used as a reference to
evaluate the new results.
:param judge_request_timeout: (Optional) The timeout in seconds for the judge API request.
Defaults to 60 seconds.
:return: A metric object
"""
class_name = f"mlflow.metrics.genai.prompts.{metric_version}.AnswerRelevanceMetric"
@@ -265,15 +250,13 @@ def answer_relevance(
parameters=answer_relevance_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
judge_request_timeout=judge_request_timeout,
)


def relevance(
model: Optional[str] = None,
metric_version: Optional[str] = None,
examples: Optional[List[EvaluationExample]] = None,
judge_request_timeout=60,
) -> EvaluationMetric:
"""
This function will create a genai metric used to evaluate the evaluate the relevance of an
@@ -294,8 +277,6 @@ def relevance(
:param examples: (Optional) Provide a list of examples to help the judge model evaluate the
relevance. It is highly recommended to add examples to be used as a reference to evaluate
the new results.
:param judge_request_timeout: (Optional) The timeout in seconds for the judge API request.
Defaults to 60 seconds.
:return: A metric object
"""
if metric_version is None:
Expand Down Expand Up @@ -331,5 +312,4 @@ def relevance(
parameters=relevance_class_module.parameters,
aggregations=["mean", "variance", "p90"],
greater_is_better=True,
judge_request_timeout=judge_request_timeout,
)
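Caller-side impact, sketched under the assumption of the pre-change signatures: code that passed `judge_request_timeout` to any of the built-in judge metrics must drop that keyword; everything else is unchanged.

    from mlflow.metrics.genai import answer_similarity, faithfulness

    # Before: answer_similarity(model="openai:/gpt-4", judge_request_timeout=120)
    # After: the keyword no longer exists, so it is simply omitted.
    similarity_metric = answer_similarity(model="openai:/gpt-4")
    faithfulness_metric = faithfulness(model="openai:/gpt-4")

    # The resulting metric objects can be passed to mlflow.evaluate(extra_metrics=[...]).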
2 changes: 1 addition & 1 deletion mlflow/models/evaluation/base.py
@@ -1766,7 +1766,7 @@ def pred_sample(eval_df, _builtin_metrics, _artifacts_dir):
# If data is a pandas dataframe, predictions must be specified
if predictions is None:
raise MlflowException(
message="The model output must be specified in the predicitons "
message="The model output must be specified in the predictions "
"parameter when model=None.",
error_code=INVALID_PARAMETER_VALUE,
)
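The guarded code path above applies to static-dataset evaluation. A minimal sketch, assuming a pandas DataFrame that already contains the model output in a `model_output` column (column names are illustrative):

    import mlflow
    import pandas as pd

    data = pd.DataFrame(
        {
            "inputs": ["What is MLflow?"],
            "ground_truth": ["An open source platform for the ML lifecycle."],
            "model_output": ["MLflow manages the end-to-end ML lifecycle."],
        }
    )

    # With no model supplied, the predictions parameter must name the column
    # holding the model output; leaving it out raises the MlflowException above.
    results = mlflow.evaluate(
        data=data,
        targets="ground_truth",
        predictions="model_output",
        model_type="question-answering",
    )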
7 changes: 6 additions & 1 deletion mlflow/onnx/__init__.py
@@ -92,6 +92,7 @@ def save_model(
onnx_execution_providers=None,
onnx_session_options=None,
metadata=None,
save_as_external_data=True,
):
"""
Save an ONNX model to a path on the local file system.
@@ -139,6 +140,7 @@ def save_model(
See onnxruntime API for further descriptions:
https://onnxruntime.ai/docs/api/python/api_summary.html#sessionoptions
:param metadata: Custom metadata dictionary passed to the model and stored in the MLmodel file.
:param save_as_external_data: Save tensors to external file(s).
.. Note:: Experimental: This parameter may change or be removed in a future
release without warning.
@@ -167,7 +169,7 @@ def save_model(

# Save onnx-model
if Version(onnx.__version__) >= Version("1.9.0"):
onnx.save_model(onnx_model, model_data_path, save_as_external_data=True)
onnx.save_model(onnx_model, model_data_path, save_as_external_data=save_as_external_data)
else:
onnx.save_model(onnx_model, model_data_path)

@@ -446,6 +448,7 @@ def log_model(
onnx_execution_providers=None,
onnx_session_options=None,
metadata=None,
save_as_external_data=True,
):
"""
Log an ONNX model as an MLflow artifact for the current run.
@@ -498,6 +501,7 @@ def log_model(
See onnxruntime API for further descriptions:
https://onnxruntime.ai/docs/api/python/api_summary.html#sessionoptions
:param metadata: Custom metadata dictionary passed to the model and stored in the MLmodel file.
:param save_as_external_data: Save tensors to external file(s).
.. Note:: Experimental: This parameter may change or be removed in a future
release without warning.
@@ -519,4 +523,5 @@ def log_model(
onnx_execution_providers=onnx_execution_providers,
onnx_session_options=onnx_session_options,
metadata=metadata,
save_as_external_data=save_as_external_data,
)
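A hedged usage sketch of the new flag: keeping all tensors inside the single `.onnx` file rather than writing external data files (the file name and run setup below are illustrative).

    import mlflow
    import onnx

    onnx_model = onnx.load("model.onnx")  # assumes an existing ONNX model file

    with mlflow.start_run():
        model_info = mlflow.onnx.log_model(
            onnx_model,
            "model",
            save_as_external_data=False,  # default remains True, matching prior behavior
        )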
2 changes: 1 addition & 1 deletion tests/evaluate/test_evaluation.py
@@ -1517,7 +1517,7 @@ def test_evaluate_with_static_dataset_error_handling_pandas_dataframe():
with pytest.raises(
MlflowException,
match="The model output must be specified in the "
"predicitons parameter when model=None.",
"predictions parameter when model=None.",
):
mlflow.evaluate(
data=X.assign(y=y, model_output=y),
22 changes: 12 additions & 10 deletions tests/onnx/test_onnx_model_export.py
@@ -253,16 +253,11 @@ def test_model_save_load(onnx_model, model_path):


def test_model_save_load_nonexternal_data(onnx_model, model_path):
original_save_model = onnx.save_model
if Version(onnx.__version__) >= Version("1.9.0"):
onnx.convert_model_to_external_data = mock.Mock()

def onnx_save_nonexternal(
model, path, save_as_external_data
): # pylint: disable=unused-argument
original_save_model(model, path, save_as_external_data=False)

with mock.patch("onnx.save_model", wraps=onnx_save_nonexternal):
mlflow.onnx.save_model(onnx_model, model_path)
mlflow.onnx.save_model(onnx_model, model_path, save_as_external_data=False)
onnx.convert_model_to_external_data.assert_not_called()

# Loading ONNX model
onnx.checker.check_model = mock.Mock()
@@ -346,10 +341,17 @@ def test_model_save_load_multiple_inputs(onnx_model_multiple_inputs_float64, mod
assert onnx.checker.check_model.called


@pytest.mark.parametrize("save_as_external_data", [True, False])
def test_model_save_load_evaluate_pyfunc_format_multiple_inputs(
onnx_model_multiple_inputs_float64, data_multiple_inputs, predicted_multiple_inputs, model_path
onnx_model_multiple_inputs_float64,
data_multiple_inputs,
predicted_multiple_inputs,
model_path,
save_as_external_data,
):
mlflow.onnx.save_model(onnx_model_multiple_inputs_float64, model_path)
mlflow.onnx.save_model(
onnx_model_multiple_inputs_float64, model_path, save_as_external_data=save_as_external_data
)

# Loading pyfunc model
pyfunc_loaded = mlflow.pyfunc.load_model(model_path)
