diff --git a/src/google/adk/evaluation/local_eval_service.py b/src/google/adk/evaluation/local_eval_service.py
index f454266e00..8365a02e3c 100644
--- a/src/google/adk/evaluation/local_eval_service.py
+++ b/src/google/adk/evaluation/local_eval_service.py
@@ -226,6 +226,22 @@ async def _evaluate_single_inference_result(
         else 'test_user_id'
     )
 
+    if (
+        inference_result.status == InferenceStatus.FAILURE
+        or inference_result.inferences is None
+    ):
+      logger.error(
+          'Evaluation attempted on failed inference for eval case `%s`.'
+          ' Error: %s',
+          inference_result.eval_case_id,
+          inference_result.error_message,
+      )
+      eval_case_result = await self._build_not_evaluated_eval_case_result(
+          inference_result=inference_result,
+          user_id=user_id,
+      )
+      return (inference_result, eval_case_result)
+
     if eval_case.conversation_scenario is None and len(
         inference_result.inferences
     ) != len(eval_case.conversation):
@@ -389,6 +405,31 @@ def _generate_final_eval_status(
 
     return final_eval_status
 
+  async def _build_not_evaluated_eval_case_result(
+      self,
+      *,
+      inference_result: InferenceResult,
+      user_id: str,
+  ) -> EvalCaseResult:
+    """Constructs an EvalCaseResult for cases that could not be evaluated."""
+    session_details = await self._session_service.get_session(
+        app_name=inference_result.app_name,
+        user_id=user_id,
+        session_id=inference_result.session_id,
+    )
+
+    return EvalCaseResult(
+        eval_set_file=inference_result.eval_set_id,
+        eval_set_id=inference_result.eval_set_id,
+        eval_id=inference_result.eval_case_id,
+        final_eval_status=EvalStatus.NOT_EVALUATED,
+        overall_eval_metric_results=[],
+        eval_metric_result_per_invocation=[],
+        session_id=inference_result.session_id,
+        session_details=session_details,
+        user_id=user_id,
+    )
+
   async def _perform_inference_single_eval_item(
       self,
       app_name: str,
diff --git a/tests/unittests/evaluation/test_local_eval_service.py b/tests/unittests/evaluation/test_local_eval_service.py
index 66080828d8..527b1cc609 100644
--- a/tests/unittests/evaluation/test_local_eval_service.py
+++ b/tests/unittests/evaluation/test_local_eval_service.py
@@ -314,6 +314,82 @@ async def test_evaluate_success(
   assert mock_eval_set_results_manager.save_eval_set_result.call_count == 2
 
 
+@pytest.mark.asyncio
+async def test_evaluate_skips_failed_inference_results(
+    eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker
+):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
+  inference_results = [
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_failure",
+          inferences=None,
+          session_id="session_fail",
+          status=InferenceStatus.FAILURE,
+          error_message="simulated failure",
+      ),
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_success",
+          inferences=[invocation.model_copy(deep=True)],
+          session_id="session_success",
+          status=InferenceStatus.SUCCESS,
+      ),
+      InferenceResult(
+          app_name="test_app",
+          eval_set_id="test_eval_set",
+          eval_case_id="case_unknown",
+          inferences=[invocation.model_copy(deep=True)],
+          session_id="session_unknown",
+          status=InferenceStatus.UNKNOWN,
+      ),
+  ]
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_request = EvaluateRequest(
+      inference_results=inference_results,
+      evaluate_config=EvaluateConfig(eval_metrics=[eval_metric], parallelism=2),
+  )
+
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
+  mock_eval_case.conversation_scenario = None
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  results = []
+  async for result in eval_service.evaluate(evaluate_request):
+    results.append(result)
+
+  assert len(results) == 3
+  results_by_case = {result.eval_id: result for result in results}
+
+  failure_result = results_by_case["case_failure"]
+  assert failure_result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert failure_result.overall_eval_metric_results == []
+  assert failure_result.eval_metric_result_per_invocation == []
+
+  for case_id in ["case_success", "case_unknown"]:
+    case_result = results_by_case[case_id]
+    assert case_result.final_eval_status == EvalStatus.PASSED
+    assert len(case_result.overall_eval_metric_results) == 1
+    assert (
+        case_result.overall_eval_metric_results[0].metric_name == "fake_metric"
+    )
+    assert case_result.overall_eval_metric_results[0].score == 0.9
+
+  assert mock_eval_sets_manager.get_eval_case.call_count == 3
+  assert mock_eval_set_results_manager.save_eval_set_result.call_count == 3
+
+
 @pytest.mark.asyncio
 async def test_evaluate_eval_case_not_found(
     eval_service,
@@ -407,6 +483,93 @@ async def test_evaluate_single_inference_result(
   assert metric_result.eval_status == EvalStatus.PASSED
 
 
+@pytest.mark.asyncio
+async def test_evaluate_single_inference_result_handles_failed_inference(
+    eval_service, mock_eval_sets_manager, mocker
+):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
+  inference_result = InferenceResult(
+      app_name="test_app",
+      eval_set_id="test_eval_set",
+      eval_case_id="case1",
+      inferences=None,
+      session_id="session1",
+      status=InferenceStatus.FAILURE,
+      error_message="simulated inference failure",
+  )
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
+
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
+  mock_eval_case.conversation_scenario = None
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  _, result = await eval_service._evaluate_single_inference_result(
+      inference_result=inference_result, evaluate_config=evaluate_config
+  )
+
+  assert isinstance(result, EvalCaseResult)
+  assert result.eval_id == "case1"
+  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert result.overall_eval_metric_results == []
+  assert result.eval_metric_result_per_invocation == []
+  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
+      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
+  )
+
+
+@pytest.mark.asyncio
+async def test_evaluate_single_inference_result_handles_missing_inferences(
+    eval_service, mock_eval_sets_manager, mocker
+):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
+  inference_result = InferenceResult(
+      app_name="test_app",
+      eval_set_id="test_eval_set",
+      eval_case_id="case1",
+      inferences=None,
+      session_id="session1",
+      status=InferenceStatus.SUCCESS,
+  )
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
+
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
+  mock_eval_case.conversation_scenario = None
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  _, result = await eval_service._evaluate_single_inference_result(
+      inference_result=inference_result, evaluate_config=evaluate_config
+  )
+
+  assert isinstance(result, EvalCaseResult)
+  assert result.eval_id == "case1"
+  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert result.overall_eval_metric_results == []
+  assert result.eval_metric_result_per_invocation == []
+  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
+      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
+  )
+
+
 @pytest.mark.asyncio
 async def test_evaluate_single_inference_result_for_conversation_scenario(
     eval_service, mock_eval_sets_manager, mocker
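
Usage sketch (illustrative only, not part of the patch): with this change, callers of the `evaluate` async generator exercised in the tests above can separate skipped cases by checking for `EvalStatus.NOT_EVALUATED`. The helper name `split_results` and the import paths are assumptions for the sketch; `eval_service` and `evaluate_request` are assumed to be built as in `test_evaluate_skips_failed_inference_results`.

# Hypothetical helper, not part of the patch; import locations are assumed.
from google.adk.evaluation.eval_result import EvalCaseResult
from google.adk.evaluation.evaluator import EvalStatus


async def split_results(eval_service, evaluate_request):
  """Separates evaluated results from cases skipped due to failed inference."""
  evaluated: list[EvalCaseResult] = []
  not_evaluated: list[EvalCaseResult] = []
  async for eval_case_result in eval_service.evaluate(evaluate_request):
    if eval_case_result.final_eval_status == EvalStatus.NOT_EVALUATED:
      # Inference failed or produced no inferences; no metrics were computed.
      not_evaluated.append(eval_case_result)
    else:
      evaluated.append(eval_case_result)
  return evaluated, not_evaluated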