From 550bc0332050a4c5aae522b421e705d30ea5d84a Mon Sep 17 00:00:00 2001
From: ahibrahim
Date: Mon, 10 Nov 2025 23:32:34 +0200
Subject: [PATCH 1/7] changes

---
 .../_groundedness/_groundedness.py            | 155 ++++++++++++++++--
 1 file changed, 138 insertions(+), 17 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
index b66c177d7bbc..87596f225614 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
@@ -2,6 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import os, logging
+import re
+import math
 from typing import Dict, List, Optional, Union, Any, Tuple
 from typing_extensions import overload, override
@@ -9,6 +11,7 @@
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
 from ..._common.utils import (
     ErrorBlame,
     ErrorTarget,
@@ -103,21 +106,29 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_NO_QUERY)  # Default to no query
-
-        self._higher_is_better = True
         super().__init__(
             model_config=model_config,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
             credential=credential,
-            _higher_is_better=self._higher_is_better,
+            _higher_is_better=True,
             **kwargs,
         )
         self._model_config = model_config
-        self.threshold = threshold  # Needs to be set because it's used in call method to re-validate prompt if `query` is provided
+        # To make sure they're not used directly
+        self._flow = None
+        self._prompty_file = None
+
+        self._flow_with_query = self._load_flow(
+            self._PROMPTY_FILE_WITH_QUERY, credential=credential
+        )
+        self._flow_no_query = self._load_flow(
+            self._PROMPTY_FILE_NO_QUERY, credential=credential
+        )
+
     @overload
     def __call__(
         self,
@@ -201,16 +212,18 @@ def __call__(  # pylint: disable=docstring-missing-param
         :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
         """
-        if kwargs.get("query", None):
-            self._ensure_query_prompty_loaded()
-
         return super().__call__(*args, **kwargs)
 
-    def _ensure_query_prompty_loaded(self):
-        """Switch to the query prompty file if not already loaded."""
+    def _load_flow(self, prompty_filename: str, **kwargs) -> AsyncPrompty:
+        """Load the Prompty flow from the specified file.
+        :param prompty_filename: The filename of the Prompty flow to load.
+        :type prompty_filename: str
+        :return: The loaded Prompty flow.
+        :rtype: AsyncPrompty
+        """
         current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_WITH_QUERY)
+        prompty_path = os.path.join(current_dir, prompty_filename)
 
         self._prompty_file = prompty_path
         prompty_model_config = construct_prompty_model_config(
@@ -219,6 +232,14 @@ def _ensure_query_prompty_loaded(self):
             UserAgentSingleton().value,
         )
         self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
+        flow = AsyncPrompty.load(
+            source=prompty_path,
+            model=prompty_model_config,
+            is_reasoning_model=self._is_reasoning_model,
+            **kwargs,
+        )
+
+        return flow
 
     def _has_context(self, eval_input: dict) -> bool:
         """
@@ -226,6 +247,17 @@ def _has_context(self, eval_input: dict) -> bool:
         Treats None, empty strings, empty lists, and lists of empty strings as no context.
         """
         context = eval_input.get("context", None)
+        return self._validate_context(context)
+
+    def _validate_context(self, context) -> bool:
+        """
+        Validate if the provided context is non-empty and meaningful.
+        Treats None, empty strings, empty lists, and lists of empty strings as no context.
+        :param context: The context to validate
+        :type context: Union[str, List, None]
+        :return: True if context is valid and non-empty, False otherwise
+        :rtype: bool
+        """
         if not context:
             return False
         if context == "<>":  # Special marker for no context
@@ -239,7 +271,7 @@ def _has_context(self, eval_input: dict) -> bool:
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
         if eval_input.get("query", None) is None:
-            return await super()._do_eval(eval_input)
+            return await super()._do_eval_with_flow(eval_input, self._flow_no_query)
 
         contains_context = self._has_context(eval_input)
 
@@ -254,7 +286,82 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
         }
 
         # Replace and call the parent method
-        return await super()._do_eval(simplified_eval_input)
+        return await super()._do_eval_with_flow(simplified_eval_input, self._flow_with_query)
+
+    async def _do_eval_with_flow(self, eval_input: Dict, flow: AsyncPrompty) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do a relevance evaluation.
+        NOTE: This is copy from parent with addition of flow parameter to allow choosing between two flows.
+        :param eval_input: The input to the evaluator. Expected to contain
+            whatever inputs are needed for the _flow method, including context
+            and other fields depending on the child class.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message="Only text conversation inputs are supported.",
+                internal_message="Only text conversation inputs are supported.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
+        # Call the prompty flow to get the evaluation result.
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+
+        score = math.nan
+        if prompty_output_dict:
+            llm_output = prompty_output_dict.get("llm_output", "")
+            input_token_count = prompty_output_dict.get("input_token_count", 0)
+            output_token_count = prompty_output_dict.get("output_token_count", 0)
+            total_token_count = prompty_output_dict.get("total_token_count", 0)
+            finish_reason = prompty_output_dict.get("finish_reason", "")
+            model_id = prompty_output_dict.get("model_id", "")
+            sample_input = prompty_output_dict.get("sample_input", "")
+            sample_output = prompty_output_dict.get("sample_output", "")
+            # Parse out score and reason from evaluators known to possess them.
+            if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
+                score, reason = parse_quality_evaluator_reason_score(llm_output)
+                binary_result = self._get_binary_result(score)
+                return {
+                    self._result_key: float(score),
+                    f"gpt_{self._result_key}": float(score),
+                    f"{self._result_key}_reason": reason,
+                    f"{self._result_key}_result": binary_result,
+                    f"{self._result_key}_threshold": self._threshold,
+                    f"{self._result_key}_prompt_tokens": input_token_count,
+                    f"{self._result_key}_completion_tokens": output_token_count,
+                    f"{self._result_key}_total_tokens": total_token_count,
+                    f"{self._result_key}_finish_reason": finish_reason,
+                    f"{self._result_key}_model": model_id,
+                    f"{self._result_key}_sample_input": sample_input,
+                    f"{self._result_key}_sample_output": sample_output,
+                }
+            match = re.search(r"\d", llm_output)
+            if match:
+                score = float(match.group())
+                binary_result = self._get_binary_result(score)
+                return {
+                    self._result_key: float(score),
+                    f"gpt_{self._result_key}": float(score),
+                    f"{self._result_key}_result": binary_result,
+                    f"{self._result_key}_threshold": self._threshold,
+                    f"{self._result_key}_prompt_tokens": input_token_count,
+                    f"{self._result_key}_completion_tokens": output_token_count,
+                    f"{self._result_key}_total_tokens": total_token_count,
+                    f"{self._result_key}_finish_reason": finish_reason,
+                    f"{self._result_key}_model": model_id,
+                    f"{self._result_key}_sample_input": sample_input,
+                    f"{self._result_key}_sample_output": sample_output,
+                }
+
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"gpt_{self._result_key}": float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
 
     async def _real_call(self, **kwargs):
         """The asynchronous call where real end-to-end evaluation logic is performed.
@@ -272,12 +379,20 @@ async def _real_call(self, **kwargs):
             return {
                 self._result_key: self._NOT_APPLICABLE_RESULT,
                 f"{self._result_key}_result": "pass",
-                f"{self._result_key}_threshold": self.threshold,
+                f"{self._result_key}_threshold": self._threshold,
                 f"{self._result_key}_reason": f"Supported tools were not called. Supported tools for groundedness are {self._SUPPORTED_TOOLS}.",
             }
         else:
             raise ex
 
+    def _is_single_entry(self, value):
+        """Determine if the input value represents a single entry, unsure is returned as False."""
+        if isinstance(value, str):
+            return True
+        if isinstance(value, list) and len(value) == 1:
+            return True
+        return False
+
     def _convert_kwargs_to_eval_input(self, **kwargs):
         if kwargs.get("context") or kwargs.get("conversation"):
             return super()._convert_kwargs_to_eval_input(**kwargs)
@@ -285,9 +400,6 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
         query = kwargs.get("query")
         response = kwargs.get("response")
         tool_definitions = kwargs.get("tool_definitions")
-        if query and self._prompty_file != self._PROMPTY_FILE_WITH_QUERY:
-            self._ensure_query_prompty_loaded()
-
         if (not query) or (not response):  # or not tool_definitions:
             msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query' and 'response' are required."
             raise EvaluationException(
@@ -298,7 +410,16 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
             )
         context = self._get_context_from_agent_response(response, tool_definitions)
-        filtered_response = self._filter_file_search_results(response)
+        if not self._validate_context(context) and self._is_single_entry(response) and self._is_single_entry(query):
+            msg = f"{type(self).__name__}: No valid context provided or could be extracted from the query or response."
+            raise EvaluationException(
+                message=msg,
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.NOT_APPLICABLE,
+                target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+            )
+
+        filtered_response = self._filter_file_search_results(response) if self._validate_context(context) else response
 
         return super()._convert_kwargs_to_eval_input(response=filtered_response, context=context, query=query)
 
     def _filter_file_search_results(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:

From 3eec964307d7fb10cfa7791893ea7c5b9ebc067a Mon Sep 17 00:00:00 2001
From: ahibrahim
Date: Mon, 10 Nov 2025 23:34:41 +0200
Subject: [PATCH 2/7] changes

---
 .../evaluation/_evaluators/_groundedness/_groundedness.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
index 87596f225614..5ae91a54b395 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
@@ -20,6 +20,7 @@
     construct_prompty_model_config,
     validate_model_config,
     simplify_messages,
+    parse_quality_evaluator_reason_score,
 )
 
 try:
@@ -271,7 +272,7 @@ def _validate_context(self, context) -> bool:
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
         if eval_input.get("query", None) is None:
-            return await super()._do_eval_with_flow(eval_input, self._flow_no_query)
+            return await self._do_eval_with_flow(eval_input, self._flow_no_query)
 
         contains_context = self._has_context(eval_input)
 
@@ -286,7 +287,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
         }
 
         # Replace and call the parent method
-        return await super()._do_eval_with_flow(simplified_eval_input, self._flow_with_query)
+        return await self._do_eval_with_flow(simplified_eval_input, self._flow_with_query)
 
     async def _do_eval_with_flow(self, eval_input: Dict, flow: AsyncPrompty) -> Dict[str, Union[float, str]]:  # type: ignore[override]
         """Do a relevance evaluation.
@@ -307,7 +308,7 @@ async def _do_eval_with_flow(self, eval_input: Dict, flow: AsyncPrompty) -> Dict
             target=ErrorTarget.CONVERSATION,
         )
         # Call the prompty flow to get the evaluation result.
-        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        prompty_output_dict = await flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
 
         score = math.nan
         if prompty_output_dict:

From 7a2a5e2064c94fc6578dc1b3711a61e20ba5f8db Mon Sep 17 00:00:00 2001
From: ahibrahim
Date: Mon, 10 Nov 2025 23:39:32 +0200
Subject: [PATCH 3/7] change

---
 .../ai/evaluation/_evaluators/_groundedness/_groundedness.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
index 5ae91a54b395..e99b30d5f4ed 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
@@ -226,13 +226,11 @@ def _load_flow(self, prompty_filename: str, **kwargs) -> AsyncPrompty:
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, prompty_filename)
 
-        self._prompty_file = prompty_path
         prompty_model_config = construct_prompty_model_config(
             validate_model_config(self._model_config),
             self._DEFAULT_OPEN_API_VERSION,
             UserAgentSingleton().value,
         )
-        self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
         flow = AsyncPrompty.load(
             source=prompty_path,
             model=prompty_model_config,

From 4388e4a9787d50c4557d4387f362fdb78ab05484 Mon Sep 17 00:00:00 2001
From: ahibrahim
Date: Mon, 10 Nov 2025 23:41:26 +0200
Subject: [PATCH 4/7] black

---
 .../evaluation/_evaluators/_groundedness/_groundedness.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
index e99b30d5f4ed..a03fb5aefb0c 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
@@ -123,12 +123,8 @@ def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
         self._flow = None
         self._prompty_file = None
 
-        self._flow_with_query = self._load_flow(
-            self._PROMPTY_FILE_WITH_QUERY, credential=credential
-        )
-        self._flow_no_query = self._load_flow(
-            self._PROMPTY_FILE_NO_QUERY, credential=credential
-        )
+        self._flow_with_query = self._load_flow(self._PROMPTY_FILE_WITH_QUERY, credential=credential)
+        self._flow_no_query = self._load_flow(self._PROMPTY_FILE_NO_QUERY, credential=credential)
 
     @overload
     def __call__(

From dc21981566066ccb582d16be6e1a78d8fed95a5c Mon Sep 17 00:00:00 2001
From: ahibrahim
Date: Mon, 10 Nov 2025 23:49:52 +0200
Subject: [PATCH 5/7] update flow attribute

---
 .../tests/unittests/test_built_in_evaluator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py
index 9bfbc85721eb..50e2e9ab2eb8 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py
@@ -217,7 +217,7 @@ def test_groundedness_evaluator_with_agent_response(self, mock_async_prompty, mo
     def test_groundedness_evaluator_with_context(self, mock_model_config):
         """Test GroundednessEvaluator with direct context (traditional use)"""
         groundedness_eval = GroundednessEvaluator(model_config=mock_model_config)
-        groundedness_eval._flow = MagicMock(return_value=quality_response_async_mock())
+        groundedness_eval._flow_no_query = MagicMock(return_value=quality_response_async_mock())
 
         result = groundedness_eval(
             response="The capital of Japan is Tokyo.",
@@ -231,7 +231,7 @@ def test_groundedness_evaluator_with_context(self, mock_model_config):
     def test_groundedness_evaluator_missing_required_inputs(self, mock_model_config):
         """Test GroundednessEvaluator with missing required inputs for agent response mode"""
         groundedness_eval = GroundednessEvaluator(model_config=mock_model_config)
-        groundedness_eval._flow = MagicMock(return_value=quality_response_async_mock())
+        groundedness_eval._flow_no_query = MagicMock(return_value=quality_response_async_mock())
 
         with pytest.raises(EvaluationException) as exc_info:
             groundedness_eval(

From 48d55fbb7a5e14de1e02d1afacba36bd87b2e055 Mon Sep 17 00:00:00 2001
From: ahibrahim
Date: Mon, 10 Nov 2025 23:54:12 +0200
Subject: [PATCH 6/7] update name

---
 .../ai/evaluation/_evaluators/_groundedness/_groundedness.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
index a03fb5aefb0c..c83eb1d44d3f 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
@@ -123,8 +123,8 @@ def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
         self._flow = None
         self._prompty_file = None
 
-        self._flow_with_query = self._load_flow(self._PROMPTY_FILE_WITH_QUERY, credential=credential)
-        self._flow_no_query = self._load_flow(self._PROMPTY_FILE_NO_QUERY, credential=credential)
+        self._flow_with_query = self._load_flow(self._PROMPTY_FILE_WITH_QUERY, token_credential=credential)
+        self._flow_no_query = self._load_flow(self._PROMPTY_FILE_NO_QUERY, token_credential=credential)
 
     @overload
     def __call__(

From 28adb97cddac50fba01f0a1d6fef7dc56d6daa4b Mon Sep 17 00:00:00 2001
From: ahibrahim
Date: Tue, 11 Nov 2025 00:02:53 +0200
Subject: [PATCH 7/7] lint and docs

---
 .../evaluation/_evaluators/_groundedness/_groundedness.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
index c83eb1d44d3f..b0afd920d6eb 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
@@ -284,12 +284,15 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
         return await self._do_eval_with_flow(simplified_eval_input, self._flow_with_query)
 
     async def _do_eval_with_flow(self, eval_input: Dict, flow: AsyncPrompty) -> Dict[str, Union[float, str]]:  # type: ignore[override]
-        """Do a relevance evaluation.
+        """Do an evaluation.
+
         NOTE: This is copy from parent with addition of flow parameter to allow choosing between two flows.
         :param eval_input: The input to the evaluator. Expected to contain
-            whatever inputs are needed for the _flow method, including context
+            whatever inputs are needed for the flow method, including context
             and other fields depending on the child class.
         :type eval_input: Dict
+        :param flow: The AsyncPrompty flow to use for evaluation.
+        :type flow: AsyncPrompty
         :return: The evaluation result.
         :rtype: Dict
         """
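
Reviewer note (not part of the patches): the sketch below is one way to exercise both prompty flows this series introduces, assuming the public GroundednessEvaluator call signature is unchanged. The endpoint, deployment, and key values are placeholders, not real credentials.

# Illustrative sketch only: exercises the no-query and with-query code paths.
from azure.ai.evaluation import GroundednessEvaluator

model_config = {
    "azure_endpoint": "https://<your-endpoint>.openai.azure.com",  # placeholder
    "azure_deployment": "<your-deployment>",  # placeholder
    "api_key": "<your-api-key>",  # placeholder
}

evaluator = GroundednessEvaluator(model_config=model_config, threshold=3)

# Without a query, _do_eval is expected to route to the preloaded no-query flow.
result_no_query = evaluator(
    response="The capital of Japan is Tokyo.",
    context="Tokyo is Japan's capital city.",
)

# With a query, _do_eval is expected to route to the preloaded with-query flow.
result_with_query = evaluator(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
    context="Tokyo is Japan's capital city.",
)

print(result_no_query, result_with_query)

Under the new wiring, both calls go through _do_eval_with_flow with the corresponding preloaded flow, rather than reloading a prompty file on each call as the replaced _ensure_query_prompty_loaded logic did.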