
Commit a4499da

feat: Pass in metric version and name to experiments
1 parent c1c9dc9 commit a4499da

File tree

6 files changed: +278 -21 lines

src/galileo/experiments.py
src/galileo/schema/metrics.py
src/galileo/scorers.py
tests/schemas/test_metrics.py
tests/test_experiments.py
tests/test_scorers.py

src/galileo/experiments.py

Lines changed: 35 additions & 19 deletions
@@ -18,7 +18,7 @@
 )
 from galileo.resources.models import ExperimentResponse, HTTPValidationError, PromptRunSettings, ScorerConfig, TaskType
 from galileo.schema.datasets import DatasetRecord
-from galileo.schema.metrics import LocalMetricConfig
+from galileo.schema.metrics import LocalMetricConfig, Metric
 from galileo.scorers import Scorers, ScorerSettings
 from galileo.utils.datasets import load_dataset_and_records
 

@@ -88,27 +88,43 @@ def list(self, project_id: str) -> Optional[Union[HTTPValidationError, list["Exp
 
     @staticmethod
     def create_metric_configs(
-        project_id: str, experiment_id: str, metrics: builtins.list[Union[str, LocalMetricConfig]]
+        project_id: str, experiment_id: str, metrics: builtins.list[Union[str, LocalMetricConfig, Metric]]
     ) -> tuple[builtins.list[ScorerConfig], builtins.list[LocalMetricConfig]]:
         scorers = []
-        scorer_names = [metric for metric in metrics if isinstance(metric, str)]
-        if scorer_names:
-            all_scorers = Scorers().list()
-            known_metrics = {metric.name: metric for metric in all_scorers}
-            unknown_metrics = []
-            for metric in scorer_names:
-                if metric in known_metrics:
-                    scorers.append(ScorerConfig.from_dict(known_metrics[metric].to_dict()))
+
+        local_metric_configs: builtins.list[LocalMetricConfig] = []
+
+        all_scorers = Scorers().list()
+        known_metrics = {metric.name: metric for metric in all_scorers}
+
+        unknown_metrics = []
+
+        for metric in metrics:
+            if isinstance(metric, LocalMetricConfig):
+                local_metric_configs.append(metric)
+                continue
+            else:
+                name = metric.name if isinstance(metric, Metric) else metric
+                version = metric.version if isinstance(metric, Metric) else None
+
+                if name in known_metrics:
+                    raw_metric_dict = known_metrics[name].to_dict()
+
+                    # Set the version on the ScorerConfig if provided
+                    if version is not None:
+                        raw_version = Scorers().get_scorer_version(scorer_id=raw_metric_dict["id"], version=version)
+                        raw_metric_dict["scorer_version"] = raw_version.to_dict()
+                    scorers.append(ScorerConfig.from_dict(raw_metric_dict))
                 else:
-                    unknown_metrics.append(metric)
-            if unknown_metrics:
-                raise ValueError(
-                    "One or more non-existent metrics are specified:"
-                    + ", ".join(f"'{metric}'" for metric in unknown_metrics)
-                )
-        ScorerSettings().create(project_id=project_id, run_id=experiment_id, scorers=scorers)
+                    unknown_metrics.append(name)
+
+        if unknown_metrics:
+            raise ValueError(
+                "One or more non-existent metrics are specified:"
+                + ", ".join(f"'{metric}'" for metric in unknown_metrics)
+            )
 
-        local_metric_configs = [metric for metric in metrics if isinstance(metric, LocalMetricConfig)]
+        ScorerSettings().create(project_id=project_id, run_id=experiment_id, scorers=scorers)
 
         return scorers, local_metric_configs
 

@@ -212,7 +228,7 @@ def run_experiment(
     dataset: Optional[Union[Dataset, list[dict[str, str]], str]] = None,
     dataset_id: Optional[str] = None,
     dataset_name: Optional[str] = None,
-    metrics: Optional[list[Union[str, LocalMetricConfig]]] = None,
+    metrics: Optional[list[Union[str, LocalMetricConfig, Metric]]] = None,
     function: Optional[Callable] = None,
 ) -> Any:
     """

src/galileo/schema/metrics.py

Lines changed: 22 additions & 1 deletion
@@ -1,12 +1,13 @@
 from typing import Callable, Generic, Optional, TypeVar, Union
 
-from pydantic import BaseModel, Field, ValidationError, field_validator
+from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
 from pydantic_core.core_schema import ValidationInfo
 
 from galileo_core.schemas.logging.span import Span
 from galileo_core.schemas.logging.step import StepType
 from galileo_core.schemas.logging.trace import Trace
 from galileo_core.schemas.shared.metric import MetricValueType
+from galileo_core.schemas.shared.scorers.scorer_name import ScorerName
 
 MetricType = TypeVar("MetricType", bound=MetricValueType)
 

@@ -33,3 +34,23 @@ def set_aggregatable_types(cls, value: list[StepType], info: ValidationInfo) ->
             if step_type not in [StepType.workflow, StepType.trace]:
                 raise ValidationError("aggregatable_types can only contain trace or workflow steps")
         return value
+
+
+class Metric(BaseModel):
+    name: Union[str] = Field(
+        description="The name of the metric you want to run a specific version of (ie: 'Sentence Density')."
+    )
+    version: int | None = Field(
+        default=None,
+        description="The version of the metric (ie: 1, 2, 3, etc.). If None is provided, the 'default' version will be used.",
+    )
+
+    @model_validator(mode="after")
+    def validate_name_and_version(self) -> "Metric":
+        preset_metric_names = [scorer.value for scorer in ScorerName]
+        if self.name in preset_metric_names:
+            if self.version is not None:
+                raise ValueError(
+                    f"Galileo metric's '{self.name}' do not support versioning at this time. Please use the default version."
+                )
+        return self
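
In short, the validator only allows version to be set for custom metrics; preset Galileo metric names (the ScorerName values) must use the default version. A brief sketch of the intended behaviour, using an illustrative custom-metric name:

from galileo.schema.metrics import Metric

Metric(name="my_custom_metric")             # custom metric, default version
Metric(name="my_custom_metric", version=3)  # custom metric, pinned to version 3

# A preset name from ScorerName combined with an explicit version fails model
# validation and surfaces the "do not support versioning at this time" error above.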

src/galileo/scorers.py

Lines changed: 21 additions & 1 deletion
@@ -1,7 +1,11 @@
 from typing import Optional, Union
+from uuid import UUID
 
 from galileo.base import BaseClientModel
-from galileo.resources.api.data import list_scorers_with_filters_scorers_list_post
+from galileo.resources.api.data import (
+    get_scorer_version_or_latest_scorers_scorer_id_version_get,
+    list_scorers_with_filters_scorers_list_post,
+)
 from galileo.resources.api.run_scorer_settings import (
     upsert_scorers_config_projects_project_id_runs_run_id_scorer_settings_post,
 )

@@ -14,6 +18,7 @@
     ScorerTypeFilterOperator,
     ScorerTypes,
 )
+from galileo.resources.models.base_scorer_version_response import BaseScorerVersionResponse
 from galileo.resources.models.run_scorer_settings_patch_request import RunScorerSettingsPatchRequest
 from galileo.resources.models.run_scorer_settings_response import RunScorerSettingsResponse
 from galileo.resources.types import Unset

@@ -33,6 +38,21 @@ def list(self, types: list[ScorerTypes] = None) -> Union[Unset, list[ScorerRespo
         result = list_scorers_with_filters_scorers_list_post.sync(client=self.client, body=body)
         return result.scorers
 
+    def get_scorer_version(self, scorer_id: UUID, version: int) -> Union[Unset, BaseScorerVersionResponse]:
+        """
+        Args:
+            name: str
+                Name of the scorer
+            version: int
+                Version of the scorer.
+        Returns:
+            Scorer response if found, otherwise None
+        """
+        result = get_scorer_version_or_latest_scorers_scorer_id_version_get.sync(
+            scorer_id=scorer_id, version=version, client=self.client
+        )
+        return result
+
 
 class ScorerSettings(BaseClientModel):
     def create(
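
The new Scorers.get_scorer_version helper wraps the generated get_scorer_version_or_latest_scorers_scorer_id_version_get endpoint. A small sketch of calling it directly, with a placeholder scorer UUID (in practice the id comes from a ScorerResponse, as in experiments.py above):

from uuid import UUID

from galileo.scorers import Scorers

scorer_version = Scorers().get_scorer_version(
    scorer_id=UUID("00000000-0000-0000-0000-000000000000"),  # placeholder id
    version=2,
)
# BaseScorerVersionResponse supports to_dict(), which experiments.py uses to
# attach the version to the ScorerConfig.
print(scorer_version.to_dict() if scorer_version else "version not found")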

tests/schemas/test_metrics.py

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+import pytest
+from pydantic import ValidationError
+
+from galileo.schema.metrics import Metric
+from galileo_core.schemas.shared.scorers.scorer_name import ScorerName
+
+
+def test_metric_validator_preset_with_version():
+    """Test that creating a Metric with a preset name and version raises a ValidationError"""
+    # Get a valid value from the ScorerName enum
+    # First, get all the available enum values
+    preset_names = [scorer.value for scorer in ScorerName]
+    # Make sure there's at least one value
+    assert preset_names, "No values found in ScorerName enum"
+    preset_name = preset_names[0]
+
+    # Attempt to create a Metric with a preset name and a version
+    with pytest.raises(ValidationError) as exc_info:
+        Metric(name=preset_name, version=1)
+
+    # Verify the error message
+    assert f"Galileo metric's '{preset_name}' do not support versioning at this time" in str(exc_info.value)
+
+
+def test_metric_validator_preset_no_version():
+    """Test that creating a Metric with a preset name and no version is valid"""
+    # Get a valid value from the ScorerName enum
+    preset_names = [scorer.value for scorer in ScorerName]
+    assert preset_names, "No values found in ScorerName enum"
+    preset_name = preset_names[0]
+
+    # Create a Metric with a preset name and no version
+    metric = Metric(name=preset_name)
+
+    # Verify the metric is created correctly
+    assert metric.name == preset_name
+    assert metric.version is None
+
+
+def test_metric_validator_custom_with_version():
+    """Test that creating a Metric with a custom name and version is valid"""
+    # Create a Metric with a custom name and a version
+    metric = Metric(name="my_custom_metric", version=2)
+
+    # Verify the metric is created correctly
+    assert metric.name == "my_custom_metric"
+    assert metric.version == 2
+
+
+def test_metric_validator_custom_no_version():
+    """Test that creating a Metric with a custom name and no version is valid"""
+    # Create a Metric with a custom name and no version
+    metric = Metric(name="my_custom_metric")
+
+    # Verify the metric is created correctly
+    assert metric.name == "my_custom_metric"
+    assert metric.version is None

tests/test_experiments.py

Lines changed: 55 additions & 0 deletions
@@ -652,3 +652,58 @@ def test_create_scorer_configs(self, mock_scorer_settings, mock_scorers):
         # Test unknown metrics
         with pytest.raises(ValueError):
             Experiments.create_metric_configs("project_id", "experiment_id", ["unknown_metric"])
+
+    @patch("galileo.experiments.Scorers")
+    @patch("galileo.experiments.ScorerSettings")
+    def test_create_scorer_configs_with_metric_objects(self, mock_scorer_settings, mock_scorers):
+        # Setup mock return values
+        mock_scorers_instance = mock_scorers.return_value
+
+        # Create mock scorer responses
+        mock_scorers = [
+            ScorerResponse.from_dict({"id": "1", "name": "metric1", "scorer_type": "preset", "tags": ["test"]}),
+            ScorerResponse.from_dict({"id": "2", "name": "metric2", "scorer_type": "preset", "tags": ["test"]}),
+            ScorerResponse.from_dict({"id": "3", "name": "versionable_metric", "scorer_type": "llm", "tags": ["test"]}),
+        ]
+
+        mock_scorers_instance.list.return_value = mock_scorers
+
+        # Mock the get_scorer_version method
+        mock_version_response = MagicMock()
+        mock_version_response.to_dict.return_value = {"id": "version1", "version": 2}
+        mock_scorers_instance.get_scorer_version.return_value = mock_version_response
+
+        from galileo.schema.metrics import Metric
+
+        # Test with Metric objects (without version)
+        metric1 = Metric(name="metric1")
+        metric2 = Metric(name="metric2")
+
+        scorers, local_scorers = Experiments.create_metric_configs("project_id", "experiment_id", [metric1, metric2])
+
+        assert len(scorers) == 2  # Should return two valid scorers
+        assert len(local_scorers) == 0  # No local scorers
+
+        # Verify get_scorer_version was not called (since no version was specified)
+        mock_scorers_instance.get_scorer_version.assert_not_called()
+
+        # Test with a Metric object with version
+        versionable_metric = Metric(name="versionable_metric", version=2)
+
+        scorers, local_scorers = Experiments.create_metric_configs("project_id", "experiment_id", [versionable_metric])
+
+        assert len(scorers) == 1  # Should return one valid scorer
+        assert len(local_scorers) == 0  # No local scorers
+
+        # Verify get_scorer_version was called with the correct parameters
+        mock_scorers_instance.get_scorer_version.assert_called_once_with(scorer_id="3", version=2)
+
+        # Test mixed input types
+        local_metric = LocalMetricConfig(name="length", scorer_fn=lambda x: len(x))
+
+        scorers, local_scorers = Experiments.create_metric_configs(
+            "project_id", "experiment_id", ["metric1", local_metric, Metric(name="metric2")]
+        )
+
+        assert len(scorers) == 2  # Should return two valid scorers
+        assert len(local_scorers) == 1  # One local scorer

tests/test_scorers.py

Lines changed: 88 additions & 0 deletions
@@ -1,5 +1,8 @@
+import uuid
 from unittest.mock import ANY, Mock, patch
 
+from src.galileo.resources.models.base_scorer_version_response import BaseScorerVersionResponse
+
 from galileo.resources.models import (
     ListScorersRequest,
     ListScorersResponse,

@@ -67,3 +70,88 @@ def test_list_all_scorers_preset_filter(list_scorers_mock: Mock):
             filters=[ScorerTypeFilter(operator=ScorerTypeFilterOperator.EQ, value=ScorerTypes.LLM)]
         ),
     )
+
+
+def create_mock_version_response():
+    return BaseScorerVersionResponse.from_dict(
+        {
+            "id": "b8933a6d-7a65-4ce3-bfe4-b863109a0425",
+            "version": 2,
+            "model_name": "GPT-4o",
+            "num_judges": 3,
+            "created_at": "2025-03-28T18:54:02.848267+00:00",
+            "updated_at": "2025-03-28T18:54:02.848269+00:00",
+            "generated_scorer": {
+                "id": "c7933a6d-7a65-4ce3-bfe4-b863109a0499",
+                "name": "test_generated_scorer",
+                "instructions": "Evaluate the response quality",
+                "chain_poll_template": {
+                    "name": "quality_check",
+                    "prompt": "Rate the quality on a scale of 1-10",
+                    "template_type": "standard",
+                },
+            },
+            "registered_scorer": None,
+        }
+    )
+
+
+class MockHTTPError(Exception):
+    def __init__(self, status_code):
+        self.status_code = status_code
+        super().__init__(f"HTTP Error: {status_code}")
+
+
+@patch("galileo.scorers.get_scorer_version_or_latest_scorers_scorer_id_version_get")
+def test_get_scorer_version_success(get_scorer_version_mock: Mock):
+    # Setup
+    mock_response = create_mock_version_response()
+    get_scorer_version_mock.sync.return_value = mock_response
+    scorer_id = uuid.UUID("b8933a6d-7a65-4ce3-bfe4-b863109a0425")
+    version = 2
+
+    # Execute
+    result = Scorers().get_scorer_version(scorer_id=scorer_id, version=version)
+
+    # Verify
+    assert result == mock_response
+    get_scorer_version_mock.sync.assert_called_once_with(scorer_id=scorer_id, version=version, client=ANY)
+    assert result.id == "b8933a6d-7a65-4ce3-bfe4-b863109a0425"
+    assert result.version == 2
+    # Access properties from additional_properties instead
+    assert result.additional_properties["model_name"] == "GPT-4o"
+    assert result.additional_properties["num_judges"] == 3
+    # Access generated_scorer as a dictionary
+    assert result.generated_scorer["name"] == "test_generated_scorer"
+    assert result.registered_scorer is None
+
+
+@patch("galileo.scorers.get_scorer_version_or_latest_scorers_scorer_id_version_get")
+def test_get_scorer_version_not_found(get_scorer_version_mock: Mock):
+    # Setup
+    get_scorer_version_mock.sync.side_effect = MockHTTPError(status_code=404)
+    scorer_id = uuid.UUID("b8933a6d-7a65-4ce3-bfe4-b863109a0425")
+    version = 2
+
+    # Execute
+    result = Scorers().get_scorer_version(scorer_id=scorer_id, version=version)
+
+    # Verify
+    assert result is None
+    get_scorer_version_mock.sync.assert_called_once_with(scorer_id=scorer_id, version=version, client=ANY)
+
+
+@patch("galileo.scorers.get_scorer_version_or_latest_scorers_scorer_id_version_get")
+def test_get_scorer_version_other_error(get_scorer_version_mock: Mock):
+    # Setup
+    error = MockHTTPError(status_code=500)
+    get_scorer_version_mock.sync.side_effect = error
+    scorer_id = uuid.UUID("b8933a6d-7a65-4ce3-bfe4-b863109a0425")
+    version = 2
+
+    # Execute
+    result = Scorers().get_scorer_version(scorer_id=scorer_id, version=version)
+
+    # Verify - it seems the implementation catches all exceptions, not just 404s
+    assert result is None
+    get_scorer_version_mock.sync.assert_called_once_with(scorer_id=scorer_id, version=version, client=ANY)
