
Commit e5cc856

Merge branch 'main' into scorers-enum
2 parents 48644db + 086f7d1

File tree: 9 files changed, +271 -30 lines changed

poetry.lock

Lines changed: 4 additions & 7 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 3 additions & 3 deletions
@@ -1,6 +1,6 @@
 [project]
 name = "galileo"
-version = "1.1.0"
+version = "1.2.0"
 description = "Client library for the Galileo platform."
 authors = [{ name = "Galileo Technologies Inc.", email = "team@galileo.ai" }]
 readme = "README.md"
@@ -18,7 +18,7 @@ langchain-core = { version = "^0.3.61", optional = true }
 openai = { version = "<1.84.0", optional = true }
 openai-agents = { version = "<0.0.13", optional = true }

-galileo-core = "~=3.42.0"
+galileo-core = "~=3.44.0"

 [tool.poetry.group.test.dependencies]
 pytest = "^8.4.0"
@@ -28,7 +28,7 @@ pytest-xdist = "^3.7.0"
 pytest-socket = "^0.7"
 pytest-asyncio = "^0.26.0"
 requests-mock = "^1.11.0"
-galileo-core = { extras = ["testing"], version = "~=3.42.0" }
+galileo-core = { extras = ["testing"], version = "~=3.44.0" }

 pytest-env = "^1.1.5"
 langchain-core = "^0.3.61"

src/galileo/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -14,4 +14,4 @@
 from galileo_core.schemas.logging.step import StepType
 from galileo_core.schemas.logging.trace import Trace

-__version__ = "1.1.0"
+__version__ = "1.2.0"
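
A quick way to confirm the bump once this commit is installed (a minimal sketch; the expected string comes from the diff above):

import galileo

print(galileo.__version__)  # expected to print "1.2.0" after this release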

src/galileo/experiments.py

Lines changed: 32 additions & 15 deletions
@@ -18,7 +18,7 @@
 )
 from galileo.resources.models import ExperimentResponse, HTTPValidationError, PromptRunSettings, ScorerConfig, TaskType
 from galileo.schema.datasets import DatasetRecord
-from galileo.schema.metrics import GalileoScorers, LocalMetricConfig
+from galileo.schema.metrics import GalileoScorers, LocalMetricConfig, Metric
 from galileo.scorers import Scorers, ScorerSettings
 from galileo.utils.datasets import load_dataset_and_records

@@ -88,21 +88,40 @@ def list(self, project_id: str) -> Optional[Union[HTTPValidationError, list["Exp

     @staticmethod
     def create_metric_configs(
-        project_id: str, experiment_id: str, metrics: builtins.list[Union[GalileoScorers, LocalMetricConfig, str]]
+        project_id: str,
+        experiment_id: str,
+        metrics: builtins.list[Union[GalileoScorers, Metric, LocalMetricConfig, str]],
     ) -> tuple[builtins.list[ScorerConfig], builtins.list[LocalMetricConfig]]:
-        scorers = []
-        scorer_names = [
-            metric.value if isinstance(metric, GalileoScorers) else metric
-            for metric in metrics
-            if isinstance(metric, GalileoScorers) or isinstance(metric, str)
-        ]
-        if scorer_names:
+        local_metric_configs: list[LocalMetricConfig] = []
+        scorer_name_versions: list[tuple[str, Optional[int]]] = []
+        for metric in metrics:
+            if isinstance(metric, GalileoScorers):
+                scorer_name_versions.append((metric.value, None))
+            elif isinstance(metric, Metric):
+                scorer_name_versions.append((metric.name, metric.version))
+            elif isinstance(metric, LocalMetricConfig):
+                local_metric_configs.append(metric)
+            elif isinstance(metric, str):
+                scorer_name_versions.append((metric, None))
+            else:
+                raise ValueError(f"Unknown metric type: {type(metric)}")
+
+        scorers: list[ScorerConfig] = []
+        if scorer_name_versions:
             all_scorers = Scorers().list()
             known_metrics = {metric.name: metric for metric in all_scorers}
             unknown_metrics = []
-            for metric in scorer_names:
-                if metric in known_metrics:
-                    scorers.append(ScorerConfig.from_dict(known_metrics[metric].to_dict()))
+            for scorer_name, scorer_version in scorer_name_versions:
+                if scorer_name in known_metrics:
+                    raw_metric_dict = known_metrics[scorer_name].to_dict()
+
+                    # Set the version on the ScorerConfig if provided
+                    if scorer_version is not None:
+                        raw_version = Scorers().get_scorer_version(
+                            scorer_id=raw_metric_dict["id"], version=scorer_version
+                        )
+                        raw_metric_dict["scorer_version"] = raw_version.to_dict()
+                    scorers.append(ScorerConfig.from_dict(raw_metric_dict))
                 else:
                     unknown_metrics.append(metric)
             if unknown_metrics:
@@ -112,8 +131,6 @@ def create_metric_configs(
                 )
             ScorerSettings().create(project_id=project_id, run_id=experiment_id, scorers=scorers)

-        local_metric_configs = [metric for metric in metrics if isinstance(metric, LocalMetricConfig)]
-
         return scorers, local_metric_configs

     def run(
@@ -216,7 +233,7 @@ def run_experiment(
     dataset: Optional[Union[Dataset, list[dict[str, str]], str]] = None,
     dataset_id: Optional[str] = None,
     dataset_name: Optional[str] = None,
-    metrics: Optional[list[Union[GalileoScorers, LocalMetricConfig, str]]] = None,
+    metrics: Optional[list[Union[GalileoScorers, Metric, LocalMetricConfig, str]]] = None,
     function: Optional[Callable] = None,
 ) -> Any:
     """

src/galileo/schema/metrics.py

Lines changed: 21 additions & 3 deletions
@@ -1,14 +1,12 @@
 from typing import Callable, Generic, Optional, TypeVar, Union

-from pydantic import BaseModel, Field, ValidationError, field_validator
+from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
 from pydantic_core.core_schema import ValidationInfo

 from galileo_core.schemas.logging.span import Span
 from galileo_core.schemas.logging.step import StepType
 from galileo_core.schemas.logging.trace import Trace
 from galileo_core.schemas.shared.metric import MetricValueType
-
-# ruff: noqa: F401
 from galileo_core.schemas.shared.scorers.scorer_name import ScorerName as GalileoScorers

 MetricType = TypeVar("MetricType", bound=MetricValueType)
@@ -36,3 +34,23 @@ def set_aggregatable_types(cls, value: list[StepType], info: ValidationInfo) ->
         if step_type not in [StepType.workflow, StepType.trace]:
             raise ValidationError("aggregatable_types can only contain trace or workflow steps")
         return value
+
+
+class Metric(BaseModel):
+    name: str = Field(
+        description="The name of the metric you want to run a specific version of (ie: 'Sentence Density')."
+    )
+    version: Optional[int] = Field(
+        default=None,
+        description="The version of the metric (ie: 1, 2, 3, etc.). If None is provided, the 'default' version will be used.",
+    )
+
+    @model_validator(mode="after")
+    def validate_name_and_version(self) -> "Metric":
+        preset_metric_names = [scorer.value for scorer in GalileoScorers]
+        if self.name in preset_metric_names:
+            if self.version is not None:
+                raise ValueError(
+                    f"Galileo metric's '{self.name}' do not support versioning at this time. Please use the default version."
+                )
+        return self
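
The new validator only gates preset names: a custom metric may pin a version, while a preset must leave version unset. A short sketch of the three cases (the custom metric name is illustrative):

from pydantic import ValidationError

from galileo.schema.metrics import GalileoScorers, Metric

Metric(name="my_custom_metric")             # ok: default version of a custom scorer
Metric(name="my_custom_metric", version=3)  # ok: pinned version of a custom scorer

preset_name = list(GalileoScorers)[0].value  # any preset scorer name
try:
    Metric(name=preset_name, version=1)      # rejected: presets do not support versioning yet
except ValidationError as exc:
    print(exc)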

src/galileo/scorers.py

Lines changed: 21 additions & 1 deletion
@@ -1,7 +1,11 @@
 from typing import Optional, Union
+from uuid import UUID

 from galileo.base import BaseClientModel
-from galileo.resources.api.data import list_scorers_with_filters_scorers_list_post
+from galileo.resources.api.data import (
+    get_scorer_version_or_latest_scorers_scorer_id_version_get,
+    list_scorers_with_filters_scorers_list_post,
+)
 from galileo.resources.api.run_scorer_settings import (
     upsert_scorers_config_projects_project_id_runs_run_id_scorer_settings_post,
 )
@@ -14,6 +18,7 @@
     ScorerTypeFilterOperator,
     ScorerTypes,
 )
+from galileo.resources.models.base_scorer_version_response import BaseScorerVersionResponse
 from galileo.resources.models.run_scorer_settings_patch_request import RunScorerSettingsPatchRequest
 from galileo.resources.models.run_scorer_settings_response import RunScorerSettingsResponse
 from galileo.resources.types import Unset
@@ -33,6 +38,21 @@ def list(self, types: list[ScorerTypes] = None) -> Union[Unset, list[ScorerRespo
         result = list_scorers_with_filters_scorers_list_post.sync(client=self.client, body=body)
         return result.scorers

+    def get_scorer_version(self, scorer_id: UUID, version: int) -> Union[Unset, BaseScorerVersionResponse]:
+        """
+        Args:
+            scorer_id: UUID
+                ID of the scorer.
+            version: int
+                Version of the scorer.
+        Returns:
+            Scorer version response if found, otherwise None
+        """
+        result = get_scorer_version_or_latest_scorers_scorer_id_version_get.sync(
+            scorer_id=scorer_id, version=version, client=self.client
+        )
+        return result
+

 class ScorerSettings(BaseClientModel):
     def create(
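
create_metric_configs calls this helper with the id pulled from Scorers().list() whenever a Metric carries a version; it can also be used directly. A minimal sketch, assuming a custom scorer with this name and version exists (both are illustrative):

from galileo.scorers import Scorers

scorers_client = Scorers()
target = next(s for s in scorers_client.list() if s.name == "my_custom_metric")  # hypothetical scorer

version_info = scorers_client.get_scorer_version(scorer_id=target.id, version=2)
print(version_info.to_dict())  # this dict is what create_metric_configs stores under "scorer_version"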

tests/schemas/test_metrics.py

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+import pytest
+from pydantic import ValidationError
+
+from galileo.schema.metrics import Metric
+from galileo_core.schemas.shared.scorers.scorer_name import ScorerName
+
+
+def test_metric_validator_preset_with_version():
+    """Test that creating a Metric with a preset name and version raises a ValidationError"""
+    # Get a valid value from the ScorerName enum
+    # First, get all the available enum values
+    preset_names = [scorer.value for scorer in ScorerName]
+    # Make sure there's at least one value
+    assert preset_names, "No values found in ScorerName enum"
+    preset_name = preset_names[0]
+
+    # Attempt to create a Metric with a preset name and a version
+    with pytest.raises(ValidationError) as exc_info:
+        Metric(name=preset_name, version=1)
+
+    # Verify the error message
+    assert f"Galileo metric's '{preset_name}' do not support versioning at this time" in str(exc_info.value)
+
+
+def test_metric_validator_preset_no_version():
+    """Test that creating a Metric with a preset name and no version is valid"""
+    # Get a valid value from the ScorerName enum
+    preset_names = [scorer.value for scorer in ScorerName]
+    assert preset_names, "No values found in ScorerName enum"
+    preset_name = preset_names[0]
+
+    # Create a Metric with a preset name and no version
+    metric = Metric(name=preset_name)
+
+    # Verify the metric is created correctly
+    assert metric.name == preset_name
+    assert metric.version is None
+
+
+def test_metric_validator_custom_with_version():
+    """Test that creating a Metric with a custom name and version is valid"""
+    # Create a Metric with a custom name and a version
+    metric = Metric(name="my_custom_metric", version=2)
+
+    # Verify the metric is created correctly
+    assert metric.name == "my_custom_metric"
+    assert metric.version == 2
+
+
+def test_metric_validator_custom_no_version():
+    """Test that creating a Metric with a custom name and no version is valid"""
+    # Create a Metric with a custom name and no version
+    metric = Metric(name="my_custom_metric")
+
+    # Verify the metric is created correctly
+    assert metric.name == "my_custom_metric"
+    assert metric.version is None

tests/test_experiments.py

Lines changed: 75 additions & 0 deletions
@@ -120,6 +120,14 @@ def complex_trace_function(input):
     return output


+def mock_scorer_version_response():
+    mock_response = MagicMock()
+    mock_response.id = "mock_scorer_version_id"
+    mock_response.version = 1
+    mock_response.to_dict.return_value = {"id": "mock_scorer_version_id", "version": 1}
+    return mock_response
+
+
 class TestExperiments:
     @patch("galileo.experiments.create_experiment_projects_project_id_experiments_post")
     def test_create(self, galileo_resources_api_create_experiment: Mock):
@@ -270,6 +278,9 @@ def test_run_experiment_without_metrics(
     @patch.object(galileo.experiments.Experiments, "create", return_value=experiment_response())
     @patch.object(galileo.experiments.Experiments, "get", return_value=experiment_response())
     @patch.object(galileo.experiments.Projects, "get", return_value=project())
+    @patch.object(galileo.experiments.Scorers, "list", return_value=scorers())
+    @patch.object(galileo.experiments.Scorers, "get_scorer_version", return_value=mock_scorer_version_response())
+    @patch.object(galileo.experiments.ScorerSettings, "create")
     @pytest.mark.parametrize("thread_pool", [True, False])
     @pytest.mark.parametrize(
         ["function", "metrics", "num_spans", "span_type", "results", "aggregate_results"],
@@ -332,6 +343,9 @@
     )
     def test_run_experiment_with_func(
         self,
+        mock_scorer_settings_create: Mock,
+        mock_get_scorer_version: Mock,
+        mock_scorers_list: Mock,
        mock_get_project: Mock,
        mock_get_experiment: Mock,
        mock_create_experiment: Mock,
@@ -603,8 +617,14 @@ def test_run_experiment_with_prompt_template_and_local_dataset(self, local_datas
     @patch.object(galileo.experiments.Experiments, "create", return_value=experiment_response())
     @patch.object(galileo.experiments.Experiments, "get", return_value=experiment_response())
     @patch.object(galileo.experiments.Projects, "get", return_value=project())
+    @patch.object(galileo.experiments.Scorers, "list", return_value=scorers())
+    @patch.object(galileo.experiments.Scorers, "get_scorer_version", return_value=mock_scorer_version_response())
+    @patch.object(galileo.experiments.ScorerSettings, "create", return_value=None)
     def test_run_experiment_with_local_scorers_and_prompt_template(
         self,
+        mock_scorer_settings_create: Mock,
+        mock_get_scorer_version: Mock,
+        mock_scorers_list: Mock,
        mock_get_project: Mock,
        mock_get_experiment: Mock,
        mock_create_experiment: Mock,
@@ -652,3 +672,58 @@ def test_create_scorer_configs(self, mock_scorer_settings, mock_scorers):
         # Test unknown metrics
         with pytest.raises(ValueError):
             Experiments.create_metric_configs("project_id", "experiment_id", ["unknown_metric"])
+
+    @patch("galileo.experiments.Scorers")
+    @patch("galileo.experiments.ScorerSettings")
+    def test_create_scorer_configs_with_metric_objects(self, mock_scorer_settings, mock_scorers):
+        # Setup mock return values
+        mock_scorers_instance = mock_scorers.return_value
+
+        # Create mock scorer responses
+        mock_scorers = [
+            ScorerResponse.from_dict({"id": "1", "name": "metric1", "scorer_type": "preset", "tags": ["test"]}),
+            ScorerResponse.from_dict({"id": "2", "name": "metric2", "scorer_type": "preset", "tags": ["test"]}),
+            ScorerResponse.from_dict({"id": "3", "name": "versionable_metric", "scorer_type": "llm", "tags": ["test"]}),
+        ]
+
+        mock_scorers_instance.list.return_value = mock_scorers
+
+        # Mock the get_scorer_version method
+        mock_version_response = MagicMock()
+        mock_version_response.to_dict.return_value = {"id": "version1", "version": 2}
+        mock_scorers_instance.get_scorer_version.return_value = mock_version_response
+
+        from galileo.schema.metrics import Metric
+
+        # Test with Metric objects (without version)
+        metric1 = Metric(name="metric1")
+        metric2 = Metric(name="metric2")
+
+        scorers, local_scorers = Experiments.create_metric_configs("project_id", "experiment_id", [metric1, metric2])
+
+        assert len(scorers) == 2  # Should return two valid scorers
+        assert len(local_scorers) == 0  # No local scorers
+
+        # Verify get_scorer_version was not called (since no version was specified)
+        mock_scorers_instance.get_scorer_version.assert_not_called()
+
+        # Test with a Metric object with version
+        versionable_metric = Metric(name="versionable_metric", version=2)
+
+        scorers, local_scorers = Experiments.create_metric_configs("project_id", "experiment_id", [versionable_metric])
+
+        assert len(scorers) == 1  # Should return one valid scorer
+        assert len(local_scorers) == 0  # No local scorers
+
+        # Verify get_scorer_version was called with the correct parameters
+        mock_scorers_instance.get_scorer_version.assert_called_once_with(scorer_id="3", version=2)
+
+        # Test mixed input types
+        local_metric = LocalMetricConfig(name="length", scorer_fn=lambda x: len(x))
+
+        scorers, local_scorers = Experiments.create_metric_configs(
+            "project_id", "experiment_id", ["metric1", local_metric, Metric(name="metric2")]
+        )
+
+        assert len(scorers) == 2  # Should return two valid scorers
+        assert len(local_scorers) == 1  # One local scorer
