
Commit e5cc856

Merge branch 'main' into scorers-enum
2 parents 48644db + 086f7d1

File tree: 9 files changed, +271 -30 lines changed

poetry.lock

Lines changed: 4 additions & 7 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 3 additions & 3 deletions
@@ -1,6 +1,6 @@
 [project]
 name = "galileo"
-version = "1.1.0"
+version = "1.2.0"
 description = "Client library for the Galileo platform."
 authors = [{ name = "Galileo Technologies Inc.", email = "team@galileo.ai" }]
 readme = "README.md"
@@ -18,7 +18,7 @@ langchain-core = { version = "^0.3.61", optional = true }
 openai = { version = "<1.84.0", optional = true }
 openai-agents = { version = "<0.0.13", optional = true }

-galileo-core = "~=3.42.0"
+galileo-core = "~=3.44.0"

 [tool.poetry.group.test.dependencies]
 pytest = "^8.4.0"
@@ -28,7 +28,7 @@ pytest-xdist = "^3.7.0"
 pytest-socket = "^0.7"
 pytest-asyncio = "^0.26.0"
 requests-mock = "^1.11.0"
-galileo-core = { extras = ["testing"], version = "~=3.42.0" }
+galileo-core = { extras = ["testing"], version = "~=3.44.0" }

 pytest-env = "^1.1.5"
 langchain-core = "^0.3.61"

src/galileo/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -14,4 +14,4 @@
 from galileo_core.schemas.logging.step import StepType
 from galileo_core.schemas.logging.trace import Trace

-__version__ = "1.1.0"
+__version__ = "1.2.0"
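
A quick way to confirm the bump once this commit is installed (a minimal sketch; the expected string comes from the diff above):

import galileo

print(galileo.__version__)  # expected to print "1.2.0" after this release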

src/galileo/experiments.py

Lines changed: 32 additions & 15 deletions
@@ -18,7 +18,7 @@
 )
 from galileo.resources.models import ExperimentResponse, HTTPValidationError, PromptRunSettings, ScorerConfig, TaskType
 from galileo.schema.datasets import DatasetRecord
-from galileo.schema.metrics import GalileoScorers, LocalMetricConfig
+from galileo.schema.metrics import GalileoScorers, LocalMetricConfig, Metric
 from galileo.scorers import Scorers, ScorerSettings
 from galileo.utils.datasets import load_dataset_and_records

@@ -88,21 +88,40 @@ def list(self, project_id: str) -> Optional[Union[HTTPValidationError, list["Exp

     @staticmethod
     def create_metric_configs(
-        project_id: str, experiment_id: str, metrics: builtins.list[Union[GalileoScorers, LocalMetricConfig, str]]
+        project_id: str,
+        experiment_id: str,
+        metrics: builtins.list[Union[GalileoScorers, Metric, LocalMetricConfig, str]],
     ) -> tuple[builtins.list[ScorerConfig], builtins.list[LocalMetricConfig]]:
-        scorers = []
-        scorer_names = [
-            metric.value if isinstance(metric, GalileoScorers) else metric
-            for metric in metrics
-            if isinstance(metric, GalileoScorers) or isinstance(metric, str)
-        ]
-        if scorer_names:
+        local_metric_configs: list[LocalMetricConfig] = []
+        scorer_name_versions: list[tuple[str, Optional[int]]] = []
+        for metric in metrics:
+            if isinstance(metric, GalileoScorers):
+                scorer_name_versions.append((metric.value, None))
+            elif isinstance(metric, Metric):
+                scorer_name_versions.append((metric.name, metric.version))
+            elif isinstance(metric, LocalMetricConfig):
+                local_metric_configs.append(metric)
+            elif isinstance(metric, str):
+                scorer_name_versions.append((metric, None))
+            else:
+                raise ValueError(f"Unknown metric type: {type(metric)}")
+
+        scorers: list[ScorerConfig] = []
+        if scorer_name_versions:
             all_scorers = Scorers().list()
             known_metrics = {metric.name: metric for metric in all_scorers}
             unknown_metrics = []
-            for metric in scorer_names:
-                if metric in known_metrics:
-                    scorers.append(ScorerConfig.from_dict(known_metrics[metric].to_dict()))
+            for scorer_name, scorer_version in scorer_name_versions:
+                if scorer_name in known_metrics:
+                    raw_metric_dict = known_metrics[scorer_name].to_dict()
+
+                    # Set the version on the ScorerConfig if provided
+                    if scorer_version is not None:
+                        raw_version = Scorers().get_scorer_version(
+                            scorer_id=raw_metric_dict["id"], version=scorer_version
+                        )
+                        raw_metric_dict["scorer_version"] = raw_version.to_dict()
+                    scorers.append(ScorerConfig.from_dict(raw_metric_dict))
                 else:
                     unknown_metrics.append(metric)
             if unknown_metrics:
@@ -112,8 +131,6 @@ def create_metric_configs(
                 )
             ScorerSettings().create(project_id=project_id, run_id=experiment_id, scorers=scorers)

-        local_metric_configs = [metric for metric in metrics if isinstance(metric, LocalMetricConfig)]
-
         return scorers, local_metric_configs

     def run(
@@ -216,7 +233,7 @@ def run_experiment(
     dataset: Optional[Union[Dataset, list[dict[str, str]], str]] = None,
     dataset_id: Optional[str] = None,
     dataset_name: Optional[str] = None,
-    metrics: Optional[list[Union[GalileoScorers, LocalMetricConfig, str]]] = None,
+    metrics: Optional[list[Union[GalileoScorers, Metric, LocalMetricConfig, str]]] = None,
     function: Optional[Callable] = None,
 ) -> Any:
     """

src/galileo/schema/metrics.py

Lines changed: 21 additions & 3 deletions
@@ -1,14 +1,12 @@
 from typing import Callable, Generic, Optional, TypeVar, Union

-from pydantic import BaseModel, Field, ValidationError, field_validator
+from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
 from pydantic_core.core_schema import ValidationInfo

 from galileo_core.schemas.logging.span import Span
 from galileo_core.schemas.logging.step import StepType
 from galileo_core.schemas.logging.trace import Trace
 from galileo_core.schemas.shared.metric import MetricValueType
-
-# ruff: noqa: F401
 from galileo_core.schemas.shared.scorers.scorer_name import ScorerName as GalileoScorers

 MetricType = TypeVar("MetricType", bound=MetricValueType)
@@ -36,3 +34,23 @@ def set_aggregatable_types(cls, value: list[StepType], info: ValidationInfo) ->
         if step_type not in [StepType.workflow, StepType.trace]:
             raise ValidationError("aggregatable_types can only contain trace or workflow steps")
         return value
+
+
+class Metric(BaseModel):
+    name: str = Field(
+        description="The name of the metric you want to run a specific version of (ie: 'Sentence Density')."
+    )
+    version: Optional[int] = Field(
+        default=None,
+        description="The version of the metric (ie: 1, 2, 3, etc.). If None is provided, the 'default' version will be used.",
+    )
+
+    @model_validator(mode="after")
+    def validate_name_and_version(self) -> "Metric":
+        preset_metric_names = [scorer.value for scorer in GalileoScorers]
+        if self.name in preset_metric_names:
+            if self.version is not None:
+                raise ValueError(
+                    f"Galileo metric's '{self.name}' do not support versioning at this time. Please use the default version."
+                )
+        return self
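
The new validator only gates preset names: a custom metric may pin a version, while a preset must leave version unset. A short sketch of the three cases (the custom metric name is illustrative):

from pydantic import ValidationError

from galileo.schema.metrics import GalileoScorers, Metric

Metric(name="my_custom_metric")             # ok: default version of a custom scorer
Metric(name="my_custom_metric", version=3)  # ok: pinned version of a custom scorer

preset_name = list(GalileoScorers)[0].value  # any preset scorer name
try:
    Metric(name=preset_name, version=1)      # rejected: presets do not support versioning yet
except ValidationError as exc:
    print(exc)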

src/galileo/scorers.py

Lines changed: 21 additions & 1 deletion
@@ -1,7 +1,11 @@
 from typing import Optional, Union
+from uuid import UUID

 from galileo.base import BaseClientModel
-from galileo.resources.api.data import list_scorers_with_filters_scorers_list_post
+from galileo.resources.api.data import (
+    get_scorer_version_or_latest_scorers_scorer_id_version_get,
+    list_scorers_with_filters_scorers_list_post,
+)
 from galileo.resources.api.run_scorer_settings import (
     upsert_scorers_config_projects_project_id_runs_run_id_scorer_settings_post,
 )
@@ -14,6 +18,7 @@
     ScorerTypeFilterOperator,
     ScorerTypes,
 )
+from galileo.resources.models.base_scorer_version_response import BaseScorerVersionResponse
 from galileo.resources.models.run_scorer_settings_patch_request import RunScorerSettingsPatchRequest
 from galileo.resources.models.run_scorer_settings_response import RunScorerSettingsResponse
 from galileo.resources.types import Unset
@@ -33,6 +38,21 @@ def list(self, types: list[ScorerTypes] = None) -> Union[Unset, list[ScorerRespo
         result = list_scorers_with_filters_scorers_list_post.sync(client=self.client, body=body)
         return result.scorers

+    def get_scorer_version(self, scorer_id: UUID, version: int) -> Union[Unset, BaseScorerVersionResponse]:
+        """
+        Args:
+            scorer_id: UUID
+                ID of the scorer.
+            version: int
+                Version of the scorer.
+        Returns:
+            Scorer version response if found, otherwise None
+        """
+        result = get_scorer_version_or_latest_scorers_scorer_id_version_get.sync(
+            scorer_id=scorer_id, version=version, client=self.client
+        )
+        return result
+

 class ScorerSettings(BaseClientModel):
     def create(
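
create_metric_configs calls this helper with the id pulled from Scorers().list() whenever a Metric carries a version; it can also be used directly. A minimal sketch, assuming a custom scorer with this name and version exists (both are illustrative):

from galileo.scorers import Scorers

scorers_client = Scorers()
target = next(s for s in scorers_client.list() if s.name == "my_custom_metric")  # hypothetical scorer

version_info = scorers_client.get_scorer_version(scorer_id=target.id, version=2)
print(version_info.to_dict())  # this dict is what create_metric_configs stores under "scorer_version"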

tests/schemas/test_metrics.py

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+import pytest
+from pydantic import ValidationError
+
+from galileo.schema.metrics import Metric
+from galileo_core.schemas.shared.scorers.scorer_name import ScorerName
+
+
+def test_metric_validator_preset_with_version():
+    """Test that creating a Metric with a preset name and version raises a ValidationError"""
+    # Get a valid value from the ScorerName enum
+    # First, get all the available enum values
+    preset_names = [scorer.value for scorer in ScorerName]
+    # Make sure there's at least one value
+    assert preset_names, "No values found in ScorerName enum"
+    preset_name = preset_names[0]
+
+    # Attempt to create a Metric with a preset name and a version
+    with pytest.raises(ValidationError) as exc_info:
+        Metric(name=preset_name, version=1)
+
+    # Verify the error message
+    assert f"Galileo metric's '{preset_name}' do not support versioning at this time" in str(exc_info.value)
+
+
+def test_metric_validator_preset_no_version():
+    """Test that creating a Metric with a preset name and no version is valid"""
+    # Get a valid value from the ScorerName enum
+    preset_names = [scorer.value for scorer in ScorerName]
+    assert preset_names, "No values found in ScorerName enum"
+    preset_name = preset_names[0]
+
+    # Create a Metric with a preset name and no version
+    metric = Metric(name=preset_name)
+
+    # Verify the metric is created correctly
+    assert metric.name == preset_name
+    assert metric.version is None
+
+
+def test_metric_validator_custom_with_version():
+    """Test that creating a Metric with a custom name and version is valid"""
+    # Create a Metric with a custom name and a version
+    metric = Metric(name="my_custom_metric", version=2)
+
+    # Verify the metric is created correctly
+    assert metric.name == "my_custom_metric"
+    assert metric.version == 2
+
+
+def test_metric_validator_custom_no_version():
+    """Test that creating a Metric with a custom name and no version is valid"""
+    # Create a Metric with a custom name and no version
+    metric = Metric(name="my_custom_metric")
+
+    # Verify the metric is created correctly
+    assert metric.name == "my_custom_metric"
+    assert metric.version is None

tests/test_experiments.py

Lines changed: 75 additions & 0 deletions
@@ -120,6 +120,14 @@ def complex_trace_function(input):
     return output


+def mock_scorer_version_response():
+    mock_response = MagicMock()
+    mock_response.id = "mock_scorer_version_id"
+    mock_response.version = 1
+    mock_response.to_dict.return_value = {"id": "mock_scorer_version_id", "version": 1}
+    return mock_response
+
+
 class TestExperiments:
     @patch("galileo.experiments.create_experiment_projects_project_id_experiments_post")
     def test_create(self, galileo_resources_api_create_experiment: Mock):
@@ -270,6 +278,9 @@ def test_run_experiment_without_metrics(
     @patch.object(galileo.experiments.Experiments, "create", return_value=experiment_response())
     @patch.object(galileo.experiments.Experiments, "get", return_value=experiment_response())
     @patch.object(galileo.experiments.Projects, "get", return_value=project())
+    @patch.object(galileo.experiments.Scorers, "list", return_value=scorers())
+    @patch.object(galileo.experiments.Scorers, "get_scorer_version", return_value=mock_scorer_version_response())
+    @patch.object(galileo.experiments.ScorerSettings, "create")
     @pytest.mark.parametrize("thread_pool", [True, False])
     @pytest.mark.parametrize(
         ["function", "metrics", "num_spans", "span_type", "results", "aggregate_results"],
@@ -332,6 +343,9 @@
     )
     def test_run_experiment_with_func(
         self,
+        mock_scorer_settings_create: Mock,
+        mock_get_scorer_version: Mock,
+        mock_scorers_list: Mock,
        mock_get_project: Mock,
        mock_get_experiment: Mock,
        mock_create_experiment: Mock,
@@ -603,8 +617,14 @@ def test_run_experiment_with_prompt_template_and_local_dataset(self, local_datas
     @patch.object(galileo.experiments.Experiments, "create", return_value=experiment_response())
     @patch.object(galileo.experiments.Experiments, "get", return_value=experiment_response())
     @patch.object(galileo.experiments.Projects, "get", return_value=project())
+    @patch.object(galileo.experiments.Scorers, "list", return_value=scorers())
+    @patch.object(galileo.experiments.Scorers, "get_scorer_version", return_value=mock_scorer_version_response())
+    @patch.object(galileo.experiments.ScorerSettings, "create", return_value=None)
     def test_run_experiment_with_local_scorers_and_prompt_template(
         self,
+        mock_scorer_settings_create: Mock,
+        mock_get_scorer_version: Mock,
+        mock_scorers_list: Mock,
        mock_get_project: Mock,
        mock_get_experiment: Mock,
        mock_create_experiment: Mock,
@@ -652,3 +672,58 @@ def test_create_scorer_configs(self, mock_scorer_settings, mock_scorers):
         # Test unknown metrics
         with pytest.raises(ValueError):
             Experiments.create_metric_configs("project_id", "experiment_id", ["unknown_metric"])
+
+    @patch("galileo.experiments.Scorers")
+    @patch("galileo.experiments.ScorerSettings")
+    def test_create_scorer_configs_with_metric_objects(self, mock_scorer_settings, mock_scorers):
+        # Setup mock return values
+        mock_scorers_instance = mock_scorers.return_value
+
+        # Create mock scorer responses
+        mock_scorers = [
+            ScorerResponse.from_dict({"id": "1", "name": "metric1", "scorer_type": "preset", "tags": ["test"]}),
+            ScorerResponse.from_dict({"id": "2", "name": "metric2", "scorer_type": "preset", "tags": ["test"]}),
+            ScorerResponse.from_dict({"id": "3", "name": "versionable_metric", "scorer_type": "llm", "tags": ["test"]}),
+        ]
+
+        mock_scorers_instance.list.return_value = mock_scorers
+
+        # Mock the get_scorer_version method
+        mock_version_response = MagicMock()
+        mock_version_response.to_dict.return_value = {"id": "version1", "version": 2}
+        mock_scorers_instance.get_scorer_version.return_value = mock_version_response
+
+        from galileo.schema.metrics import Metric
+
+        # Test with Metric objects (without version)
+        metric1 = Metric(name="metric1")
+        metric2 = Metric(name="metric2")
+
+        scorers, local_scorers = Experiments.create_metric_configs("project_id", "experiment_id", [metric1, metric2])
+
+        assert len(scorers) == 2  # Should return two valid scorers
+        assert len(local_scorers) == 0  # No local scorers
+
+        # Verify get_scorer_version was not called (since no version was specified)
+        mock_scorers_instance.get_scorer_version.assert_not_called()
+
+        # Test with a Metric object with version
+        versionable_metric = Metric(name="versionable_metric", version=2)
+
+        scorers, local_scorers = Experiments.create_metric_configs("project_id", "experiment_id", [versionable_metric])
+
+        assert len(scorers) == 1  # Should return one valid scorer
+        assert len(local_scorers) == 0  # No local scorers
+
+        # Verify get_scorer_version was called with the correct parameters
+        mock_scorers_instance.get_scorer_version.assert_called_once_with(scorer_id="3", version=2)
+
+        # Test mixed input types
+        local_metric = LocalMetricConfig(name="length", scorer_fn=lambda x: len(x))
+
+        scorers, local_scorers = Experiments.create_metric_configs(
+            "project_id", "experiment_id", ["metric1", local_metric, Metric(name="metric2")]
+        )
+
+        assert len(scorers) == 2  # Should return two valid scorers
+        assert len(local_scorers) == 1  # One local scorer
