diff --git a/google/cloud/storage/_opentelemetry_tracing.py b/google/cloud/storage/_opentelemetry_tracing.py index 3416081cd..b654aae2b 100644 --- a/google/cloud/storage/_opentelemetry_tracing.py +++ b/google/cloud/storage/_opentelemetry_tracing.py @@ -18,7 +18,7 @@ import os from contextlib import contextmanager - +from urllib.parse import urlparse from google.api_core import exceptions as api_exceptions from google.api_core import retry as api_retry from google.cloud.storage import __version__ @@ -28,7 +28,15 @@ ENABLE_OTEL_TRACES_ENV_VAR = "ENABLE_GCS_PYTHON_CLIENT_OTEL_TRACES" _DEFAULT_ENABLE_OTEL_TRACES_VALUE = False -enable_otel_traces = os.environ.get( + +def _parse_bool_env(name: str, default: bool = False) -> bool: + val = os.environ.get(name, None) + if val is None: + return default + return str(val).strip().lower() in {"1", "true", "yes", "on"} + + +enable_otel_traces = _parse_bool_env( ENABLE_OTEL_TRACES_ENV_VAR, _DEFAULT_ENABLE_OTEL_TRACES_VALUE ) logger = logging.getLogger(__name__) @@ -105,11 +113,10 @@ def _set_api_request_attr(request, client): if request.get("method"): attr["http.request.method"] = request.get("method") if request.get("path"): - path = request.get("path") - full_path = f"{client._connection.API_BASE_URL}{path}" - attr["url.full"] = full_path - if request.get("timeout"): - attr["connect_timeout,read_timeout"] = request.get("timeout") + full_url = client._connection.build_api_url(request.get("path")) + attr.update(_get_opentelemetry_attributes_from_url(full_url, strip_query=True)) + if "timeout" in request: + attr["connect_timeout,read_timeout"] = str(request.get("timeout")) return attr @@ -117,3 +124,26 @@ def _set_retry_attr(retry, conditional_predicate=None): predicate = conditional_predicate if conditional_predicate else retry._predicate retry_info = f"multiplier{retry._multiplier}/deadline{retry._deadline}/max{retry._maximum}/initial{retry._initial}/predicate{predicate}" return {"retry": retry_info} + + +def _get_opentelemetry_attributes_from_url(url, strip_query=True): + """Helper to assemble OpenTelemetry span attributes from a URL.""" + u = urlparse(url) + netloc = u.netloc + # u.hostname is always lowercase. We parse netloc to preserve casing. + # netloc format: [userinfo@]host[:port] + if "@" in netloc: + netloc = netloc.split("@", 1)[1] + if ":" in netloc and not netloc.endswith("]"): # Handle IPv6 literal + netloc = netloc.split(":", 1)[0] + + attributes = { + "server.address": netloc, + "server.port": u.port, + "url.scheme": u.scheme, + "url.path": u.path, + } + if not strip_query: + attributes["url.query"] = u.query + + return attributes diff --git a/google/cloud/storage/blob.py b/google/cloud/storage/blob.py index b7d5d698a..3c69e9b9a 100644 --- a/google/cloud/storage/blob.py +++ b/google/cloud/storage/blob.py @@ -47,6 +47,9 @@ from google.cloud._helpers import _rfc3339_nanos_to_datetime from google.cloud._helpers import _to_bytes from google.cloud.exceptions import NotFound +from google.cloud.storage._opentelemetry_tracing import ( + _get_opentelemetry_attributes_from_url, +) from google.cloud.storage._helpers import _add_etag_match_headers from google.cloud.storage._helpers import _add_generation_match_parameters from google.cloud.storage._helpers import _PropertyMixin @@ -1067,13 +1070,11 @@ def _do_download( Please enable this as per your use case. """ - extra_attributes = { - "url.full": download_url, - "download.chunk_size": f"{self.chunk_size}", - "download.raw_download": raw_download, - "upload.checksum": f"{checksum}", - "download.single_shot_download": single_shot_download, - } + extra_attributes = _get_opentelemetry_attributes_from_url(download_url) + extra_attributes["download.chunk_size"] = f"{self.chunk_size}" + extra_attributes["download.raw_download"] = raw_download + extra_attributes["upload.checksum"] = f"{checksum}" + extra_attributes["download.single_shot_download"] = single_shot_download args = {"timeout": timeout} if self.chunk_size is None: @@ -2063,10 +2064,8 @@ def _do_multipart_upload( upload_url, headers=headers, checksum=checksum, retry=retry ) - extra_attributes = { - "url.full": upload_url, - "upload.checksum": f"{checksum}", - } + extra_attributes = _get_opentelemetry_attributes_from_url(upload_url) + extra_attributes["upload.checksum"] = f"{checksum}" args = {"timeout": timeout} with create_trace_span( name="Storage.MultipartUpload/transmit", @@ -2396,11 +2395,10 @@ def _do_resumable_upload( retry=retry, command=command, ) - extra_attributes = { - "url.full": upload.resumable_url, - "upload.chunk_size": upload.chunk_size, - "upload.checksum": f"{checksum}", - } + extra_attributes = _get_opentelemetry_attributes_from_url(upload.resumable_url) + extra_attributes["upload.chunk_size"] = upload.chunk_size + extra_attributes["upload.checksum"] = f"{checksum}" + args = {"timeout": timeout} with create_trace_span( name="Storage.ResumableUpload/transmitNextChunk", diff --git a/tests/unit/test__opentelemetry_tracing.py b/tests/unit/test__opentelemetry_tracing.py index bdbb40fd2..907f8447a 100644 --- a/tests/unit/test__opentelemetry_tracing.py +++ b/tests/unit/test__opentelemetry_tracing.py @@ -136,7 +136,7 @@ def test_get_final_attributes(setup, setup_optin): } api_request = { "method": "GET", - "path": "/foo/bar/baz", + "path": "/foo/bar/baz?sensitive=true", "timeout": (100, 100), } retry_obj = api_retry.Retry() @@ -147,15 +147,19 @@ def test_get_final_attributes(setup, setup_optin): "rpc.system": "http", "user_agent.original": f"gcloud-python/{__version__}", "http.request.method": "GET", - "url.full": "https://testOtel.org/foo/bar/baz", - "connect_timeout,read_timeout": (100, 100), + "server.address": "testOtel.org", + "url.path": "/foo/bar/baz", + "url.scheme": "https", + "connect_timeout,read_timeout": str((100, 100)), "retry": f"multiplier{retry_obj._multiplier}/deadline{retry_obj._deadline}/max{retry_obj._maximum}/initial{retry_obj._initial}/predicate{retry_obj._predicate}", } expected_attributes.update(_opentelemetry_tracing._cloud_trace_adoption_attrs) with mock.patch("google.cloud.storage.client.Client") as test_client: test_client.project = "test_project" - test_client._connection.API_BASE_URL = "https://testOtel.org" + test_client._connection.build_api_url.return_value = ( + "https://testOtel.org/foo/bar/baz?sensitive=true" + ) with _opentelemetry_tracing.create_trace_span( test_span_name, attributes=test_span_attributes, @@ -165,6 +169,7 @@ def test_get_final_attributes(setup, setup_optin): ) as span: assert span is not None assert span.name == test_span_name + assert "url.query" not in span.attributes assert span.attributes == expected_attributes @@ -196,23 +201,73 @@ def test_set_conditional_retry_attr(setup, setup_optin): assert span.attributes == expected_attributes -def test_set_api_request_attr(): - from google.cloud.storage import Client +def test__get_opentelemetry_attributes_from_url(): + url = "https://example.com:8080/path?query=true" + expected = { + "server.address": "example.com", + "server.port": 8080, + "url.scheme": "https", + "url.path": "/path", + } + # Test stripping query + attrs = _opentelemetry_tracing._get_opentelemetry_attributes_from_url( + url, strip_query=True + ) + assert attrs == expected + assert "url.query" not in attrs - test_client = Client() - args_method = {"method": "GET"} - expected_attributes = {"http.request.method": "GET"} - attr = _opentelemetry_tracing._set_api_request_attr(args_method, test_client) - assert attr == expected_attributes + # Test not stripping query + expected["url.query"] = "query=true" + attrs = _opentelemetry_tracing._get_opentelemetry_attributes_from_url( + url, strip_query=False + ) + assert attrs == expected - args_path = {"path": "/foo/bar/baz"} - expected_attributes = {"url.full": "https://storage.googleapis.com/foo/bar/baz"} - attr = _opentelemetry_tracing._set_api_request_attr(args_path, test_client) - assert attr == expected_attributes - args_timeout = {"timeout": (100, 100)} +def test__get_opentelemetry_attributes_from_url_with_query(): + url = "https://example.com/path?query=true&another=false" + expected = { + "server.address": "example.com", + "server.port": None, + "url.scheme": "https", + "url.path": "/path", + "url.query": "query=true&another=false", + } + # Test not stripping query + attrs = _opentelemetry_tracing._get_opentelemetry_attributes_from_url( + url, strip_query=False + ) + assert attrs == expected + + +def test_set_api_request_attr_with_pii_in_query(): + client = mock.Mock() + client._connection.build_api_url.return_value = ( + "https://example.com/path?sensitive=true&token=secret" + ) + + request = { + "method": "GET", + "path": "/path?sensitive=true&token=secret", + "timeout": 60, + } expected_attributes = { - "connect_timeout,read_timeout": (100, 100), + "http.request.method": "GET", + "server.address": "example.com", + "server.port": None, + "url.scheme": "https", + "url.path": "/path", + "connect_timeout,read_timeout": "60", } - attr = _opentelemetry_tracing._set_api_request_attr(args_timeout, test_client) + attr = _opentelemetry_tracing._set_api_request_attr(request, client) assert attr == expected_attributes + assert "url.query" not in attr # Ensure query with PII is not captured + + +def test_set_api_request_attr_no_timeout(): + client = mock.Mock() + client._connection.build_api_url.return_value = "https://example.com/path" + + request = {"method": "GET", "path": "/path"} + attr = _opentelemetry_tracing._set_api_request_attr(request, client) + assert "connect_timeout,read_timeout" not in attr diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index db8094a95..99de31961 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -684,6 +684,7 @@ def test__list_resource_w_defaults(self): credentials = _make_credentials() client = self._make_one(project=project, credentials=credentials) connection = client._base_connection = _make_connection() + connection.build_api_url = mock.Mock(return_value="http://example.com" + path) iterator = client._list_resource( path=path, @@ -719,6 +720,7 @@ def test__list_resource_w_explicit(self): credentials = _make_credentials() client = self._make_one(project=project, credentials=credentials) connection = client._base_connection = _make_connection() + connection.build_api_url = mock.Mock(return_value="http://example.com" + path) iterator = client._list_resource( path=path,