Merge pull request #46 from speechmatics/feature/add-speaker-id-to-rt

TudorCRL · web-flow · commit 42943b1340f2 · 2025-10-13T15:39:40.000+01:00
Add speaker id to rt
diff --git a/sdk/batch/speechmatics/batch/_transport.py b/sdk/batch/speechmatics/batch/_transport.py
@@ -297,9 +297,9 @@ async def _prepare_headers(self) -> dict[str, str]:
             Headers dictionary with authentication and tracking info
         """
         auth_headers = await self._auth.get_auth_headers()
-        auth_headers[
-            "User-Agent"
-        ] = f"speechmatics-batch-v{get_version()} python/{sys.version_info.major}.{sys.version_info.minor}"
+        auth_headers["User-Agent"] = (
+            f"speechmatics-batch-v{get_version()} python/{sys.version_info.major}.{sys.version_info.minor}"
+        )
 
         if self._request_id:
             auth_headers["X-Request-Id"] = self._request_id
diff --git a/sdk/rt/speechmatics/rt/__init__.py b/sdk/rt/speechmatics/rt/__init__.py
@@ -25,6 +25,7 @@
 from ._models import ServerMessageType
 from ._models import SessionInfo
 from ._models import SpeakerDiarizationConfig
+from ._models import SpeakerIdentifier
 from ._models import TranscriptionConfig
 from ._models import TranscriptResult
 from ._models import TranslationConfig
@@ -53,6 +54,7 @@
     "SessionError",
     "SessionInfo",
     "SpeakerDiarizationConfig",
+    "SpeakerIdentifier",
     "StaticKeyAuth",
     "TimeoutError",
     "TranscriptResult",
diff --git a/sdk/rt/speechmatics/rt/_base_client.py b/sdk/rt/speechmatics/rt/_base_client.py
@@ -8,6 +8,8 @@
 from typing import Any
 from typing import Optional
 
+from typing_extensions import Self
+
 from ._auth import AuthBase
 from ._auth import StaticKeyAuth
 from ._events import EventEmitter
@@ -96,7 +98,7 @@ async def _ws_connect(self, ws_headers: Optional[dict] = None) -> None:
         await self._transport.connect(ws_headers)
         self._recv_task = asyncio.create_task(self._recv_loop())
 
-    async def __aenter__(self) -> _BaseClient:
+    async def __aenter__(self) -> Self:
         return self
 
     async def __aexit__(self, *args: Any) -> None:
diff --git a/sdk/rt/speechmatics/rt/_models.py b/sdk/rt/speechmatics/rt/_models.py
@@ -2,6 +2,7 @@
 
 from dataclasses import asdict
 from dataclasses import dataclass
+from dataclasses import field
 from enum import Enum
 from typing import Any
 from typing import Optional
@@ -61,7 +62,7 @@ class ClientMessageType(str, Enum):
         EndOfStream: Signals that no more audio data will be sent.
         SetRecognitionConfig: Updates transcription configuration during
             an active session (advanced use).
-        GetSpeakers: Internal, Speechmatics only message. Allows the client to request speaker data.
+        GetSpeakers: Allows the client to request speaker data.
 
     Examples:
         >>> # Starting a recognition session
@@ -110,7 +111,7 @@ class ServerMessageType(str, Enum):
             change for the given audio segment.
         AddPartialTranslation: Provides interim translation results that
             may change as more context becomes available.
-        SpeakerResult: Internal, Speechmatics only message containing the speakers data.
+        SpeakerResult: Provides the speaker identification data.
         Info: Informational messages from the server.
         Warning: Warning messages that don't stop transcription.
         Error: Error messages indicating transcription failure.
@@ -245,19 +246,58 @@ class SpeakerDiarizationConfig:
             is a close enough match, even if other speakers may be closer. This is useful
             for cases where we can flip incorrectly between similar speakers during a single
             speaker section.
+        speakers: (Optional) Add speaker identifiers to your session to identify specific speakers.
+            This is a list of SpeakerIdentifier objects generated in previous transcription sessions.
+            You can provide multiple identifiers for a single speaker to help the engine identify
+            the speaker more accurately.
 
     Examples:
         >>> config = SpeakerDiarizationConfig(
             max_speakers=2,
             speaker_sensitivity=0.8,
             prefer_current_speaker=True,
+            speakers=[
+                SpeakerIdentifier(label="Agent", speaker_identifiers=["agent_1"]),
+                SpeakerIdentifier(label="Customer", speaker_identifiers=["cust_1"]),
+            ],
         )
 
     """
 
     max_speakers: Optional[int] = None
     speaker_sensitivity: Optional[float] = None
     prefer_current_speaker: Optional[bool] = None
+    speakers: Optional[list[SpeakerIdentifier]] = None
+
+
+@dataclass
+class SpeakerIdentifier:
+    """Labeled speaker identifier for guided speaker diarization.
+
+    Use this to map one or more known speaker identifiers to a human-readable
+    label. When provided in `SpeakerDiarizationConfig.speakers`, the engine can
+    use these identifiers as hints to consistently assign the specified label.
+
+    Attributes:
+        label: Human-readable label to assign to this speaker or group
+            (e.g., "Agent", "Customer", "Alice").
+        speaker_identifiers: A list of string identifiers associated with this
+            speaker. These are the identifiers generated for the speaker in
+            previous transcription sessions; several can be supplied for one
+            speaker to help the engine identify them more accurately.
+
+    Examples:
+        >>> config = SpeakerDiarizationConfig(
+        ...     max_speakers=2,
+        ...     speakers=[
+        ...         SpeakerIdentifier(label="Agent", speaker_identifiers=["agent_1"]),
+        ...         SpeakerIdentifier(label="Customer", speaker_identifiers=["cust_1"]),
+        ...     ],
+        ... )
+    """
+
+    label: str = ""
+    speaker_identifiers: list[str] = field(default_factory=list)
 
 
 @dataclass