Skip to content

Commit 12f8f89

Browse files
authored
feat: add Base64 encoding/decoding for media data (#102)
1 parent d932bf1 commit 12f8f89

File tree

3 files changed

+102
-2
lines changed

3 files changed

+102
-2
lines changed

src/mosaico/media.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
from __future__ import annotations
22

3+
import base64
4+
import binascii
35
import contextlib
46
import io
57
import mimetypes
8+
import re
69
import uuid
710
from collections.abc import Generator
811
from typing import IO, Any, cast
@@ -11,14 +14,18 @@
1114
from pydantic import BaseModel
1215
from pydantic.config import ConfigDict
1316
from pydantic.fields import Field
14-
from pydantic.functional_validators import model_validator
17+
from pydantic.functional_serializers import field_serializer
18+
from pydantic.functional_validators import field_validator, model_validator
1519
from typing_extensions import Self
1620

1721
from mosaico.config import settings
1822
from mosaico.integrations.base.adapters import Adapter
1923
from mosaico.types import FilePath
2024

2125

26+
# Shape check for Base64 text given as ASCII bytes: one or more characters from
# the standard alphabet (A-Z, a-z, 0-9, "+", "/") followed by at most two "="
# padding characters. It checks shape only; it does not verify the length.
_BASE64_BYTE_PATTERN = re.compile(rb"^[A-Za-z0-9+/]+={0,2}$")
27+
28+
2229
class Media(BaseModel):
2330
"""
2431
Represents a media object.
@@ -72,6 +79,34 @@ def validate_media(cls, values: dict[str, Any]) -> Any:
7279

7380
return values
7481

82+
@field_validator("data")
@classmethod
def decode_base64_data(cls, v: bytes | str | None) -> bytes | str | None:
    """
    Convert a string that looks like padded Base64 into the bytes it encodes.

    A string is decoded only when all of the following hold:

    * it is pure ASCII,
    * its length is a multiple of four,
    * it matches ``_BASE64_BYTE_PATTERN``,
    * it ends with ``=`` padding,
    * strict decoding (``validate=True``) succeeds.

    Anything else -- bytes, ``None``, or a string failing any check -- is
    returned unchanged.

    NOTE: because trailing ``=`` is required, Base64 text without padding
    (payloads whose length is a multiple of three) is left as a string.

    :param v: The value of the ``data`` field.
    :return: The decoded bytes, or ``v`` itself when it is not decoded.
    """
    if not isinstance(v, str):
        return v

    try:
        candidate = v.encode("ascii")
    except UnicodeEncodeError:
        # Base64 text is ASCII-only, so this cannot be an encoded payload.
        return v

    looks_like_base64 = (
        candidate.endswith(b"=")
        and len(candidate) % 4 == 0
        and _BASE64_BYTE_PATTERN.match(candidate) is not None
    )
    if not looks_like_base64:
        return v

    try:
        return base64.b64decode(candidate, validate=True)
    except binascii.Error:
        # Best effort: keep the original string if strict decoding rejects it.
        return v
100+
101+
@field_serializer("data", when_used="json")
def encode_base64_data(self, v: bytes | str | None) -> str:
    """
    Encode the ``data`` field as Base64 text when serializing to JSON.

    Only ``bytes`` values are encoded; any other value is returned unchanged.

    :param v: The current value of the ``data`` field.
    :return: The Base64 text for bytes input, otherwise ``v`` as given.
    """
    # NOTE(review): non-bytes values (including None) pass through, so the
    # declared ``str`` return type is narrower than what can be returned.
    if isinstance(v, bytes):
        # Base64 output is always ASCII, so decode it as ASCII instead of with
        # a configurable text encoding, which would fail or garble the result
        # for codecs that are not ASCII-compatible (e.g. UTF-16).
        return base64.b64encode(v).decode("ascii")
    return v
109+
75110
@classmethod
76111
def from_path(
77112
cls,

src/mosaico/transcription_aligners/sequence_matcher.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def align(self, transcription: Transcription, original_text: str) -> Transcripti
4747
# No change needed, keep original words and time ranges
4848
logger.debug("No change needed, keeping original words and time ranges.")
4949
for word in words_with_time_ranges[i1:i2]:
50-
fixed_words.append(word.copy())
50+
fixed_words.append(word.model_copy())
5151

5252
elif tag == "replace":
5353
# Replace words but adapt their time ranges

tests/test_media.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import base64
2+
13
import pytest
24
from pydantic import ValidationError
35

@@ -27,6 +29,69 @@ def test_creation_with_path():
2729
assert media.metadata == {}
2830

2931

32+
def test_decode_base64_str_to_bytes():
    """A padded Base64 string given as ``data`` is stored as the decoded bytes."""
    payload = b"hello world \xf0\x9f\x8c\x8d"
    encoded = base64.b64encode(payload).decode("ascii")

    media = Media(data=encoded, path=None)

    assert isinstance(media.data, bytes)
    assert media.data == payload
38+
39+
40+
def test_encode_bytes_to_base64_json():
    """Dumping in JSON mode renders bytes ``data`` as a Base64 string."""
    payload = b"foo bar baz"

    dumped = Media(data=payload, path=None).model_dump(mode="json")
    encoded = dumped["data"]

    assert isinstance(encoded, str)
    assert base64.b64decode(encoded) == payload
47+
48+
49+
def test_round_trip_via_model_dump_and_parse():
    """Binary data survives a JSON dump followed by JSON validation."""
    payload = b"\x00\x01\x02binary\xff"
    encoded = base64.b64encode(payload).decode("ascii")

    source = Media.from_data(encoded)
    restored = Media.model_validate_json(source.model_dump_json())

    assert isinstance(restored.data, bytes)
    assert restored.data == payload
57+
58+
59+
def test_non_base64_string_pass_through():
    """Text that is not Base64-shaped is kept as the same string."""
    text = "not base64!!!"

    media = Media(data=text)

    assert isinstance(media.data, str)
    assert media.data == text
64+
65+
66+
def test_non_ascii_string_pass_through():
    """Text containing non-ASCII characters is kept as the same string."""
    text = "hello 😊"

    media = Media(data=text)

    assert isinstance(media.data, str)
    assert media.data == text
71+
72+
73+
def test_length_not_multiple_of_four_pass_through():
    """Base64-alphabet text whose length is not a multiple of four stays a string."""
    text = "YWxsb3c"
    # Sanity check on the fixture itself before exercising the model.
    assert len(text) % 4 != 0

    media = Media(data=text)

    assert isinstance(media.data, str)
    assert media.data == text
79+
80+
81+
def test_empty_string_pass_through():
    """An empty string is kept as an empty string."""
    text = ""

    media = Media(data=text)

    assert isinstance(media.data, str)
    assert media.data == text
86+
87+
88+
def test_bytes_input_unchanged():
    """Bytes passed as ``data`` are stored exactly as given."""
    payload = b"abcd"

    media = Media(data=payload)

    assert isinstance(media.data, bytes)
    assert media.data == payload
93+
94+
3095
def test_creation_with_mime_type():
3196
media = Media(data="test content", mime_type="text/plain")
3297
assert media.data == "test content"

0 commit comments

Comments
 (0)