Skip to content

Commit 41574e5

Browse files
authored
Merge pull request #131 from statisticsnorway/ensure-gs-prefix
Add missing 'storage options' in to_json. Ensure gs prefix
2 parents 7a7fb71 + c347d68 commit 41574e5

File tree

3 files changed

+18
-3
lines changed

3 files changed

+18
-3
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "dapla-toolbelt"
3-
version = "2.0.10"
3+
version = "2.0.11"
44
description = "Dapla Toolbelt"
55
authors = ["Dapla Developers <dapla-platform-developers@ssb.no>"]
66
license = "MIT"

src/dapla/pandas.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ def read_pandas(
7171
A Pandas DataFrame containing the selected dataset.
7272
7373
"""
74+
if isinstance(gcs_path, str):
75+
gcs_path = FileClient._ensure_gcs_uri_prefix(gcs_path)
76+
7477
if isinstance(gcs_path, list) and file_format != "parquet":
7578
raise ValueError("Multiple paths are only supported for parquet format")
7679
match SupportedFileFormat(file_format):
@@ -158,6 +161,9 @@ def write_pandas(
158161
"""
159162
import pyarrow.parquet
160163

164+
if isinstance(gcs_path, str):
165+
gcs_path = FileClient._ensure_gcs_uri_prefix(gcs_path)
166+
161167
match SupportedFileFormat(file_format):
162168
case SupportedFileFormat.PARQUET:
163169
# Transform and write pandas dataframe
@@ -178,7 +184,7 @@ def write_pandas(
178184
**kwargs,
179185
)
180186
case SupportedFileFormat.JSON:
181-
df.to_json(gcs_path, **kwargs)
187+
df.to_json(gcs_path, **kwargs, storage_options=_get_storage_options()) # type: ignore [call-overload]
182188
case SupportedFileFormat.CSV:
183189
df.to_csv(gcs_path, storage_options=_get_storage_options(), **kwargs)
184190
case SupportedFileFormat.XML:

tests/test_pandas.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ def test_read_csv_format(
5757
mock_google_creds.token = None
5858
auth_client_mock.fetch_google_credentials.return_value = mock_google_creds
5959
file_client_mock.get_gcs_file_system.return_value = LocalFileSystem()
60+
file_client_mock._ensure_gcs_uri_prefix.return_value = "gs://tests/data/fruits.csv"
6061
read_csv_mock.return_value = read_csv("tests/data/fruits.csv")
6162
result = read_pandas("gs://tests/data/fruits.csv", file_format="csv")
6263
print(result.head(5))
@@ -70,6 +71,7 @@ def test_read_csv_format(
7071
def test_read_sas7bdat_format(file_client_mock: Mock) -> None:
7172
file_client_mock.get_gcs_file_system.return_value = LocalFileSystem()
7273
file_client_mock._remove_gcs_uri_prefix.return_value = "tests/data/sasdata.sas7bdat"
74+
file_client_mock._ensure_gcs_uri_prefix.return_value = "tests/data/sasdata.sas7bdat"
7375
result = read_pandas(
7476
"tests/data/sasdata.sas7bdat", file_format="sas7bdat", encoding="latin1"
7577
)
@@ -89,6 +91,7 @@ def test_read_excel_format(
8991
read_excel_mock.return_value = read_excel("tests/data/people.xlsx")
9092
file_client_mock.get_gcs_file_system.return_value = LocalFileSystem()
9193
file_client_mock._remove_gcs_uri_prefix.return_value = "tests/data/people.xlsx"
94+
file_client_mock._ensure_gcs_uri_prefix.return_value = "gs://tests/data/people.xlsx"
9295
result = read_pandas("gs://tests/data/people.xlsx", file_format="excel")
9396
print(result)
9497
assert sum(result["Alder"].to_list()) == 81
@@ -107,7 +110,7 @@ def test_write_excel_format(
107110
mock_google_creds.token = None
108111
auth_client_mock.fetch_google_credentials.return_value = mock_google_creds
109112
file_client_mock.get_gcs_file_system.return_value = LocalFileSystem()
110-
113+
file_client_mock._ensure_gcs_uri_prefix.return_value = "gs://tests/output/test.xlsx"
111114
data = {"age": [23, 30, 77, 32]}
112115
df = pd.DataFrame(data, index=["June", "Robert", "Lily", "David"])
113116
to_excel_mock.return_value = None
@@ -127,6 +130,7 @@ def test_write_csv_format(
127130
mock_google_creds.token = None
128131
file_client_mock.get_gcs_file_system.return_value = LocalFileSystem()
129132
auth_client_mock.fetch_google_credentials.return_value = mock_google_creds
133+
file_client_mock._ensure_gcs_uri_prefix.return_value = "gs://tests/output/test.csv"
130134
# Create pandas dataframe
131135
data = {"apples": [3, 2, 0, 1], "oranges": [0, 3, 7, 2]}
132136
df = pd.DataFrame(data, index=["June", "Robert", "Lily", "David"])
@@ -147,6 +151,9 @@ def test_read_xml_format(
147151
mock_google_creds.token = None
148152
file_client_mock.get_gcs_file_system.return_value = LocalFileSystem()
149153
auth_client_mock.fetch_google_credentials.return_value = mock_google_creds
154+
file_client_mock._ensure_gcs_uri_prefix.return_value = (
155+
"gs://tests/data/students.xml"
156+
)
150157
read_xml_mock.return_value = read_xml("tests/data/students.xml")
151158
result = read_pandas("gs://tests/data/students.xml", file_format="xml")
152159
assert result["email"][3] == "skrue@mail.com"
@@ -159,6 +166,7 @@ def test_read_xml_format(
159166
def test_read_partitioned_parquet(file_client_mock: Mock) -> None:
160167
file_client_mock.get_gcs_file_system.return_value = LocalFileSystem()
161168
file_client_mock._remove_gcs_uri_prefix.return_value = "tests/data/partition"
169+
file_client_mock._ensure_gcs_uri_prefix.return_value = "gs://tests/data/partition"
162170
result = read_pandas("tests/data/partition")
163171
print(result.head(5))
164172
assert result["innskudd"][1] == 2000
@@ -174,6 +182,7 @@ def test_write_xml_format(
174182
mock_google_creds.token = None
175183
file_client_mock.get_gcs_file_system.return_value = LocalFileSystem()
176184
auth_client_mock.fetch_google_credentials.return_value = mock_google_creds
185+
file_client_mock._ensure_gcs_uri_prefix.return_value = "gs://tests/output/test.xml"
177186
# Create pandas dataframe
178187
data = {"apples": [3, 2, 0, 1], "oranges": [0, 3, 7, 2]}
179188
df = pd.DataFrame(data, index=["June", "Robert", "Lily", "David"])

0 commit comments

Comments
 (0)