Skip to content

Commit 8fb00da

Browse files
authored
Merge pull request #160 from statisticsnorway/restrict-numpy
Remove Spark workaround. Loosen pyarrow constraints
2 parents 9ca2760 + faea90a commit 8fb00da

File tree

13 files changed

+1381
-1168
lines changed

13 files changed

+1381
-1168
lines changed

.vscode/settings.json

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
{
2+
"python.testing.pytestArgs": [
3+
"tests"
4+
],
5+
"python.testing.unittestEnabled": false,
6+
"python.testing.pytestEnabled": true,
27
"python.testing.unittestArgs": [
38
"-v",
49
"-s",
510
"./tests",
611
"-p",
712
"test*.py"
8-
],
9-
"python.testing.pytestEnabled": false,
10-
"python.testing.unittestEnabled": true
13+
]
1114
}

poetry.lock

Lines changed: 1343 additions & 1092 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "dapla-toolbelt"
3-
version = "2.0.17"
3+
version = "2.0.18"
44
description = "Dapla Toolbelt"
55
authors = ["Dapla Developers <dapla-platform-developers@ssb.no>"]
66
license = "MIT"
@@ -17,7 +17,7 @@ Changelog = "https://github.com/statisticsnorway/dapla-toolbelt/releases"
1717
[tool.poetry.dependencies]
1818
python = ">=3.10,<4.0"
1919
requests = ">=2.27.1"
20-
pyarrow = ">=14.0.2, <15" # This tight constraint is dependent on fixing https://issues.apache.org/jira/browse/ARROW-7867
20+
pyarrow = ">=14.0.2"
2121
pandas = { version = ">=1.4.2", extras = ["excel", "xml"] }
2222
gcsfs = ">=2022.7.1"
2323
ipython = ">=8.10.0"
@@ -78,6 +78,10 @@ warn_unreachable = true
7878
pretty = true
7979
show_column_numbers = true
8080
show_error_context = true
81+
disallow_untyped_calls = false
82+
# The setting above is needed because the Google libraries are untyped.
83+
# Note that this *does* still enforce type hints on functions defined in dapla-toolbelt,
84+
# but it allows calling untyped functions from a typed context.
8185

8286
[[tool.mypy.overrides]]
8387
# Allow missing type hints in third-party libraries without type information.
@@ -87,6 +91,7 @@ module = [
8791
"fsspec.*",
8892
"responses.*",
8993
"tomli.*",
94+
"google.*",
9095
"google.cloud.*",
9196
]
9297
ignore_missing_imports = true

src/dapla/auth.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ def fetch_local_user_from_jupyter() -> dict[str, Any]:
9494
hub = HubAuth()
9595
response = requests.get(
9696
os.environ["LOCAL_USER_PATH"],
97-
headers={"Authorization": "token %s" % hub.api_token},
97+
headers={"Authorization": f"token {hub.api_token}"},
9898
cert=(hub.certfile, hub.keyfile),
9999
verify=hub.client_ca,
100100
allow_redirects=False,
@@ -229,6 +229,6 @@ def _print_warning(self) -> None:
229229
display(
230230
HTML(
231231
'Your session has timed out. Please <a href="/hub/login">log in</a> to continue.'
232-
) # type: ignore [no-untyped-call]
232+
)
233233
)
234234
)

src/dapla/collector.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def start(self, specification: dict[str, Any]) -> Response:
2727
collector_response = requests.put(
2828
self.collector_url,
2929
headers={
30-
"Authorization": "Bearer %s" % keycloak_token,
30+
"Authorization": f"Bearer {keycloak_token}",
3131
"Content-type": "application/json",
3232
},
3333
data=json.dumps(specification),
@@ -42,7 +42,7 @@ def running_tasks(self) -> Response:
4242
"""Get all running collector tasks."""
4343
keycloak_token = AuthClient.fetch_personal_token()
4444
collector_response = requests.get(
45-
self.collector_url, headers={"Authorization": "Bearer %s" % keycloak_token}
45+
self.collector_url, headers={"Authorization": f"Bearer {keycloak_token}"}
4646
)
4747
collector_response.raise_for_status()
4848
return collector_response
@@ -59,7 +59,7 @@ def stop(self, task_id: int) -> Response:
5959
keycloak_token = AuthClient.fetch_personal_token()
6060
collector_response = requests.delete(
6161
f"{self.collector_url}/{task_id}",
62-
headers={"Authorization": "Bearer %s" % keycloak_token},
62+
headers={"Authorization": f"Bearer {keycloak_token}"},
6363
)
6464
if collector_response.status_code == 400:
6565
print(

src/dapla/converter.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def start(self, job_config: dict[str, Any]) -> Response:
2727
converter_response = requests.post(
2828
f"{self.converter_url}/jobs",
2929
headers={
30-
"Authorization": "Bearer %s" % keycloak_token,
30+
"Authorization": f"Bearer {keycloak_token}",
3131
"Content-type": "application/json",
3232
},
3333
data=json.dumps(job_config),
@@ -50,7 +50,7 @@ def start_simulation(self, job_config: dict[str, Any]) -> Response:
5050
converter_response = requests.post(
5151
f"{self.converter_url}/jobs/simulation",
5252
headers={
53-
"Authorization": "Bearer %s" % keycloak_token,
53+
"Authorization": f"Bearer {keycloak_token}",
5454
"Content-type": "application/json",
5555
},
5656
data=json.dumps(job_config),
@@ -72,7 +72,7 @@ def get_job_summary(self, job_id: str) -> Response:
7272
job_summary = requests.get(
7373
f"{self.converter_url}/jobs/{job_id}/execution-summary",
7474
headers={
75-
"Authorization": "Bearer %s" % keycloak_token,
75+
"Authorization": f"Bearer {keycloak_token}",
7676
"Content-type": "application/json",
7777
},
7878
)
@@ -93,7 +93,7 @@ def stop_job(self, job_id: str) -> Response:
9393
job_status = requests.post(
9494
f"{self.converter_url}/jobs/{job_id}/stop",
9595
headers={
96-
"Authorization": "Bearer %s" % keycloak_token,
96+
"Authorization": f"Bearer {keycloak_token}",
9797
"Content-type": "application/json",
9898
},
9999
)
@@ -122,7 +122,7 @@ def get_pseudo_report(self, job_id: str) -> Response:
122122
pseudo_report = requests.get(
123123
f"{self.converter_url}/jobs/{job_id}/reports/pseudo",
124124
headers={
125-
"Authorization": "Bearer %s" % keycloak_token,
125+
"Authorization": f"Bearer {keycloak_token}",
126126
"Content-type": "application/json",
127127
},
128128
)
@@ -145,7 +145,7 @@ def get_pseudo_schema(self, job_id: str) -> Response:
145145
pseudo_report = requests.get(
146146
f"{self.converter_url}/jobs/{job_id}/reports/pseudo-schema-hierarchy",
147147
headers={
148-
"Authorization": "Bearer %s" % keycloak_token,
148+
"Authorization": f"Bearer {keycloak_token}",
149149
"Content-type": "application/json",
150150
},
151151
)

src/dapla/doctor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,11 +68,11 @@ def gcs_credentials_valid() -> bool:
6868
print("Checking your Google Cloud Storage credentials...")
6969

7070
# Fetch the google token
71-
google_token, _ = AuthClient.fetch_google_credentials().token
71+
google_token = AuthClient.fetch_google_credentials().token
7272

7373
try:
7474
requests.get(
75-
"https://oauth2.googleapis.com/tokeninfo?access_token=%s" % google_token
75+
f"https://oauth2.googleapis.com/tokeninfo?access_token={google_token}"
7676
)
7777
except HttpError as ex:
7878
if str(ex) == "Invalid Credentials, 401":

src/dapla/gcs.py

Lines changed: 0 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -14,50 +14,8 @@ def __init__(
1414
) -> None:
1515
"""Initialize GCSFileSystem."""
1616
super().__init__(token=token, **kwargs)
17-
# Temporary bug fix for https://issues.apache.org/jira/browse/ARROW-7867
18-
# Spark writes an empty file to GCS (to mimic a folder structure) before writing partitioned data
19-
# Resolve this by ignoring the "empty" file when reading partitioned parquet files
20-
try:
21-
# Constant is moved to core module in Pyarrow 10.0.0
22-
from pyarrow.parquet.core import ( # type: ignore [attr-defined]
23-
EXCLUDED_PARQUET_PATHS,
24-
)
25-
except ImportError:
26-
# Fallback for Pyarrow versions <10.0.0
27-
from pyarrow.parquet.core import ( # type: ignore [attr-defined]
28-
EXCLUDED_PARQUET_PATHS,
29-
)
30-
from pyarrow.parquet import ParquetManifest
31-
32-
EXCLUDED_PARQUET_PATHS.add("")
33-
ParquetManifest._should_silently_exclude = ( # type: ignore [attr-defined]
34-
GCSFileSystem._should_silently_exclude
35-
)
3617

3718
def isdir(self, path: str) -> bool:
3819
"""Check if path is a directory."""
3920
info = super(gcsfs.GCSFileSystem, self).info(path)
4021
return t.cast(bool, info["type"] == "directory")
41-
42-
@staticmethod
43-
# This code is from pyarrow.parquet.core
44-
def _should_silently_exclude(file_name: str) -> bool:
45-
try:
46-
# Constant is moved to core module in Pyarrow 10.0.0
47-
from pyarrow.parquet.core import ( # type: ignore [attr-defined]
48-
EXCLUDED_PARQUET_PATHS,
49-
)
50-
except ImportError:
51-
# Fallback for Pyarrow versions <10.0.0
52-
from pyarrow.parquet.core import ( # type: ignore [attr-defined]
53-
EXCLUDED_PARQUET_PATHS,
54-
)
55-
56-
return (
57-
file_name.endswith(".crc")  # Checksums
58-
or file_name.endswith("_$folder$")  # HDFS directories in S3
59-
or file_name.startswith(".")  # Hidden files starting with .
60-
or file_name.startswith("_")  # Hidden files starting with _
61-
or ".tmp" in file_name  # Temp files
62-
or file_name in EXCLUDED_PARQUET_PATHS
63-
)

src/dapla/guardian.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def call_api(
4242
api_response = requests.get(
4343
api_endpoint_url,
4444
headers={
45-
"Authorization": "Bearer %s" % maskinporten_token,
45+
"Authorization": f"Bearer {maskinporten_token}",
4646
"Accept": "application/json",
4747
},
4848
)
@@ -74,7 +74,7 @@ def get_guardian_token(
7474
guardian_response = requests.post(
7575
guardian_endpoint,
7676
headers={
77-
"Authorization": "Bearer %s" % keycloak_token,
77+
"Authorization": f"Bearer {keycloak_token}",
7878
"Content-type": "application/json",
7979
},
8080
json=body,

src/dapla/jupyterhub.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def generate_api_token(
2323
+ os.environ["JUPYTERHUB_USER"]
2424
+ "/tokens",
2525
json=body,
26-
headers={"Authorization": "token %s" % hub.api_token},
26+
headers={"Authorization": f"token {hub.api_token}"},
2727
allow_redirects=False,
2828
)
2929
hub_response.raise_for_status()

0 commit comments

Comments
 (0)