Skip to content

Commit 33666d6

Browse files
Add SHACL
* checking checksum correctly when downloading files * add shacl_validate * increase version to 2.4.0
1 parent 25aef6d commit 33666d6

File tree

18 files changed

+1217
-301
lines changed

18 files changed

+1217
-301
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
Log of changes in the versions
44

5+
## v2.4.0
6+
7+
- added support for SHACL through function `shacl_validate`
8+
59
## v2.4.0-rc.2
610

711
- fix issues with ZenodoRecord

CITATION.cff

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ authors:
1111
given-names: "Lucas"
1212
orcid: "https://orcid.org/0000-0002-4116-0065"
1313
title: "h5rdmtoolbox - HDF5 Research Data Management Toolbox"
14-
version: 2.4.0-rc.2
15-
doi: 10.5281/zenodo.17334652
16-
date-released: 2025-10-12
14+
version: 2.4.0
15+
doi: 10.5281/zenodo.17390960
16+
date-released: 2025-10-19
1717
url: "https://github.com/matthiasprobst/h5rdmtoolbox"

codemeta.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"license": "https://spdx.org/licenses/MIT",
55
"codeRepository": "git+https://github.com/matthiasprobst/h5RDMtoolbox.git",
66
"name": "h5RDMtoolbox",
7-
"version": "2.4.0-rc.2",
7+
"version": "2.4.0",
88
"description": "Supporting a FAIR Research Data lifecycle using Python and HDF5.",
99
"applicationCategory": "Engineering",
1010
"programmingLanguage": [

docs/colab/quickstart.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
"scrolled": true
88
},
99
"outputs": [],
10-
"source": "# !pip install h5rdmtoolbox==2.4.0rc2"
10+
"source": "# !pip install h5rdmtoolbox==2.4.0"
1111
},
1212
{
1313
"cell_type": "code",

docs/gettingstarted/quickoverview.ipynb

Lines changed: 334 additions & 112 deletions
Large diffs are not rendered by default.

docs/userguide/wrapper/FAIRAttributes.ipynb

Lines changed: 186 additions & 79 deletions
Large diffs are not rendered by default.

h5rdmtoolbox/__init__.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
import json
5555
from .wrapper.accessor import register_accessor
5656

57+
from .ld.shacl import validate_hdf, ValidationResult
5758
# noinspection PyUnresolvedReferences
5859
from .utils import DownloadFileManager
5960

@@ -169,6 +170,62 @@ def dump_jsonld(
169170
)
170171

171172

173+
# def shacl_validate(
174+
# *,
175+
# hdf_data: Union[str, rdflib.Graph] = None,
176+
# hdf_source: Union[h5py.File, pathlib.Path] = None,
177+
# shacl_data: Union[str, rdflib.Graph] = None,
178+
# shacl_source: Union[str, pathlib.Path] = None,
179+
# hdf_file_uri="https://example.org/hdf5file#",
180+
# shacl_format: str = 'turtle',
181+
# hdf_data_format: str = 'turtle',
182+
# **pyshacl_kwargs
183+
# ) -> ValidationResult:
184+
# """Validate HDF5 file content against SHACL shapes.
185+
#
186+
# Parameters
187+
# ----------
188+
# hdf_data : Union[str, rdflib.Graph], optional
189+
# RDF data of the HDF5 file as string or rdflib.Graph. If not
190+
# provided, `hdf_source` must be provided.
191+
# hdf_source : Union[h5py.File, pathlib.Path], optional
192+
# HDF5 file or h5py.File object to extract RDF data from. If not
193+
# provided, `hdf_data` must be provided.
194+
# shacl_data : Union[str, rdflib.Graph], optional
195+
# SHACL shapes as string or rdflib.Graph. If not provided,
196+
# `shacl_source` must be provided.
197+
# shacl_source : Union[str, pathlib.Path], optional
198+
# File path to SHACL shapes. If not provided, `shacl_data`
199+
# must be provided.
200+
# hdf_file_uri : str, optional
201+
# The file URI to use for the HDF5 file when extracting RDF data.
202+
# Default is "https://example.org/hdf5file#".
203+
# shacl_format : str, optional
204+
# The format of the SHACL shapes if `shacl_data` is provided as
205+
# string. Default is 'turtle'.
206+
# hdf_data_format : str, optional
207+
# The format of the HDF5 RDF data if `hdf_data` is provided as
208+
# string. Default is 'turtle'.
209+
# **pyshacl_kwargs
210+
# Additional keyword arguments passed to pyshacl.validate().
211+
#
212+
# Returns
213+
# -------
214+
# ValidationResult
215+
# The result of the SHACL validation.
216+
# """
217+
# return validate_hdf(
218+
# hdf_data=hdf_data,
219+
# hdf_source=hdf_source,
220+
# shacl_data=shacl_data,
221+
# shacl_source=shacl_source,
222+
# hdf_file_uri=hdf_file_uri,
223+
# shacl_format=shacl_format,
224+
# hdf_data_format=hdf_data_format,
225+
# **pyshacl_kwargs
226+
# )
227+
228+
172229
def dump_jsonld_depr(hdf_filename: Union[str, pathlib.Path],
173230
skipND: int = 1,
174231
structural: bool = True,

h5rdmtoolbox/ld/shacl.py

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
import pathlib
2+
from dataclasses import dataclass
3+
from typing import Union, List
4+
5+
import h5py
6+
import rdflib
7+
from pyshacl import validate as pyshacl_validate
8+
9+
from .hdf.file import get_ld as get_hdf_ld
10+
from .user.file import get_ld as get_contextual_ld
11+
12+
13+
@dataclass
class ValidationResult:
    """Outcome of a SHACL validation run, as returned by ``validate_hdf``.

    Field order is part of the positional constructor signature and must
    not be changed.
    """

    # True if the data graph conforms to the SHACL shapes
    conforms: bool
    # RDF graph with the validation results
    results_graph: rdflib.Graph
    # textual summary of the validation results
    results_text: str
    # the sh:resultMessage literals found in ``results_graph``, as strings
    messages: List[str]
19+
20+
21+
def _parse_shacl(shacl: Union[str, pathlib.Path, rdflib.Graph], format=None) -> rdflib.Graph:
    """Return SHACL shapes as an ``rdflib.Graph``.

    Parameters
    ----------
    shacl : Union[str, pathlib.Path, rdflib.Graph]
        The shapes. A graph is returned unchanged. A ``pathlib.Path`` is
        parsed as a file. A string is interpreted, in this order, as the
        path of an existing file, as a URL (``http://``, ``https://`` or
        ``file://``), or as serialized RDF data.
    format : str, optional
        RDF serialization format. If None, the format of a file is guessed
        from its name via ``rdflib.util.guess_format`` and strings/URLs are
        assumed to be Turtle.

    Returns
    -------
    rdflib.Graph
        The parsed (or passed-through) shapes graph.

    Raises
    ------
    TypeError
        If ``shacl`` is not a ``pathlib.Path``, ``str`` or ``rdflib.Graph``.
    """
    if isinstance(shacl, rdflib.Graph):
        return shacl

    if isinstance(shacl, pathlib.Path):
        if format is None:
            format = rdflib.util.guess_format(str(shacl))
        shacl_graph = rdflib.Graph()
        # honour an explicitly requested format instead of always guessing
        shacl_graph.parse(source=str(shacl), format=format)
        return shacl_graph

    if isinstance(shacl, str):
        # Decide up front whether the string names a file. Probing the file
        # system can itself fail for long or odd strings (e.g. a complete
        # Turtle document), which simply means "not a file".
        try:
            is_file = pathlib.Path(shacl).is_file()
        except (OSError, ValueError):
            is_file = False
        if is_file:
            return _parse_shacl(pathlib.Path(shacl), format)

        if format is None:
            format = 'turtle'
        shacl_graph = rdflib.Graph()
        candidate = shacl.strip()
        is_url = '\n' not in candidate and candidate.startswith(('http://', 'https://', 'file://'))
        if is_url:
            shacl_graph.parse(source=candidate, format=format)
        else:
            # inline RDF must go through ``data=``; the first positional
            # argument of Graph.parse is a *location*, not content
            shacl_graph.parse(data=shacl, format=format)
        return shacl_graph

    raise TypeError('shacl must be a pathlib.Path, str, or rdflib.Graph')
41+
42+
43+
def validate_hdf(
        *,
        hdf_data: Union[str, rdflib.Graph] = None,
        hdf_source: Union[h5py.File, str, pathlib.Path] = None,
        shacl_data: Union[str, rdflib.Graph] = None,
        shacl_source: Union[str, pathlib.Path] = None,
        hdf_file_uri="https://example.org/hdf5file#",
        shacl_format: str = 'turtle',
        hdf_data_format: str = 'turtle',
        **pyshacl_kwargs
) -> ValidationResult:
    """
    Validate an HDF5 file against SHACL shapes.

    Exactly one of ``hdf_data``/``hdf_source`` and exactly one of
    ``shacl_data``/``shacl_source`` must be given.

    Parameters
    ----------
    hdf_data: Union[str, rdflib.Graph], optional
        The RDF description of the HDF5 file as a string or rdflib.Graph.
        A string is parsed using ``hdf_data_format``.
    hdf_source: Union[h5py.File, str, pathlib.Path], optional
        An open h5py.File or the path to an HDF5 file. A path is opened
        read-only for the duration of the validation.
    shacl_data: Union[str, rdflib.Graph], optional
        The SHACL shapes as a string or rdflib.Graph. A string is parsed
        using ``shacl_format``.
    shacl_source: Union[str, pathlib.Path], optional
        The path to the SHACL shapes file. It is parsed using
        ``shacl_format``.
    hdf_file_uri: str, optional
        Passed as ``file_uri`` to the linked-data extraction when the RDF
        data is generated from ``hdf_source``.
        Default is "https://example.org/hdf5file#".
    shacl_format: str, optional
        The format of the SHACL shapes (string or file). Default is 'turtle'.
    hdf_data_format: str, optional
        The format of the HDF5 data string. Default is 'turtle'.
    **pyshacl_kwargs:
        Additional keyword arguments to pass to pyshacl.validate().

    Returns
    -------
    ValidationResult
        The result of the validation containing:
        - conforms: bool
        - results_graph: rdflib.Graph
        - results_text: str
        - messages: List[str]

    Raises
    ------
    ValueError
        If both or neither of ``hdf_data``/``hdf_source`` or of
        ``shacl_data``/``shacl_source`` are provided.
    TypeError
        If ``hdf_data``, ``hdf_source`` or ``shacl_data`` has an unsupported type.
    FileNotFoundError
        If the path given as ``hdf_source`` or ``shacl_source`` does not exist.
    """
    if shacl_data is not None and shacl_source is not None:
        raise ValueError('Only one of "shacl_data" or "shacl_source" should be provided.')
    if shacl_data is None and shacl_source is None:
        raise ValueError('One of "shacl_data" or shacl_source must be provided.')

    if hdf_data is not None and hdf_source is not None:
        raise ValueError('Only one of "hdf_data" or "hdf_source" should be provided.')
    if hdf_data is None and hdf_source is None:
        raise ValueError('One of "hdf_data" or "hdf_source" must be provided.')

    if hdf_data is not None:
        if isinstance(hdf_data, str):
            h5_graph = rdflib.Graph()
            # ``data=`` marks the string as RDF content (a positional argument
            # would be read as a location); use the caller-selected format
            h5_graph.parse(data=hdf_data, format=hdf_data_format)
        elif isinstance(hdf_data, rdflib.Graph):
            h5_graph = hdf_data
        else:
            raise TypeError(f'Parameter "hdf_data" must be a str or rdflib.Graph, but is {type(hdf_data)}')
    else:
        if isinstance(hdf_source, (str, pathlib.Path)):
            if not pathlib.Path(hdf_source).exists():
                raise FileNotFoundError(f'HDF5 file source "{hdf_source}" not found.')
            # re-enter with the opened file so it is closed again once the
            # validation has finished; every option must be forwarded
            with h5py.File(hdf_source, 'r') as h5f:
                return validate_hdf(
                    hdf_data=None,
                    hdf_source=h5f,
                    shacl_data=shacl_data,
                    shacl_source=shacl_source,
                    hdf_file_uri=hdf_file_uri,
                    shacl_format=shacl_format,
                    hdf_data_format=hdf_data_format,
                    **pyshacl_kwargs
                )
        if not isinstance(hdf_source, h5py.File):
            raise TypeError('Parameter "hdf_source" must be an h5py.File or a path to an HDF5 file.')
        # the data graph is the union of the graph from the HDF linked-data
        # extractor and the one from the user/contextual extractor
        h5_graph1 = get_hdf_ld(hdf_source, file_uri=hdf_file_uri, skipND=True)
        h5_graph2 = get_contextual_ld(hdf_source, file_uri=hdf_file_uri)
        h5_graph = h5_graph1 + h5_graph2

    if shacl_data is not None:
        if isinstance(shacl_data, str):
            shacl_graph = rdflib.Graph()
            shacl_graph.parse(data=shacl_data, format=shacl_format)
        elif isinstance(shacl_data, rdflib.Graph):
            shacl_graph = shacl_data
        else:
            raise TypeError('Parameter "shacl_data" must be a str or rdflib.Graph')
    else:
        # shacl is a filename:
        if not pathlib.Path(shacl_source).exists():
            raise FileNotFoundError(f'SHACL file source "{shacl_source}" not found.')
        shacl_graph = rdflib.Graph()
        shacl_graph.parse(source=shacl_source, format=shacl_format)

    conforms, results_graph, results_text = _validate_graphs(
        h5_graph,
        shacl_graph,
        **pyshacl_kwargs
    )
    return ValidationResult(
        conforms=conforms,
        results_graph=results_graph,
        results_text=results_text,
        messages=_get_messages(results_graph)
    )
149+
150+
151+
def _validate_graphs(
        data_graph,
        shacl_graph,
        inference='rdfs',
        abort_on_first=False,
        meta_shacl=False,
        advanced=False,
        debug=False,
        **pyshacl_kwargs):
    """
    Validate a data graph against a SHACL shapes graph.

    Thin wrapper around ``pyshacl.validate`` that fixes the defaults used
    by this package and forwards everything else unchanged.

    Parameters:
    - data_graph: The RDF graph containing the data to be validated.
    - shacl_graph: The RDF graph containing the SHACL shapes.
    - inference: Type of inference to apply; passed to pyshacl as is
      (pyshacl documents 'rdfs', 'owlrl', 'both' and 'none').
    - abort_on_first: If True, stop validation on the first error found.
    - meta_shacl: If True, enable Meta-SHACL features.
    - advanced: If True, enable advanced SHACL features.
    - debug: If True, enable debug output.
    - **pyshacl_kwargs: Any further keyword arguments accepted by
      ``pyshacl.validate``. Accepting them here keeps callers that forward
      user-supplied pyshacl options from failing with a TypeError.

    Returns:
    - A tuple (conforms, results_graph, results_text) where:
        - conforms: Boolean indicating if the data graph conforms to the shapes.
        - results_graph: An RDF graph with validation results.
        - results_text: A textual summary of the validation results.
    """
    conforms, results_graph, results_text = pyshacl_validate(
        data_graph,
        shacl_graph=shacl_graph,
        inference=inference,
        abort_on_first=abort_on_first,
        meta_shacl=meta_shacl,
        advanced=advanced,
        debug=debug,
        **pyshacl_kwargs
    )

    return conforms, results_graph, results_text
189+
190+
191+
def _get_messages(results_graph):
    """
    Collect the human-readable messages of a SHACL validation report.

    Parameters:
    - results_graph: An RDF graph containing SHACL validation results.

    Returns:
    - A list with the string form of every ``sh:resultMessage`` object
      found in the graph (empty if there is none).
    """
    message_pattern = (None, rdflib.namespace.SH.resultMessage, None)
    return [str(message) for _, _, message in results_graph.triples(message_pattern)]

h5rdmtoolbox/repository/interface.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def __init__(self,
4343
download_url,
4444
access_url,
4545
checksum,
46+
checksum_algorithm,
4647
name,
4748
size,
4849
media_type,
@@ -51,6 +52,7 @@ def __init__(self,
5152
self.download_url = download_url
5253
self.access_url = access_url
5354
self.checksum = checksum
55+
self.checksum_algorithm = checksum_algorithm
5456
self.name = name
5557
self.media_type = media_type
5658
self.size = size
@@ -106,10 +108,14 @@ def download(self, target_folder: Optional[Union[str, pathlib.Path]] = None) ->
106108
"""Download the file to target_folder. If None, local user dir is used.
107109
Returns the file location"""
108110
from .utils import download_file
109-
return download_file(file_url=self.download_url,
110-
target_folder=target_folder,
111-
access_token=self.access_token)
112-
111+
return download_file(
112+
file_url=self.download_url,
113+
filename=self.name,
114+
target_folder=target_folder,
115+
access_token=self.access_token,
116+
checksum=self.checksum,
117+
checksum_algorithm=self.checksum_algorithm,
118+
)
113119

114120

115121
class RepositoryInterface(abc.ABC):

0 commit comments

Comments
 (0)