diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8748e04..affcef5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,6 +10,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v4 + with: + submodules: recursive - name: Setup Python environment uses: actions/setup-python@v5 @@ -23,7 +25,7 @@ jobs: - name: Validate run: | isort --check-only src/ tests/ - black --check --line-length 100 . + black --check --line-length 100 src/ tests/ mypy build-and-test: @@ -44,6 +46,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v4 + with: + submodules: recursive - name: Setup Python environment uses: actions/setup-python@v5 @@ -56,7 +60,7 @@ jobs: pip install -e .[build] - name: Test - run: py.test -vvs + run: py.test -vvs --ignore=spec/ - name: Build run: python setup.py build sdist bdist_wheel diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..2b13e85 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "spec"] + path = spec + url = https://github.com/package-url/purl-spec.git diff --git a/Makefile b/Makefile index 2b1593b..1ee5201 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,7 @@ PYTHON_EXE?=python3 ACTIVATE?=. bin/activate; VIRTUALENV_PYZ=thirdparty/virtualenv.pyz -BLACK_ARGS=--exclude=".cache|lib|bin|var" --line-length 100 +BLACK_ARGS=src/ --exclude="\.cache|lib|bin|var" --line-length 100 virtualenv: @echo "-> Bootstrap the virtualenv with PYTHON_EXE=${PYTHON_EXE}" @@ -52,7 +52,7 @@ isort: black: @echo "-> Apply black code formatter" - @${ACTIVATE} black ${BLACK_ARGS} . + @${ACTIVATE} black ${BLACK_ARGS} mypy: @echo "-> Type check the Python code." diff --git a/spec b/spec new file mode 160000 index 0000000..a627e02 --- /dev/null +++ b/spec @@ -0,0 +1 @@ +Subproject commit a627e02e97b3a43de3938c3d8f67da7a51395578 diff --git a/src/packageurl/__init__.py b/src/packageurl/__init__.py index 2b7e052..4d90f42 100644 --- a/src/packageurl/__init__.py +++ b/src/packageurl/__init__.py @@ -29,6 +29,7 @@ from collections.abc import Mapping from typing import TYPE_CHECKING from typing import Any +from typing import Optional from typing import Union from typing import overload from urllib.parse import quote as _percent_quote @@ -116,7 +117,7 @@ def normalize_namespace( namespace_str = namespace if isinstance(namespace, str) else namespace.decode("utf-8") namespace_str = namespace_str.strip().strip("/") - if ptype in ("bitbucket", "github", "pypi", "gitlab"): + if ptype in ("bitbucket", "github", "pypi", "gitlab", "composer"): namespace_str = namespace_str.lower() segments = [seg for seg in namespace_str.split("/") if seg.strip()] segments_quoted = map(get_quoter(encode), segments) @@ -124,7 +125,10 @@ def normalize_namespace( def normalize_name( - name: AnyStr | None, ptype: str | None, encode: bool | None = True + name: AnyStr | None, + qualifiers: Union[Union[str, bytes], dict[str, str], None], + ptype: str | None, + encode: bool | None = True, ) -> str | None: if not name: return None @@ -133,20 +137,38 @@ def normalize_name( quoter = get_quoter(encode) name_str = quoter(name_str) name_str = name_str.strip().strip("/") - if ptype in ("bitbucket", "github", "pypi", "gitlab"): + if ptype and ptype in ("mlflow"): + # MLflow purl names are case-sensitive for Azure ML, it is case sensitive and must be kept as-is in the package URL + # For Databricks, it is case insensitive and must be lowercased in the package URL + if isinstance(qualifiers, dict): + repo_url = qualifiers.get("repository_url") + if repo_url and "azureml" in repo_url.lower(): + return name_str + if repo_url and "databricks" in repo_url.lower(): + return name_str.lower() + if isinstance(qualifiers, str): + if "azureml" in qualifiers.lower(): + return name_str + if "databricks" in qualifiers.lower(): + return name_str.lower() + if ptype in ("bitbucket", "github", "pypi", "gitlab", "composer"): name_str = name_str.lower() if ptype == "pypi": name_str = name_str.replace("_", "-") return name_str or None -def normalize_version(version: AnyStr | None, encode: bool | None = True) -> str | None: +def normalize_version( + version: AnyStr | None, ptype: Optional[Union[str, bytes]], encode: bool | None = True +) -> str | None: if not version: return None version_str = version if isinstance(version, str) else version.decode("utf-8") quoter = get_quoter(encode) version_str = quoter(version_str.strip()) + if ptype and isinstance(ptype, str) and ptype in ("huggingface"): + return version_str.lower() return version_str or None @@ -304,8 +326,8 @@ def normalize( """ type_norm = normalize_type(type, encode) namespace_norm = normalize_namespace(namespace, type_norm, encode) - name_norm = normalize_name(name, type_norm, encode) - version_norm = normalize_version(version, encode) + name_norm = normalize_name(name, qualifiers, type_norm, encode) + version_norm = normalize_version(version, type, encode) qualifiers_norm = normalize_qualifiers(qualifiers, encode) subpath_norm = normalize_subpath(subpath, encode) return type_norm, namespace_norm, name_norm, version_norm, qualifiers_norm, subpath_norm @@ -464,8 +486,21 @@ def from_string(cls, purl: str) -> Self: if not type_ or not sep: raise ValueError(f"purl is missing the required type component: {purl!r}.") + if not all(c in string.ascii_letters + string.digits + "-._" for c in type_): + raise ValueError( + f"purl type must be composed only of ASCII letters and numbers, period, dash and underscore: {type_!r}." + ) + + if ":" in type_: + raise ValueError(f"purl type cannot contain a colon: {type_!r}.") + + if type_[0] in string.digits: + raise ValueError(f"purl type cannot start with a number: {type_!r}.") + type_ = type_.lower() + original_remainder = remainder + scheme, authority, path, qualifiers_str, subpath = _urlsplit( url=remainder, scheme="", allow_fragments=True ) @@ -480,7 +515,9 @@ def from_string(cls, purl: str) -> Self: path = authority + ":" + path if scheme: - path = scheme + ":" + path + # This is a way to preserve the casing of the original scheme + original_scheme = original_remainder.split(":", 1)[0] + path = original_scheme + ":" + path path = path.lstrip("/") diff --git a/tests/test_purl_spec.py b/tests/test_purl_spec.py new file mode 100644 index 0000000..951bbad --- /dev/null +++ b/tests/test_purl_spec.py @@ -0,0 +1,160 @@ +# Copyright (c) the purl authors +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# Visit https://github.com/package-url/packageurl-python for support and +# download. + +import json +import os + +import pytest + +from packageurl import PackageURL + +current_dir = os.path.dirname(__file__) +root_dir = os.path.abspath(os.path.join(current_dir, "..")) +spec_file_path = os.path.join(root_dir, "spec", "tests", "spec", "specification-test.json") + +valid_purl_types_file = os.path.join(root_dir, "spec", "purl-types-index.json") + + +with open(spec_file_path, "r", encoding="utf-8") as f: + test_cases = json.load(f) + +with open(valid_purl_types_file, "r", encoding="utf-8") as f: + valid_purl_types = json.load(f) + +tests = test_cases["tests"] + +parse_tests = [t for t in tests if t["test_type"] == "parse"] +build_tests = [t for t in tests if t["test_type"] == "build"] + + +@pytest.mark.parametrize( + "description, input_str, expected_output, expected_failure", + [ + (t["description"], t["input"], t["expected_output"], t["expected_failure"]) + for t in parse_tests + ], +) +def test_parse(description, input_str, expected_output, expected_failure): + if expected_failure: + with pytest.raises(Exception): + PackageURL.from_string(input_str) + # assert None ==PackageURL.from_string(input_str) + else: + result = PackageURL.from_string(input_str) + assert result.to_string() == expected_output + + +@pytest.mark.parametrize( + "description, input_dict, expected_output, expected_failure", + [ + (t["description"], t["input"], t["expected_output"], t["expected_failure"]) + for t in build_tests + ], +) +def test_build(description, input_dict, expected_output, expected_failure): + kwargs = { + "type": input_dict.get("type"), + "namespace": input_dict.get("namespace"), + "name": input_dict.get("name"), + "version": input_dict.get("version"), + "qualifiers": input_dict.get("qualifiers"), + "subpath": input_dict.get("subpath"), + } + + if expected_failure: + with pytest.raises(Exception): + PackageURL(**kwargs).to_string() + else: + purl = PackageURL(**kwargs) + assert purl.to_string() == expected_output + + +def load_spec_files(spec_dir): + """ + Load all JSON files from the given directory into a dictionary. + Key = filename, Value = parsed JSON content + """ + spec_data = {} + for filename in os.listdir(spec_dir): + if filename.endswith("-test.json"): + filepath = os.path.join(spec_dir, filename) + with open(filepath, "r", encoding="utf-8") as f: + try: + data = json.load(f) + spec_data[filename] = data["tests"] + except json.JSONDecodeError as e: + print(f"Error parsing {filename}: {e}") + return spec_data + + +SPEC_DIR = os.path.join(os.path.dirname(__file__), "..", "spec", "tests", "types") +spec_dict = load_spec_files(SPEC_DIR) + +flattened_cases = [] +for filename, cases in spec_dict.items(): + for case in cases: + flattened_cases.append((filename, case["description"], case)) + + +@pytest.mark.parametrize("filename,description,test_case", flattened_cases) +def test_package_type_case(filename, description, test_case): + test_type = test_case["test_type"] + expected_failure = test_case.get("expected_failure", False) + + if expected_failure: + with pytest.raises(Exception): + run_test_case(test_case, test_type, description) + else: + run_test_case(test_case, test_type, description) + + +def run_test_case(case, test_type, desc): + if test_type == "parse": + purl = PackageURL.from_string(case["input"]) + expected = case["expected_output"] + assert purl.type == expected["type"] + assert purl.namespace == expected["namespace"] + assert purl.name == expected["name"] + assert purl.version == expected["version"] + if expected["qualifiers"]: + assert purl.qualifiers == expected["qualifiers"] + else: + assert not purl.qualifiers + assert purl.subpath == expected["subpath"] + + elif test_type == "roundtrip": + purl = PackageURL.from_string(case["input"]) + assert purl.to_string() == case["expected_output"] + + elif test_type == "build": + input_data = case["input"] + purl = PackageURL( + type=input_data["type"], + namespace=input_data["namespace"], + name=input_data["name"], + version=input_data["version"], + qualifiers=input_data.get("qualifiers"), + subpath=input_data.get("subpath"), + ) + assert purl.to_string() == case["expected_output"]