diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index 74f5674..f506892 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -202,6 +202,9 @@ jobs: - name: Install Pytest run: pip install pytest pytest-mock + - name: Setup PlayWright + run: playwright install && playwright install-deps + - name: Run Pytest run: pytest --no-header -vv diff --git a/.github/workflows/generate-release.yml b/.github/workflows/generate-release.yml index 687efdc..290196d 100644 --- a/.github/workflows/generate-release.yml +++ b/.github/workflows/generate-release.yml @@ -76,6 +76,9 @@ jobs: - name: Install Pytest run: pip install pytest pytest-mock + - name: Setup PlayWright + run: playwright install && playwright install-deps + - name: Run Pytest run: pytest --no-header -vv diff --git a/.github/workflows/generate-test-release.yml b/.github/workflows/generate-test-release.yml index 8d73889..5a3260a 100644 --- a/.github/workflows/generate-test-release.yml +++ b/.github/workflows/generate-test-release.yml @@ -75,6 +75,9 @@ jobs: - name: Install Pytest run: pip install pytest pytest-mock + - name: Setup PlayWright + run: playwright install && playwright install-deps + - name: Run Pytest run: pytest --no-header -vv diff --git a/CITATION.cff b/CITATION.cff index c3245ae..451e733 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -3,8 +3,8 @@ message: If you use this software, please cite it using these metadata. title: PyPi Extractor abstract: Extract package information for a given user in PyPi. type: software -version: 0.1.2 -date-released: 2024-06-26 +version: 0.1.3 +date-released: 2024-12-12 repository-code: https://github.com/DevelopersToolbox/pypi-extractor-package keywords: - "Wolf Software" diff --git a/README.md b/README.md index 770e9c7..d25e899 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,22 @@ PyPI Extractor is a Python package designed to fetch and process detailed inform Python Package Index (PyPI). 
This package is particularly useful for users who want to retrieve and analyze metadata for packages maintained by a specific PyPI user. +## Significant Update From 0.1.3 + +pypi.org no longer allows you to scrape details using the requests package, or any package that does not support JavaScript. To resolve this, we have +updated this package to utilise [Playwright](https://pypi.org/project/playwright/) when retrieving a list of packages for a given user. While we have +attempted to automate as much as possible, you might want to do some of the work manually. + +Playwright needs two commands to be run in order for it to function correctly: + +``` +playwright install +playwright install-deps +``` + +We have added an `auto_install` option to the main class so that you can instruct the package to do the install for you; this helps when installing the +package in a fully automated way, e.g. Puppet or similar. + ## Features - Retrieve a list of packages maintained by a specific PyPI user. @@ -116,11 +132,13 @@ print(package_details) A class to fetch and process package details for a given PyPI user. -##### `__init__(self, username: str)` +##### `__init__(self, username: str, verbose: bool, auto_install: bool)` - Initializes the `PyPiExtractor` with a username. - Parameters: - `username` (str): The PyPI username. + - `verbose` (bool): Verbose output (Default: False) + - `auto_install` (bool): Auto install Playwright dependencies (Default: False) - Raises: - `PyPiExtractorError`: If the username is not provided. @@ -132,6 +150,14 @@ A class to fetch and process package details for a given PyPI user. - Raises: - `PyPiExtractorError`: If the username is not provided. +##### `enable_verbose(self)` + +- Enable verbose mode. + +##### `enable_auto_install(self)` + +- Enable auto install. + ##### `get_user_packages(self) -> list` - Fetches the list of packages for the given PyPI user. 
diff --git a/requirements.txt b/requirements.txt index 74c270f..d88d7e0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ requests==2.32.3 beautifulsoup4==4.12.3 +playwright==1.49.1 diff --git a/setup.py b/setup.py index 869e7b7..2f3cafe 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name='wolfsoftware.pypi-extractor', - version='0.1.2', + version='0.1.3', author='Wolf Software', author_email='pypi@wolfsoftware.com', description='Extract package information for a given user in PyPi.', diff --git a/tests/testconf.py b/tests/conftest.py similarity index 69% rename from tests/testconf.py rename to tests/conftest.py index 5404db8..ab24880 100644 --- a/tests/testconf.py +++ b/tests/conftest.py @@ -16,32 +16,56 @@ import requests +def raise_error(*args, **kwargs): + """Raise an error if the real playwright gets used.""" + raise RuntimeError("Real Playwright should not be invoked!") + + @pytest.fixture -def mock_get_user_packages_success() -> Generator[Union[MagicMock, AsyncMock], Any, None]: - """Fixture to mock requests.get for get_user_packages success case.""" - with patch('requests.get') as mock_get: - mock_response = Mock() - mock_response.raise_for_status.return_value = None - mock_response.text = ''' - -

Package1

-

Description1

-
- -

Package2

-

Description2

-
- ''' - mock_get.return_value = mock_response - yield mock_get +def mock_playwright() -> Generator[MagicMock, None, None]: + """Mock the Playwright sync API.""" + with patch('wolfsoftware.pypi_extractor.pypi.sync_playwright') as mock_sync_playwright: + mock_playwright_instance = MagicMock() + mock_browser = MagicMock() + mock_context = MagicMock() + mock_page = MagicMock() + + # Mock page.goto() and page.wait_for_selector() + mock_page.goto.return_value = None + mock_page.wait_for_selector.return_value = None + + # Mock page.query_selector_all() to return simulated package elements + def mock_query_selector_all(selector): + """Handle mocking the right data.""" + if selector == 'a.package-snippet': + return [ + MagicMock(query_selector=MagicMock(side_effect=[ + MagicMock(inner_text=MagicMock(return_value="Package1")), + MagicMock(inner_text=MagicMock(return_value="Description1")), + ])), + MagicMock(query_selector=MagicMock(side_effect=[ + MagicMock(inner_text=MagicMock(return_value="Package2")), + MagicMock(inner_text=MagicMock(return_value="Description2")), + ])), + ] + return [] + mock_page.query_selector_all.side_effect = mock_query_selector_all + + mock_context.new_page.return_value = mock_page + mock_browser.new_context.return_value = mock_context + mock_playwright_instance.chromium.launch.return_value = mock_browser + mock_sync_playwright.return_value.__enter__.return_value = mock_playwright_instance + yield mock_sync_playwright @pytest.fixture -def mock_get_user_packages_error() -> Generator[Union[MagicMock, AsyncMock], Any, None]: - """Fixture to mock requests.get for get_user_packages error case.""" - with patch('requests.get') as mock_get: - mock_get.side_effect = requests.RequestException("Request error") - yield mock_get +def mock_playwright_error() -> Generator[MagicMock, None, None]: + """Fixture to mock Playwright with an error scenario.""" + with patch('wolfsoftware.pypi_extractor.pypi.sync_playwright') as mock_sync_playwright: + 
mock_playwright_instance = MagicMock() + mock_playwright_instance.chromium.launch.side_effect = Exception("Playwright error") + mock_sync_playwright.return_value.__enter__.return_value = mock_playwright_instance + yield mock_sync_playwright @pytest.fixture @@ -155,24 +179,13 @@ def mock_get_package_details_error() -> Generator[Union[MagicMock, AsyncMock], A @pytest.fixture -def mock_get_all_packages_details_success() -> Generator[Union[MagicMock, AsyncMock], Any, None]: - """Fixture to mock requests.get for get_all_packages_details success case.""" +def mock_get_all_packages_details_success() -> Generator[MagicMock, None, None]: + """Mock requests.get for get_all_packages_details success case.""" with patch('requests.get') as mock_get: - mock_response_user = Mock() + # Mock response for the user packages API + mock_response_user = MagicMock() mock_response_user.raise_for_status.return_value = None - mock_response_user.text = ''' - -

Package1

-

Description1

-
- -

Package2

-

Description2

-
- ''' - mock_response_package1 = Mock() - mock_response_package1.raise_for_status.return_value = None - mock_response_package1.json.return_value = { + mock_response_user.json.return_value = { 'info': { 'name': 'Package1', 'version': '1.0.0', @@ -186,37 +199,30 @@ def mock_get_all_packages_details_success() -> Generator[Union[MagicMock, AsyncM 'requires_python': '>=3.6', }, 'releases': { - '0.9.0': [ - { - 'upload_time': '2021-01-01T00:00:00', - 'upload_time_iso_8601': '2021-01-01T00:00:00Z', - 'python_version': 'py3', - 'url': 'https://example.com', - 'filename': 'package-0.9.0.tar.gz', - 'packagetype': 'sdist', - 'md5_digest': 'abc123', - 'digests': {'sha256': 'def456'}, - 'size': 12345 - } - ], '1.0.0': [ { 'upload_time': '2021-06-01T00:00:00', 'upload_time_iso_8601': '2021-06-01T00:00:00Z', 'python_version': 'py3', - 'url': 'https://example.com', + 'url': 'https://example.com/package-1.0.0.tar.gz', 'filename': 'package-1.0.0.tar.gz', 'packagetype': 'sdist', - 'md5_digest': 'ghi789', - 'digests': {'sha256': 'jkl012'}, - 'size': 23456 + 'md5_digest': 'abc123', + 'digests': {'sha256': 'def456'}, + 'size': 12345 } - ], + ] }, 'requires_dist': ['requests', 'beautifulsoup4'], 'urls': [{'url': 'https://example.com/package-1.0.0.tar.gz'}], } - mock_response_package2 = Mock() + + # Simulate two different package details responses + mock_response_package1 = MagicMock() + mock_response_package1.raise_for_status.return_value = None + mock_response_package1.json.return_value = mock_response_user.json.return_value + + mock_response_package2 = MagicMock() mock_response_package2.raise_for_status.return_value = None mock_response_package2.json.return_value = { 'info': { @@ -226,41 +232,30 @@ def mock_get_all_packages_details_success() -> Generator[Union[MagicMock, AsyncM 'author': 'Author2', 'author_email': 'author2@example.com', 'license': 'MIT', - 'home_page': 'https://example.com', - 'keywords': 'example, package', + 'home_page': 'https://example.com/package2', + 'keywords': 
'example, package2', 'classifiers': ['Development Status :: 5 - Production/Stable'], 'requires_python': '>=3.6', }, 'releases': { - '1.0.0': [ - { - 'upload_time': '2021-01-01T00:00:00', - 'upload_time_iso_8601': '2021-01-01T00:00:00Z', - 'python_version': 'py3', - 'url': 'https://example.com', - 'filename': 'package-1.0.0.tar.gz', - 'packagetype': 'sdist', - 'md5_digest': 'abc123', - 'digests': {'sha256': 'def456'}, - 'size': 12345 - } - ], '2.0.0': [ { - 'upload_time': '2021-06-01T00:00:00', - 'upload_time_iso_8601': '2021-06-01T00:00:00Z', + 'upload_time': '2022-06-01T00:00:00', + 'upload_time_iso_8601': '2022-06-01T00:00:00Z', 'python_version': 'py3', - 'url': 'https://example.com', + 'url': 'https://example.com/package-2.0.0.tar.gz', 'filename': 'package-2.0.0.tar.gz', 'packagetype': 'sdist', 'md5_digest': 'ghi789', 'digests': {'sha256': 'jkl012'}, 'size': 23456 } - ], + ] }, 'requires_dist': ['requests', 'beautifulsoup4'], 'urls': [{'url': 'https://example.com/package-2.0.0.tar.gz'}], } - mock_get.side_effect = [mock_response_user, mock_response_package1, mock_response_package2] + + # Simulate the sequence of requests + mock_get.side_effect = [mock_response_package1, mock_response_package2] yield mock_get diff --git a/tests/test_pypi_extractor.py b/tests/test_pypi_extractor.py index d4b9aa0..b83a66c 100644 --- a/tests/test_pypi_extractor.py +++ b/tests/test_pypi_extractor.py @@ -10,13 +10,6 @@ import pytest from wolfsoftware.pypi_extractor import PyPiExtractor, PyPiExtractorError # pylint: disable=unused-import, no-name-in-module -from .testconf import ( # noqa: F401 pylint: disable=unused-import - mock_get_user_packages_success, - mock_get_user_packages_error, - mock_get_package_details_success, - mock_get_package_details_error, - mock_get_all_packages_details_success -) def test_version() -> None: @@ -78,16 +71,11 @@ def test_set_username_with_invalid_value() -> None: pypi_info.set_username("") -def 
test_get_user_packages_success(mock_get_user_packages_success) -> None: # noqa: F811 pylint: disable=redefined-outer-name, unused-argument - """ - Test get_user_packages method for a successful case. - - This test uses the mock_get_user_packages_success fixture to mock requests.get method - to return a successful response and verifies that the get_user_packages method returns - the expected list of packages. - """ - pypi_info = PyPiExtractor("testuser") - packages: List = pypi_info.get_user_packages() +@pytest.mark.usefixtures("mock_playwright") +def test_get_user_packages_success() -> None: + """Test the get_user_packages method for a successful case.""" + pypi_extractor = PyPiExtractor("testuser") + packages: List[Dict[str, str]] = pypi_extractor.get_user_packages() assert len(packages) == 2 # nosec: B101 assert packages[0]['name'] == "Package1" # nosec: B101 @@ -96,20 +84,16 @@ def test_get_user_packages_success(mock_get_user_packages_success) -> None: # n assert packages[1]['summary'] == "Description2" # nosec: B101 -def test_get_user_packages_error(mock_get_user_packages_error) -> None: # noqa: F811 pylint: disable=redefined-outer-name, unused-argument - """ - Test get_user_packages method when there is an error. - - This test uses the mock_get_user_packages_error fixture to mock requests.get method - to raise an exception and verifies that the get_user_packages method raises a PyPiExtractorError. 
- """ - pypi_info = PyPiExtractor("testuser") - - with pytest.raises(PyPiExtractorError, match="Error fetching user profile: Request error"): - pypi_info.get_user_packages() +@pytest.mark.usefixtures("mock_playwright_error") +def test_get_user_packages_error() -> None: + """Test the get_user_packages method when Playwright fails.""" + pypi_extractor = PyPiExtractor("testuser") + with pytest.raises(PyPiExtractorError, match="Error fetching user profile with Playwright"): + pypi_extractor.get_user_packages() -def test_get_package_details_success(mock_get_package_details_success) -> None: # noqa: F811 pylint: disable=redefined-outer-name, unused-argument +@pytest.mark.usefixtures("mock_get_package_details_success") +def test_get_package_details_success() -> None: """ Test get_package_details method for a successful case. @@ -136,7 +120,8 @@ def test_get_package_details_success(mock_get_package_details_success) -> None: assert details['older_versions'][0]['version'] == "0.9.0" # nosec: B101 -def test_get_package_details_error(mock_get_package_details_error) -> None: # noqa: F811 pylint: disable=redefined-outer-name, unused-argument +@pytest.mark.usefixtures("mock_get_package_details_error") +def test_get_package_details_error() -> None: """ Test get_package_details method when there is an error. @@ -149,7 +134,8 @@ def test_get_package_details_error(mock_get_package_details_error) -> None: # n pypi_info.get_package_details("Package1") -def test_get_all_packages_details_success(mock_get_all_packages_details_success) -> None: # noqa: F811 pylint: disable=redefined-outer-name, unused-argument +@pytest.mark.usefixtures("mock_playwright", "mock_get_all_packages_details_success") +def test_get_all_packages_details_success() -> None: """ Test get_all_packages_details method for a successful case. 
diff --git a/wolfsoftware/pypi_extractor/pypi.py b/wolfsoftware/pypi_extractor/pypi.py index 1b07b4b..0882e83 100644 --- a/wolfsoftware/pypi_extractor/pypi.py +++ b/wolfsoftware/pypi_extractor/pypi.py @@ -7,8 +7,11 @@ from typing import Any, Dict, List, Optional import json +import subprocess # nosec: B404 + import requests -from bs4 import BeautifulSoup + +from playwright.sync_api import sync_playwright from .exceptions import PyPiExtractorError @@ -21,7 +24,7 @@ class PyPiExtractor: username (Optional[str]): The PyPI username whose packages are to be fetched. """ - def __init__(self, username: Optional[str] = None) -> None: + def __init__(self, username: Optional[str] = None, verbose: Optional[bool] = False, auto_install: Optional[bool] = False) -> None: """ Initialize the PyPIPackageInfo. The username can be set during initialization or later using the set_username method. @@ -29,6 +32,8 @@ def __init__(self, username: Optional[str] = None) -> None: username (Optional[str]): The PyPI username. Default is None. 
""" self.username: Optional[str] = username + self.verbose: Optional[bool] = verbose + self.auto_install: Optional[bool] = auto_install def set_username(self, username: str) -> None: """ @@ -44,6 +49,31 @@ def set_username(self, username: str) -> None: raise PyPiExtractorError("Username must be provided") self.username = username + def enable_verbose(self) -> None: + """Enable verbose output.""" + self.verbose = True + + def enable_auto_install(self) -> None: + """Enable auto_install.""" + self.auto_install = True + + def ensure_playwright_browsers_and_deps(self) -> None: + """Ensure Playwright browsers and system dependencies are installed silently.""" + if self.auto_install: + try: + # Install Playwright browsers silently + subprocess.run(["playwright", "install"], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # nosec: B603 B607 + if self.verbose: + print("Playwright browsers installed successfully.") + + # Install system-level dependencies silently (Linux only) + subprocess.run(["playwright", "install-deps"], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # nosec: B603 B607 + if self.verbose: + print("System dependencies installed successfully.") + except subprocess.CalledProcessError as e: + print(f"Error during Playwright setup: {e}") + raise + def get_user_packages(self) -> List[Dict[str, str]]: """ Fetch the list of packages for the given PyPI user. @@ -52,27 +82,34 @@ def get_user_packages(self) -> List[Dict[str, str]]: list: A list of dictionaries containing package names and summaries. Raises: - PyPIPackageInfoError: If the username is not set or if there is an error fetching or parsing the user profile. + PyPiExtractorError: If the username is not set or if there is an error fetching or parsing the user profile. 
""" if not self.username: raise PyPiExtractorError("Username must be set before fetching packages") profile_url: str = "https://pypi.org/user/" + self.username + "/" + packages: List[Dict[str, str]] = [] + try: - response: requests.Response = requests.get(profile_url, timeout=10) - response.raise_for_status() - except requests.RequestException as e: - raise PyPiExtractorError(f"Error fetching user profile: {e}") from e + self.ensure_playwright_browsers_and_deps() - soup = BeautifulSoup(response.text, 'html.parser') - packages: List[Dict[str, str]] = [] - for project in soup.find_all('a', class_='package-snippet'): - try: - package_name: str = project.find('h3', class_='package-snippet__title').text.strip() - summary: str = project.find('p', class_='package-snippet__description').text.strip() - packages.append({'name': package_name, 'summary': summary}) - except AttributeError as e: - raise PyPiExtractorError(f"Error parsing package details: {e}") from e + with sync_playwright() as p: + browser: Any = p.chromium.launch(headless=True) + context: Any = browser.new_context() + page: Any = context.new_page() + + page.goto(profile_url) + page.wait_for_selector('.package-snippet') + + elements: Any = page.query_selector_all('a.package-snippet') + for element in elements: + package_name: Any = element.query_selector('h3.package-snippet__title').inner_text().strip() + summary: Any = element.query_selector('p.package-snippet__description').inner_text().strip() + packages.append({'name': package_name, 'summary': summary}) + + browser.close() + except Exception as e: + raise PyPiExtractorError(f"Error fetching user profile with Playwright: {e}") from e return packages