From 6cf67ba89203a9925c5d9044d49d0b0b9a6c65aa Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 21 Apr 2025 17:48:58 +0000 Subject: [PATCH 01/30] init sitemap --- src/crawlee/_utils/sitemap.py | 162 ++++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 src/crawlee/_utils/sitemap.py diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py new file mode 100644 index 0000000000..d602a5eb2b --- /dev/null +++ b/src/crawlee/_utils/sitemap.py @@ -0,0 +1,162 @@ +from __future__ import annotations + +from contextlib import suppress +from dataclasses import dataclass +from datetime import datetime +from logging import getLogger +from typing import TYPE_CHECKING, Literal, TypedDict +from xml.sax.handler import ContentHandler + +from typing_extensions import override +from yarl import URL + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + from xml.sax.xmlreader import AttributesImpl + +logger = getLogger(__name__) + + +@dataclass() +class SitemapUrl: + loc: str + lastmod: datetime | None + changefreq: Literal['always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'] | None + priority: float | None + origin_sitemap_url: str | None + + +@dataclass() +class NestedSitemap: + loc: str + origin_sitemap_url: str | None + + +class ParseSitemapOptions(TypedDict, total=False): + emit_nested_sitemaps: bool + max_depth: int + sitemap_retries: int + timeout: float | None + + +class SitemapSource(TypedDict, total=False): + type: Literal['url', 'raw'] + url: str + content: str + depth: int + + +class SitemapItem(TypedDict, total=False): + type: Literal['url', 'sitemap_url'] + loc: str + url: str + lastmod: datetime + changefreq: str + priority: float + + +class SitemapHandler(ContentHandler): + def __init__(self) -> None: + super().__init__() + self.root_tag_name: str | None = None + self.current_tag: str | None = None + self.current_url: SitemapItem = {} + self.buffer: str = '' + self.items: 
list[SitemapItem] = [] + + @override + def startElement(self, name: str, attrs: AttributesImpl) -> None: + if self.root_tag_name is None and name in ('urlset', 'sitemapindex'): + self.root_tag_name = name + + if name in ('loc', 'lastmod', 'changefreq', 'priority'): + self.current_tag = name + self.buffer = '' + + def characters(self, content: str) -> None: + if self.current_tag: + self.buffer += content + + @override + def endElement(self, name: str) -> None: + changefreq_atr = ('always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never') + if name == self.current_tag: + text = self.buffer.strip() + + if name == 'loc': + if self.root_tag_name == 'sitemapindex': + self.items.append({'type': 'sitemap_url', 'url': text}) + else: + self.current_url['loc'] = text + + elif name == 'lastmod' and text: + with suppress(ValueError): + self.current_url['lastmod'] = datetime.fromisoformat(text.replace('Z', '+00:00')) + + elif name == 'priority' and text: + with suppress(ValueError): + self.current_url['priority'] = float(text) + + elif name == 'changefreq' and (text and text in changefreq_atr): + self.current_url['changefreq'] = text + + self.current_tag = None + + if name == 'url' and 'loc' in self.current_url: + self.items.append({'type': 'url', **self.current_url}) + self.current_url = {} + + +class Sitemap: + def __init__(self, urls: list[str]) -> None: + self.urls = urls + + @classmethod + async def try_common_names(cls, url: str, proxy_url: str | None = None) -> Sitemap: + base_url = URL(url) + + sitemap_urls = [str(base_url.with_path('/sitemap.xml')), str(base_url.with_path('/sitemap.txt'))] + + return await cls.load(sitemap_urls, proxy_url) + + @classmethod + async def load( + cls, + urls: str | list[str], + proxy_url: str | None = None, + parse_sitemap_options: ParseSitemapOptions | None = None, + ) -> Sitemap: + if isinstance(urls, str): + urls = [urls] + + return await cls.parse( + [{'type': 'url', 'url': url} for url in urls], + proxy_url, + 
parse_sitemap_options, + ) + + @classmethod + async def from_xml_string(cls, content: str, proxy_url: str | None = None) -> Sitemap: + return await cls.parse([{'type': 'raw', 'content': content}], proxy_url) + + @classmethod + async def parse( + cls, + sources: list[SitemapSource], + proxy_url: str | None = None, + parse_sitemap_options: ParseSitemapOptions | None = None, + ) -> Sitemap: + urls: list[str] = [] + + urls = [item.loc async for item in parse_sitemap(sources, proxy_url, parse_sitemap_options)] + + return cls(urls) + + +# mypy: ignore-errors +async def parse_sitemap( + sources: list[SitemapSource], + proxy_url: str | None = None, + parse_sitemap_options: ParseSitemapOptions | None = None, +) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]: + raise NotImplementedError('This is a stub') From c96572a8ab59830e7d48b1c61f0ff82997da0e52 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 22 Apr 2025 23:40:01 +0000 Subject: [PATCH 02/30] implementation --- src/crawlee/_utils/sitemap.py | 414 +++++++++++++++++++++++++++++----- 1 file changed, 357 insertions(+), 57 deletions(-) diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py index d602a5eb2b..35394eea9d 100644 --- a/src/crawlee/_utils/sitemap.py +++ b/src/crawlee/_utils/sitemap.py @@ -1,13 +1,18 @@ from __future__ import annotations +import asyncio +import zlib from contextlib import suppress from dataclasses import dataclass from datetime import datetime +from hashlib import sha256 from logging import getLogger from typing import TYPE_CHECKING, Literal, TypedDict +from xml.sax.expatreader import ExpatParser from xml.sax.handler import ContentHandler -from typing_extensions import override +import httpx +from typing_extensions import NotRequired, override from yarl import URL if TYPE_CHECKING: @@ -16,20 +21,22 @@ logger = getLogger(__name__) +VALID_CHANGE_FREQS = {'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'} + @dataclass() class SitemapUrl: loc: str - 
lastmod: datetime | None - changefreq: Literal['always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'] | None - priority: float | None - origin_sitemap_url: str | None + lastmod: datetime | None = None + changefreq: str | None = None + priority: float | None = None + origin_sitemap_url: str | None = None @dataclass() class NestedSitemap: loc: str - origin_sitemap_url: str | None + origin_sitemap_url: str | None = None class ParseSitemapOptions(TypedDict, total=False): @@ -39,84 +46,336 @@ class ParseSitemapOptions(TypedDict, total=False): timeout: float | None -class SitemapSource(TypedDict, total=False): +class _SitemapSource(TypedDict): type: Literal['url', 'raw'] - url: str - content: str - depth: int + url: NotRequired[str] + content: NotRequired[str] + depth: NotRequired[int] -class SitemapItem(TypedDict, total=False): +class _SitemapItem(TypedDict, total=False): type: Literal['url', 'sitemap_url'] loc: str url: str - lastmod: datetime - changefreq: str - priority: float + lastmod: datetime | None + changefreq: str | None + priority: float | None -class SitemapHandler(ContentHandler): +class _XMLSaxSitemapHandler(ContentHandler): def __init__(self) -> None: super().__init__() - self.root_tag_name: str | None = None - self.current_tag: str | None = None - self.current_url: SitemapItem = {} - self.buffer: str = '' - self.items: list[SitemapItem] = [] + self._root_tag_name: str | None = None + self._current_tag: str | None = None + self._current_url: _SitemapItem = {} + self._buffer: str = '' + self._items: list[_SitemapItem] = [] + + @property + def items(self) -> list[_SitemapItem]: + return self._items @override def startElement(self, name: str, attrs: AttributesImpl) -> None: - if self.root_tag_name is None and name in ('urlset', 'sitemapindex'): - self.root_tag_name = name + if self._root_tag_name is None and name in ('urlset', 'sitemapindex'): + self._root_tag_name = name if name in ('loc', 'lastmod', 'changefreq', 'priority'): - 
self.current_tag = name - self.buffer = '' + self._current_tag = name + self._buffer = '' def characters(self, content: str) -> None: - if self.current_tag: - self.buffer += content + if self._current_tag: + self._buffer += content @override def endElement(self, name: str) -> None: - changefreq_atr = ('always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never') - if name == self.current_tag: - text = self.buffer.strip() + if name == self._current_tag: + text = self._buffer.strip() if name == 'loc': - if self.root_tag_name == 'sitemapindex': - self.items.append({'type': 'sitemap_url', 'url': text}) + if self._root_tag_name == 'sitemapindex': + self._items.append({'type': 'sitemap_url', 'url': text}) else: - self.current_url['loc'] = text + self._current_url['loc'] = text elif name == 'lastmod' and text: with suppress(ValueError): - self.current_url['lastmod'] = datetime.fromisoformat(text.replace('Z', '+00:00')) + self._current_url['lastmod'] = datetime.fromisoformat(text.replace('Z', '+00:00')) elif name == 'priority' and text: with suppress(ValueError): - self.current_url['priority'] = float(text) + self._current_url['priority'] = float(text) - elif name == 'changefreq' and (text and text in changefreq_atr): - self.current_url['changefreq'] = text + elif name == 'changefreq' and text in VALID_CHANGE_FREQS: + self._current_url['changefreq'] = text self.current_tag = None - if name == 'url' and 'loc' in self.current_url: - self.items.append({'type': 'url', **self.current_url}) - self.current_url = {} + if name == 'url' and 'loc' in self._current_url: + self.items.append({'type': 'url', **self._current_url}) + self._current_url = {} + + +class _TxtSitemapParser: + """Parser for plaintext sitemaps that processes data as a stream.""" + + def __init__(self) -> None: + self._buffer = '' + + async def process_chunk(self, chunk: str) -> AsyncGenerator[_SitemapItem, None]: + """Process a chunk of text data and yield items one by one.""" + self._buffer += chunk + + # 
Process complete lines + if '\n' in self._buffer: + lines = self._buffer.split('\n') + # Last element might be incomplete, save for next chunk + self._buffer = lines.pop() + + for line in lines: + url = line.strip() + if url: + yield {'type': 'url', 'loc': url} + + async def flush(self) -> AsyncGenerator[_SitemapItem, None]: + """Process any remaining data in the buffer, yielding items one by one.""" + if self._buffer: + url = self._buffer.strip() + if url: + yield {'type': 'url', 'loc': url} + self.buffer = '' + + def close(self) -> None: + """Clean up resources.""" + self._buffer = '' + + +class _XmlSitemapParser: + """Parser for XML sitemaps using SAX to process data as a stream.""" + + def __init__(self) -> None: + self._parser = ExpatParser() + self._handler = _XMLSaxSitemapHandler() + self._parser.setContentHandler(self._handler) + + async def process_chunk(self, chunk: str) -> AsyncGenerator[_SitemapItem, None]: + """Process a chunk of XML data and yield items one by one.""" + try: + self._parser.feed(chunk) + + # If we get here, the XML was valid and complete + for item in self._handler.items: + yield item + + self._handler.items.clear() + + except Exception as e: + logger.warning(f'Failed to parse XML data chunk: {e}') + + async def flush(self) -> AsyncGenerator[_SitemapItem, None]: + """Process any remaining data in the buffer, yielding items one by one.""" + try: + self._parser.flush() + + for item in self._handler.items: + yield item + + self._handler.items.clear() + + except Exception as e: + logger.warning(f'Failed to parse remaining XML data: {e}') + + def close(self) -> None: + """Clean up resources.""" + self._parser.close() + + +def _get_parser(content_type: str = '', url: str | None = None) -> _XmlSitemapParser | _TxtSitemapParser: + """Create appropriate parser based on content type and URL.""" + if 'text/plain' in content_type.lower() or (url and url.endswith('.txt')): + return _TxtSitemapParser() + # Default to XML parser for most cases + 
return _XmlSitemapParser() + + +def _get_origin_url(source: _SitemapSource) -> str: + """Determine the origin URL for a sitemap source.""" + if source['type'] == 'url' and 'url' in source: + return source['url'] + if source['type'] == 'raw' and 'content' in source: + # For raw content sources, create a consistent identifier + return f'raw://{sha256(source["content"].encode()).hexdigest()}' + return '' + + +async def _process_sitemap_item( + item: _SitemapItem, + source: _SitemapSource, + depth: int, + visited_sitemap_urls: set[str], + sources: list[_SitemapSource], + *, + emit_nested_sitemaps: bool, +) -> AsyncGenerator[SitemapUrl | NestedSitemap | None, None]: + """Process a sitemap item and yield appropriate results.""" + item_copy = item.copy() # Work with a copy to avoid modifying the original + + if 'type' not in item_copy: + return + + item_type = item_copy.pop('type') + + # Handle sitemap URL references (nested sitemaps) + if item_type == 'sitemap_url' and 'url' in item_copy: + sitemap_url = item_copy['url'] + if sitemap_url and sitemap_url not in visited_sitemap_urls: + # Add to processing queue + sources.append(_SitemapSource(type='url', url=sitemap_url, depth=depth + 1)) + + # Output the nested sitemap reference if requested + if emit_nested_sitemaps: + yield NestedSitemap(loc=sitemap_url, origin_sitemap_url=None) + + # Handle individual URL entries + elif item_type == 'url' and 'loc' in item_copy: + # Determine the origin sitemap URL for tracking purposes + origin_url = _get_origin_url(source) + + # Create and yield the sitemap URL object + yield SitemapUrl( + loc=item_copy['loc'], + lastmod=item_copy.get('lastmod'), + changefreq=item_copy.get('changefreq'), + priority=item_copy.get('priority'), + origin_sitemap_url=origin_url, + ) + + +async def _process_raw_source( + source: _SitemapSource, + depth: int, + visited_sitemap_urls: set[str], + sources: list[_SitemapSource], + *, + emit_nested_sitemaps: bool, +) -> AsyncGenerator[SitemapUrl | NestedSitemap, 
None]: + """Process a raw content sitemap source.""" + if 'content' not in source: + logger.warning(f'Raw source missing content: {source}') + return + + content = source['content'] + parser = _get_parser('text/xml') + + try: + # Process the content + async for item in parser.process_chunk(content): + async for result in _process_sitemap_item( + item, source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps + ): + if result: + yield result + + # Process any remaining content + async for item in parser.flush(): + async for result in _process_sitemap_item( + item, source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps + ): + if result: + yield result + except Exception as e: + logger.warning(f'Failed to parse raw sitemap content: {e}') + finally: + parser.close() + + +async def _fetch_and_process_sitemap( + client: httpx.AsyncClient, + source: _SitemapSource, + depth: int, + visited_sitemap_urls: set[str], + sources: list[_SitemapSource], + retries_left: int, + *, + emit_nested_sitemaps: bool, +) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]: + """Fetch a sitemap from a URL and process its content.""" + if 'url' not in source: + return + + sitemap_url = source['url'] + + try: + while retries_left > 0: + retries_left -= 1 + async with client.stream('GET', sitemap_url) as response: + response.raise_for_status() + + # Determine content type and compression + content_type = response.headers.get('content-type', '') + is_gzipped = ( + 'application/gzip' in content_type + or 'application/x-gzip' in content_type + or sitemap_url.endswith('.gz') + ) + + # Create appropriate parser + parser = _get_parser(content_type, sitemap_url) + decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16) if is_gzipped else None + + try: + # Process chunks as they arrive + async for raw_chunk in response.aiter_bytes(chunk_size=8192): + chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk + + text_chunk = 
chunk.decode('utf-8', errors='replace') + async for item in parser.process_chunk(text_chunk): + async for result in _process_sitemap_item( + item, + source, + depth, + visited_sitemap_urls, + sources, + emit_nested_sitemaps=emit_nested_sitemaps, + ): + if result: + yield result + + # Process any remaining content + async for item in parser.flush(): + async for result in _process_sitemap_item( + item, + source, + depth, + visited_sitemap_urls, + sources, + emit_nested_sitemaps=emit_nested_sitemaps, + ): + if result: + yield result + finally: + parser.close() + break + + except Exception as e: + if retries_left > 0: + logger.warning(f'Error fetching sitemap {sitemap_url}: {e}. Retries left: {retries_left}') + await asyncio.sleep(1) # Brief pause before retry class Sitemap: def __init__(self, urls: list[str]) -> None: - self.urls = urls + self._urls = urls + + @property + def urls(self) -> list[str]: + return self._urls @classmethod async def try_common_names(cls, url: str, proxy_url: str | None = None) -> Sitemap: base_url = URL(url) - sitemap_urls = [str(base_url.with_path('/sitemap.xml')), str(base_url.with_path('/sitemap.txt'))] - return await cls.load(sitemap_urls, proxy_url) @classmethod @@ -128,35 +387,76 @@ async def load( ) -> Sitemap: if isinstance(urls, str): urls = [urls] - - return await cls.parse( - [{'type': 'url', 'url': url} for url in urls], - proxy_url, - parse_sitemap_options, - ) + return await cls.parse([_SitemapSource(type='url', url=url) for url in urls], proxy_url, parse_sitemap_options) @classmethod async def from_xml_string(cls, content: str, proxy_url: str | None = None) -> Sitemap: - return await cls.parse([{'type': 'raw', 'content': content}], proxy_url) + return await cls.parse([_SitemapSource(type='raw', content=content)], proxy_url) @classmethod async def parse( cls, - sources: list[SitemapSource], + sources: list[_SitemapSource], proxy_url: str | None = None, parse_sitemap_options: ParseSitemapOptions | None = None, ) -> Sitemap: - 
urls: list[str] = [] - urls = [item.loc async for item in parse_sitemap(sources, proxy_url, parse_sitemap_options)] - return cls(urls) -# mypy: ignore-errors async def parse_sitemap( - sources: list[SitemapSource], + initial_sources: list[_SitemapSource], proxy_url: str | None = None, - parse_sitemap_options: ParseSitemapOptions | None = None, + options: ParseSitemapOptions | None = None, ) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]: - raise NotImplementedError('This is a stub') + """Parse sitemap(s) and yield URLs found in them. + + This function coordinates the process of fetching and parsing sitemaps, + handling both URL-based and raw content sources. It follows nested sitemaps + up to the specified maximum depth. + """ + # Set default options + options = options or {} + emit_nested_sitemaps = options.get('emit_nested_sitemaps', False) + max_depth = options.get('max_depth', float('inf')) + sitemap_retries = options.get('sitemap_retries', 3) + timeout = options.get('timeout', 30) + + # Setup working state + sources = list(initial_sources) + visited_sitemap_urls: set[str] = set() + + # Process sources until the queue is empty + while sources: + source = sources.pop(0) + depth = source.get('depth', 0) + + # Skip if we've reached max depth + if depth > max_depth: + logger.debug(f'Skipping sitemap {source.get("url", "")} - exceeded max depth {max_depth}') + continue + + # Process based on source type + if source['type'] == 'raw': + async for result in _process_raw_source( + source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps + ): + yield result + + elif source['type'] == 'url' and 'url' in source: + # Add to visited set before processing to avoid duplicates + visited_sitemap_urls.add(source['url']) + + async with httpx.AsyncClient(timeout=httpx.Timeout(timeout), proxy=proxy_url) as client: + async for result in _fetch_and_process_sitemap( + client, + source, + depth, + visited_sitemap_urls, + sources, + sitemap_retries, + 
emit_nested_sitemaps=emit_nested_sitemaps, + ): + yield result + else: + logger.warning(f'Invalid source configuration: {source}') From fcbca2375085b8a726fbceac8c5d6bd9d28cb569 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 24 Apr 2025 22:21:40 +0000 Subject: [PATCH 03/30] optimization uvicorn paths --- src/crawlee/_utils/sitemap.py | 3 +- tests/unit/_utils/test_sitemap.py | 38 ++++++++ tests/unit/server.py | 140 ++++++++++++++++++------------ 3 files changed, 125 insertions(+), 56 deletions(-) create mode 100644 tests/unit/_utils/test_sitemap.py diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py index 35394eea9d..9574574cc4 100644 --- a/src/crawlee/_utils/sitemap.py +++ b/src/crawlee/_utils/sitemap.py @@ -22,6 +22,7 @@ logger = getLogger(__name__) VALID_CHANGE_FREQS = {'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'} +SITEMAP_HEADERS = {'accept': 'text/plain, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8'} @dataclass() @@ -309,7 +310,7 @@ async def _fetch_and_process_sitemap( try: while retries_left > 0: retries_left -= 1 - async with client.stream('GET', sitemap_url) as response: + async with client.stream('GET', sitemap_url, headers=SITEMAP_HEADERS) as response: response.raise_for_status() # Determine content type and compression diff --git a/tests/unit/_utils/test_sitemap.py b/tests/unit/_utils/test_sitemap.py new file mode 100644 index 0000000000..a081a26b47 --- /dev/null +++ b/tests/unit/_utils/test_sitemap.py @@ -0,0 +1,38 @@ +from yarl import URL + +from crawlee._utils.sitemap import Sitemap + + +async def test_sitemap(server_url: URL) -> None: + sitemap_body = """ + + + +http://not-exists.com/ +2005-02-03 +monthly +0.8 + + +http://not-exists.com/catalog?item=12&desc=vacation_hawaii +weekly + + +http://not-exists.com/catalog?item=73&desc=vacation_new_zealand +2004-12-23 +weekly + + +http://not-exists.com/catalog?item=74&desc=vacation_newfoundland +2004-12-23T18:00:15+00:00 +0.3 + + 
+http://not-exists.com/catalog?item=83&desc=vacation_usa +2004-11-23 + + +""".strip() + sitemap_url = (server_url / 'get_sitemap').with_query(content=sitemap_body) + sitemap = await Sitemap.load(str(sitemap_url)) + print(len(sitemap.urls)) diff --git a/tests/unit/server.py b/tests/unit/server.py index 21ba01cec8..3c510ae6d8 100644 --- a/tests/unit/server.py +++ b/tests/unit/server.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import gzip import json import threading import time @@ -23,9 +24,9 @@ if TYPE_CHECKING: from socket import socket - Receive = Callable[[], Awaitable[dict[str, Any]]] Send = Callable[[dict[str, Any]], Coroutine[None, None, None]] +PathHandler = Callable[[dict[str, Any], Receive, Send], Coroutine[None, None, None]] def get_headers_dict(scope: dict[str, Any]) -> dict[str, str]: @@ -92,55 +93,47 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None: send: The ASGI send function. """ assert scope['type'] == 'http' - path = scope['path'] - + paths: dict[str, PathHandler] = { + 'start_enqueue': start_enqueue_endpoint, + 'sub_index': secondary_index_endpoint, + 'incapsula': incapsula_endpoint, + 'page_1': generic_response_endpoint, + 'page_2': generic_response_endpoint, + 'page_3': generic_response_endpoint, + 'set_cookies': set_cookies, + 'set_complex_cookies': set_complex_cookies, + 'cookies': get_cookies, + 'status': echo_status, + 'headers': echo_headers, + 'user-agent': echo_user_agent, + 'get_sitemap.gz': get_sitemap_gz_endpoint, + 'get_sitemap': get_sitemap_endpoint, + 'get': get_echo, + 'post': post_echo, + 'dynamic_content': dynamic_content, + 'redirect': redirect_to_url, + 'json': hello_world_json, + 'xml': hello_world_xml, + 'robots.txt': robots_txt, + } + path_parts = URL(scope['path']).parts + path = path_parts[1] if len(path_parts) > 1 else path_parts[0] # Route requests to appropriate handlers - if path.startswith('/start_enqueue'): - await start_enqueue_endpoint(send) - elif 
path.startswith('/sub_index'): - await secondary_index_endpoint(send) - elif path.startswith('/incapsula'): - await incapsula_endpoint(send) - elif path.startswith(('/page_1', '/page_2', '/page_3')): - await generic_response_endpoint(send) - elif path.startswith('/set_cookies'): - await set_cookies(scope, send) - elif path.startswith('/set_complex_cookies'): - await set_complex_cookies(send) - elif path.startswith('/cookies'): - await get_cookies(scope, send) - elif path.startswith('/status/'): - await echo_status(scope, send) - elif path.startswith('/headers'): - await echo_headers(scope, send) - elif path.startswith('/user-agent'): - await echo_user_agent(scope, send) - elif path.startswith('/get'): - await get_echo(scope, send) - elif path.startswith('/post'): - await post_echo(scope, receive, send) - elif path.startswith('/dynamic_content'): - await dynamic_content(scope, send) - elif path.startswith('/redirect'): - await redirect_to_url(scope, send) - elif path.startswith('/json'): - await hello_world_json(send) - elif path.startswith('/xml'): - await hello_world_xml(send) - elif path.startswith('/robots.txt'): - await robots_txt(send) + if path in paths: + path_func = paths[path] + await path_func(scope, receive, send) else: - await hello_world(send) + await hello_world(scope, receive, send) -async def get_cookies(scope: dict[str, Any], send: Send) -> None: +async def get_cookies(scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Handle requests to retrieve cookies sent in the request.""" headers = get_headers_dict(scope) cookies = get_cookies_from_headers(headers) await send_json_response(send, {'cookies': cookies}) -async def set_cookies(scope: dict[str, Any], send: Send) -> None: +async def set_cookies(scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Handle requests to set cookies from query parameters and redirect.""" query_params = get_query_params(scope.get('query_string', b'')) @@ -165,7 +158,7 @@ async def 
set_cookies(scope: dict[str, Any], send: Send) -> None: await send({'type': 'http.response.body', 'body': b'Redirecting to get_cookies...'}) -async def hello_world(send: Send) -> None: +async def hello_world(_scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Handle basic requests with a simple HTML response.""" await send_html_response( send, @@ -173,7 +166,7 @@ async def hello_world(send: Send) -> None: ) -async def hello_world_json(send: Send) -> None: +async def hello_world_json(_scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Handle basic requests with a simple JSON response.""" await send_json_response( send, @@ -181,7 +174,7 @@ async def hello_world_json(send: Send) -> None: ) -async def hello_world_xml(send: Send) -> None: +async def hello_world_xml(_scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Handle basic requests with a simple XML response.""" await send_html_response( send, @@ -240,7 +233,7 @@ async def post_echo(scope: dict[str, Any], receive: Receive, send: Send) -> None await send_json_response(send, response) -async def echo_status(scope: dict[str, Any], send: Send) -> None: +async def echo_status(scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Echo the status code from the URL path.""" status_code = int(scope['path'].replace('/status/', '')) await send( @@ -253,13 +246,13 @@ async def echo_status(scope: dict[str, Any], send: Send) -> None: await send({'type': 'http.response.body', 'body': b''}) -async def echo_headers(scope: dict[str, Any], send: Send) -> None: +async def echo_headers(scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Echo back the request headers as JSON.""" headers = get_headers_dict(scope) await send_json_response(send, headers) -async def start_enqueue_endpoint(send: Send) -> None: +async def start_enqueue_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Handle requests for the main page with links.""" await 
send_html_response( send, @@ -267,7 +260,7 @@ async def start_enqueue_endpoint(send: Send) -> None: ) -async def secondary_index_endpoint(send: Send) -> None: +async def secondary_index_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Handle requests for the secondary page with links.""" await send_html_response( send, @@ -275,7 +268,7 @@ async def secondary_index_endpoint(send: Send) -> None: ) -async def incapsula_endpoint(send: Send) -> None: +async def incapsula_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Handle requests for a page with an incapsula iframe.""" await send_html_response( send, @@ -283,7 +276,7 @@ async def incapsula_endpoint(send: Send) -> None: ) -async def generic_response_endpoint(send: Send) -> None: +async def generic_response_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Handle requests with a generic HTML response.""" await send_html_response( send, @@ -291,7 +284,7 @@ async def generic_response_endpoint(send: Send) -> None: ) -async def redirect_to_url(scope: dict[str, Any], send: Send) -> None: +async def redirect_to_url(scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Handle requests that should redirect to a specified full URL.""" query_params = get_query_params(scope.get('query_string', b'')) @@ -311,14 +304,14 @@ async def redirect_to_url(scope: dict[str, Any], send: Send) -> None: await send({'type': 'http.response.body', 'body': f'Redirecting to {target_url}...'.encode()}) -async def echo_user_agent(scope: dict[str, Any], send: Send) -> None: +async def echo_user_agent(scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Echo back the user agent header as a response.""" headers = get_headers_dict(scope) user_agent = headers.get('user-agent', 'Not provided') await send_json_response(send, {'user-agent': user_agent}) -async def get_echo(scope: dict[str, Any], send: Send) -> None: +async def get_echo(scope: dict[str, 
Any], _receive: Receive, send: Send) -> None: """Echo back GET request details similar to httpbin.org/get.""" path = scope.get('path', '') query_string = scope.get('query_string', b'') @@ -343,7 +336,7 @@ async def get_echo(scope: dict[str, Any], send: Send) -> None: await send_json_response(send, response) -async def set_complex_cookies(send: Send) -> None: +async def set_complex_cookies(_scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Handle requests to set specific cookies with various attributes.""" headers = [ @@ -366,7 +359,7 @@ async def set_complex_cookies(send: Send) -> None: await send({'type': 'http.response.body', 'body': b'Cookies have been set!'}) -async def dynamic_content(scope: dict[str, Any], send: Send) -> None: +async def dynamic_content(scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Handle requests to serve HTML-page with dynamic content received in the request.""" query_params = get_query_params(scope.get('query_string', b'')) @@ -375,11 +368,48 @@ async def dynamic_content(scope: dict[str, Any], send: Send) -> None: await send_html_response(send, html_content=content.encode()) -async def robots_txt(send: Send) -> None: +async def robots_txt(_scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Handle requests for the robots.txt file.""" await send_html_response(send, ROBOTS_TXT) +async def get_sitemap_endpoint(scope: dict[str, Any], _receive: Receive, send: Send) -> None: + """Handle requests to serve XML sitemap content received in the request.""" + query_params = get_query_params(scope.get('query_string', b'')) + content = query_params.get('content', '') + + await send( + { + 'type': 'http.response.start', + 'status': 200, + 'headers': [[b'content-type', b'application/xml; charset=utf-8']], + } + ) + + await send({'type': 'http.response.body', 'body': content.encode()}) + + +async def get_sitemap_gz_endpoint(scope: dict[str, Any], _receive: Receive, send: Send) -> None: + """Handle requests 
to serve gzipped XML sitemap content received in the request.""" + query_params = get_query_params(scope.get('query_string', b'')) + content = query_params.get('content', '') + + # Compress the content using gzip + compressed_content = gzip.compress(content.encode()) + + await send( + { + 'type': 'http.response.start', + 'status': 200, + 'headers': [ + [b'content-type', b'application/xml; charset=utf-8'], + [b'content-encoding', b'gzip'], + ], + } + ) + await send({'type': 'http.response.body', 'body': compressed_content}) + + class TestServer(Server): """A test HTTP server implementation based on Uvicorn Server.""" From f0b089c42dcaea67dd9ef7fdee2c3ce92a069eaf Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Fri, 30 May 2025 12:13:55 +0000 Subject: [PATCH 04/30] add tests --- src/crawlee/_utils/sitemap.py | 17 ++- tests/unit/_utils/test_sitemap.py | 211 +++++++++++++++++++++++++++++- tests/unit/server.py | 37 ++---- 3 files changed, 224 insertions(+), 41 deletions(-) diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py index 9574574cc4..ec918ee0eb 100644 --- a/src/crawlee/_utils/sitemap.py +++ b/src/crawlee/_utils/sitemap.py @@ -194,7 +194,7 @@ def close(self) -> None: def _get_parser(content_type: str = '', url: str | None = None) -> _XmlSitemapParser | _TxtSitemapParser: """Create appropriate parser based on content type and URL.""" - if 'text/plain' in content_type.lower() or (url and url.endswith('.txt')): + if 'text/plain' in content_type.lower() or (url and URL(url).path.endswith('.txt')): return _TxtSitemapParser() # Default to XML parser for most cases return _XmlSitemapParser() @@ -315,21 +315,20 @@ async def _fetch_and_process_sitemap( # Determine content type and compression content_type = response.headers.get('content-type', '') - is_gzipped = ( - 'application/gzip' in content_type - or 'application/x-gzip' in content_type - or sitemap_url.endswith('.gz') - ) # Create appropriate parser parser = _get_parser(content_type, 
sitemap_url) - decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16) if is_gzipped else None - + decompressor = None try: # Process chunks as they arrive + first_chunk = True async for raw_chunk in response.aiter_bytes(chunk_size=8192): - chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk + # Check if the first chunk is a valid gzip header + if first_chunk and raw_chunk.startswith(b'\x1f\x8b'): + decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16) + first_chunk = False + chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk text_chunk = chunk.decode('utf-8', errors='replace') async for item in parser.process_chunk(text_chunk): async for result in _process_sitemap_item( diff --git a/tests/unit/_utils/test_sitemap.py b/tests/unit/_utils/test_sitemap.py index a081a26b47..98d9ba9019 100644 --- a/tests/unit/_utils/test_sitemap.py +++ b/tests/unit/_utils/test_sitemap.py @@ -1,10 +1,12 @@ -from yarl import URL +import base64 +import gzip +from datetime import datetime -from crawlee._utils.sitemap import Sitemap +from yarl import URL +from crawlee._utils.sitemap import Sitemap, SitemapUrl, parse_sitemap -async def test_sitemap(server_url: URL) -> None: - sitemap_body = """ +BASIC_SITEMAP = """ @@ -33,6 +35,203 @@ async def test_sitemap(server_url: URL) -> None: """.strip() - sitemap_url = (server_url / 'get_sitemap').with_query(content=sitemap_body) + +BASIC_RESULTS = { + 'http://not-exists.com/', + 'http://not-exists.com/catalog?item=12&desc=vacation_hawaii', + 'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand', + 'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland', + 'http://not-exists.com/catalog?item=83&desc=vacation_usa', +} + + +def compress_gzip(data: str) -> bytes: + """Compress a string using gzip.""" + return gzip.compress(data.encode()) + + +def encode_base64(data: bytes) -> str: + """Encode bytes to a base64 string.""" + return base64.b64encode(data).decode('utf-8') + + +async def 
test_sitemap(server_url: URL) -> None: + """Test loading a basic sitemap.""" + sitemap_url = (server_url / 'get_sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) sitemap = await Sitemap.load(str(sitemap_url)) - print(len(sitemap.urls)) + + assert len(sitemap.urls) == 5 + assert set(sitemap.urls) == BASIC_RESULTS + + +async def test_extract_metadata_sitemap(server_url: URL) -> None: + """Test extracting item metadata from a sitemap.""" + sitemap_url = (server_url / 'get_sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + + items = [item async for item in parse_sitemap([{'type': 'url', 'url': str(sitemap_url)}])] + assert len(items) == 5 + assert items[0] == SitemapUrl( + loc='http://not-exists.com/', + priority=0.8, + changefreq='monthly', + lastmod=datetime.fromisoformat('2005-02-03'), + origin_sitemap_url=str(sitemap_url), + ) + + +async def test_gzipped_sitemap(server_url: URL) -> None: + """Test loading a gzipped sitemap with correct type and .xml.gz url.""" + gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP)) + sitemap_url = (server_url / 'get_sitemap.xml.gz').with_query(base64=gzipped_data, c_type='application/gzip') + sitemap = await Sitemap.load(str(sitemap_url)) + assert len(sitemap.urls) == 5 + assert set(sitemap.urls) == BASIC_RESULTS + + +async def test_gzipped_sitemap_with_invalid_data(server_url: URL) -> None: + """Test loading a invalid gzipped sitemap with correct type and .xml.gz url.""" + compress_data = compress_gzip(BASIC_SITEMAP) + invalid_gzipped_data = encode_base64(compress_data[:30]) + sitemap_url = (server_url / 'get_sitemap.xml.gz').with_query(base64=invalid_gzipped_data, c_type='application/gzip') + sitemap = await Sitemap.load(str(sitemap_url)) + + assert len(sitemap.urls) == 0 + assert sitemap.urls == [] + + +async def test_gz_sitemap_with_non_gzipped(server_url: URL) -> None: + """Test loading a sitemap with gzip type and .xml.gz url, but without gzipped data.""" + sitemap_url = 
(server_url / 'get_sitemap.xml.gz').with_query( + base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/gzip' + ) + sitemap = await Sitemap.load(str(sitemap_url)) + + assert len(sitemap.urls) == 5 + assert set(sitemap.urls) == BASIC_RESULTS + + +async def test_gzipped_sitemap_with_bad_type(server_url: URL) -> None: + """Test loading a gzipped sitemap with bad type and .xml.gz url.""" + gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP)) + sitemap_url = (server_url / 'get_sitemap.xml.gz').with_query(base64=gzipped_data) + sitemap = await Sitemap.load(str(sitemap_url)) + + assert len(sitemap.urls) == 5 + assert set(sitemap.urls) == BASIC_RESULTS + + +async def test_xml_sitemap_with_gzipped_data(server_url: URL) -> None: + """Test loading a gzipped sitemap with correct type and .xml url.""" + gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP)) + sitemap_url = (server_url / 'get_sitemap.xml').with_query(base64=gzipped_data, c_type='application/gzip') + sitemap = await Sitemap.load(str(sitemap_url)) + + assert len(sitemap.urls) == 5 + assert set(sitemap.urls) == BASIC_RESULTS + + +async def test_parent_sitemap(server_url: URL) -> None: + """Test loading a parent sitemap that references child sitemaps.""" + parent_sitemap = """ + + + +{child_sitemap} +2004-12-23 + + +{child_sitemap_2} +2004-12-23 + + +""".strip() + child_sitemap = (server_url / 'get_sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + child_sitemap_2 = (server_url / 'get_sitemap.xml.gz').with_query(base64=encode_base64(compress_gzip(BASIC_SITEMAP))) + parent_sitemap_content = parent_sitemap.format(child_sitemap=child_sitemap, child_sitemap_2=child_sitemap_2) + encoded_parent_sitemap_content = encode_base64(parent_sitemap_content.encode()) + parent_sitemap_url = (server_url / 'get_sitemap.xml').with_query(base64=encoded_parent_sitemap_content) + + sitemap = await Sitemap.load(str(parent_sitemap_url)) + + assert len(sitemap.urls) == 10 + assert 
set(sitemap.urls) == BASIC_RESULTS + + +async def test_non_sitemap_url(server_url: URL) -> None: + """Test loading a URL that does not point to a sitemap.""" + sitemap = await Sitemap.load(str(server_url)) + + assert len(sitemap.urls) == 0 + assert sitemap.urls == [] + + +async def test_cdata_sitemap(server_url: URL) -> None: + """Test loading a sitemap with CDATA sections.""" + cdata_sitemap = """ + + + + + + + """.strip() + sitemap_url = (server_url / 'get_sitemap.xml').with_query(base64=encode_base64(cdata_sitemap.encode())) + sitemap = await Sitemap.load(str(sitemap_url)) + + assert len(sitemap.urls) == 1 + assert sitemap.urls == ['http://not-exists.com/catalog'] + + +async def test_txt_sitemap(server_url: URL) -> None: + """Test loading a plain text sitemap.""" + urls = [ + 'http://not-exists.com/catalog?item=78&desc=vacation_crete', + 'http://not-exists.com/catalog?item=79&desc=vacation_somalia', + ] + txt_sitemap_content = '\n'.join(urls) + + sitemap_url = (server_url / 'get_sitemap.txt').with_query(base64=encode_base64(txt_sitemap_content.encode())) + sitemap = await Sitemap.load(str(sitemap_url)) + + assert len(sitemap.urls) == 2 + assert set(sitemap.urls) == { + 'http://not-exists.com/catalog?item=78&desc=vacation_crete', + 'http://not-exists.com/catalog?item=79&desc=vacation_somalia', + } + + +async def test_sitemap_pretty(server_url: URL) -> None: + """Test loading a pretty-printed sitemap.""" + pretty_sitemap = """ + + + + + http://not-exists.com/catalog?item=80&desc=vacation_turkey + + + 2005-02-03 + + + + monthly + + + 0.8 + + + +""".strip() + sitemap_url = (server_url / 'get_sitemap.xml').with_query(base64=encode_base64(pretty_sitemap.encode())) + sitemap = await Sitemap.load(str(sitemap_url)) + + assert len(sitemap.urls) == 1 + assert sitemap.urls == ['http://not-exists.com/catalog?item=80&desc=vacation_turkey'] + + +async def test_sitemap_from_string() -> None: + """Test creating a Sitemap instance from an XML string.""" + sitemap = await 
Sitemap.from_xml_string(BASIC_SITEMAP) + + assert len(sitemap.urls) == 5 + assert set(sitemap.urls) == BASIC_RESULTS diff --git a/tests/unit/server.py b/tests/unit/server.py index 3c510ae6d8..48a97812bd 100644 --- a/tests/unit/server.py +++ b/tests/unit/server.py @@ -1,7 +1,7 @@ from __future__ import annotations import asyncio -import gzip +import base64 import json import threading import time @@ -106,8 +106,9 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None: 'status': echo_status, 'headers': echo_headers, 'user-agent': echo_user_agent, - 'get_sitemap.gz': get_sitemap_gz_endpoint, - 'get_sitemap': get_sitemap_endpoint, + 'get_sitemap.txt': get_sitemap_endpoint, + 'get_sitemap.xml': get_sitemap_endpoint, + 'get_sitemap.xml.gz': get_sitemap_endpoint, 'get': get_echo, 'post': post_echo, 'dynamic_content': dynamic_content, @@ -376,38 +377,22 @@ async def robots_txt(_scope: dict[str, Any], _receive: Receive, send: Send) -> N async def get_sitemap_endpoint(scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Handle requests to serve XML sitemap content received in the request.""" query_params = get_query_params(scope.get('query_string', b'')) - content = query_params.get('content', '') - - await send( - { - 'type': 'http.response.start', - 'status': 200, - 'headers': [[b'content-type', b'application/xml; charset=utf-8']], - } - ) - await send({'type': 'http.response.body', 'body': content.encode()}) + in_content = query_params.get('content', '') + in_base64 = query_params.get('base64', '') + c_type = query_params.get('c_type', 'application/xml; charset=utf-8') - -async def get_sitemap_gz_endpoint(scope: dict[str, Any], _receive: Receive, send: Send) -> None: - """Handle requests to serve gzipped XML sitemap content received in the request.""" - query_params = get_query_params(scope.get('query_string', b'')) - content = query_params.get('content', '') - - # Compress the content using gzip - compressed_content = 
gzip.compress(content.encode()) + out_content = base64.b64decode(in_base64) if in_base64 else in_content.encode() await send( { 'type': 'http.response.start', 'status': 200, - 'headers': [ - [b'content-type', b'application/xml; charset=utf-8'], - [b'content-encoding', b'gzip'], - ], + 'headers': [[b'content-type', c_type.encode()]], } ) - await send({'type': 'http.response.body', 'body': compressed_content}) + + await send({'type': 'http.response.body', 'body': out_content}) class TestServer(Server): From c2dbb73d1fcecba17b0ec18cf939868ca41ca9c4 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Fri, 30 May 2025 12:40:08 +0000 Subject: [PATCH 05/30] integrate sitemap to robots.txt --- src/crawlee/_utils/robots.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py index 930ae09431..ac896e69cf 100644 --- a/src/crawlee/_utils/robots.py +++ b/src/crawlee/_utils/robots.py @@ -5,6 +5,7 @@ from protego import Protego from yarl import URL +from crawlee._utils.sitemap import Sitemap from crawlee._utils.web import is_status_code_client_error if TYPE_CHECKING: @@ -15,9 +16,10 @@ class RobotsTxtFile: - def __init__(self, url: str, robots: Protego) -> None: + def __init__(self, url: str, robots: Protego, proxy_info: ProxyInfo | None = None) -> None: self._robots = robots self._original_url = URL(url).origin() + self._proxy_info = proxy_info @classmethod async def from_content(cls, url: str, content: str) -> Self: @@ -56,7 +58,7 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N robots = Protego.parse(body.decode('utf-8')) - return cls(url, robots) + return cls(url, robots, proxy_info=proxy_info) def is_allowed(self, url: str, user_agent: str = '*') -> bool: """Check if the given URL is allowed for the given user agent. 
@@ -83,3 +85,14 @@ def get_crawl_delay(self, user_agent: str = '*') -> int | None: """ crawl_delay = self._robots.crawl_delay(user_agent) return int(crawl_delay) if crawl_delay is not None else None + + async def parse_sitemaps(self) -> Sitemap: + """Parse the sitemaps from the robots.txt file and return a list of `Sitemap` instances.""" + sitemaps = self.get_sitemaps() + proxy_url = self._proxy_info.url if self._proxy_info else None + return await Sitemap.load(sitemaps, proxy_url) + + async def parse_urls_from_sitemaps(self) -> list[str]: + """Parse the URLs from the sitemaps in the robots.txt file and return a list of `Sitemap` instances.""" + sitemap = await self.parse_sitemaps() + return sitemap.urls From 3279aa653b4f7064f200427fe7a185b91a09e31a Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 3 Jun 2025 11:58:33 +0000 Subject: [PATCH 06/30] add implementation `SitemapRequestLoader` --- src/crawlee/_utils/sitemap.py | 26 +-- src/crawlee/request_loaders/__init__.py | 8 +- .../_sitemap_request_loader.py | 202 ++++++++++++++++++ 3 files changed, 217 insertions(+), 19 deletions(-) create mode 100644 src/crawlee/request_loaders/_sitemap_request_loader.py diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py index ec918ee0eb..e521bb6379 100644 --- a/src/crawlee/_utils/sitemap.py +++ b/src/crawlee/_utils/sitemap.py @@ -47,7 +47,7 @@ class ParseSitemapOptions(TypedDict, total=False): timeout: float | None -class _SitemapSource(TypedDict): +class SitemapSource(TypedDict): type: Literal['url', 'raw'] url: NotRequired[str] content: NotRequired[str] @@ -200,7 +200,7 @@ def _get_parser(content_type: str = '', url: str | None = None) -> _XmlSitemapPa return _XmlSitemapParser() -def _get_origin_url(source: _SitemapSource) -> str: +def _get_origin_url(source: SitemapSource) -> str: """Determine the origin URL for a sitemap source.""" if source['type'] == 'url' and 'url' in source: return source['url'] @@ -212,10 +212,10 @@ def 
_get_origin_url(source: _SitemapSource) -> str: async def _process_sitemap_item( item: _SitemapItem, - source: _SitemapSource, + source: SitemapSource, depth: int, visited_sitemap_urls: set[str], - sources: list[_SitemapSource], + sources: list[SitemapSource], *, emit_nested_sitemaps: bool, ) -> AsyncGenerator[SitemapUrl | NestedSitemap | None, None]: @@ -232,7 +232,7 @@ async def _process_sitemap_item( sitemap_url = item_copy['url'] if sitemap_url and sitemap_url not in visited_sitemap_urls: # Add to processing queue - sources.append(_SitemapSource(type='url', url=sitemap_url, depth=depth + 1)) + sources.append(SitemapSource(type='url', url=sitemap_url, depth=depth + 1)) # Output the nested sitemap reference if requested if emit_nested_sitemaps: @@ -254,10 +254,10 @@ async def _process_sitemap_item( async def _process_raw_source( - source: _SitemapSource, + source: SitemapSource, depth: int, visited_sitemap_urls: set[str], - sources: list[_SitemapSource], + sources: list[SitemapSource], *, emit_nested_sitemaps: bool, ) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]: @@ -293,10 +293,10 @@ async def _process_raw_source( async def _fetch_and_process_sitemap( client: httpx.AsyncClient, - source: _SitemapSource, + source: SitemapSource, depth: int, visited_sitemap_urls: set[str], - sources: list[_SitemapSource], + sources: list[SitemapSource], retries_left: int, *, emit_nested_sitemaps: bool, @@ -387,16 +387,16 @@ async def load( ) -> Sitemap: if isinstance(urls, str): urls = [urls] - return await cls.parse([_SitemapSource(type='url', url=url) for url in urls], proxy_url, parse_sitemap_options) + return await cls.parse([SitemapSource(type='url', url=url) for url in urls], proxy_url, parse_sitemap_options) @classmethod async def from_xml_string(cls, content: str, proxy_url: str | None = None) -> Sitemap: - return await cls.parse([_SitemapSource(type='raw', content=content)], proxy_url) + return await cls.parse([SitemapSource(type='raw', content=content)], 
proxy_url) @classmethod async def parse( cls, - sources: list[_SitemapSource], + sources: list[SitemapSource], proxy_url: str | None = None, parse_sitemap_options: ParseSitemapOptions | None = None, ) -> Sitemap: @@ -405,7 +405,7 @@ async def parse( async def parse_sitemap( - initial_sources: list[_SitemapSource], + initial_sources: list[SitemapSource], proxy_url: str | None = None, options: ParseSitemapOptions | None = None, ) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]: diff --git a/src/crawlee/request_loaders/__init__.py b/src/crawlee/request_loaders/__init__.py index 57829ec5ce..c04d9aa810 100644 --- a/src/crawlee/request_loaders/__init__.py +++ b/src/crawlee/request_loaders/__init__.py @@ -2,10 +2,6 @@ from ._request_loader import RequestLoader from ._request_manager import RequestManager from ._request_manager_tandem import RequestManagerTandem +from ._sitemap_request_loader import SitemapRequestLoader -__all__ = [ - 'RequestList', - 'RequestLoader', - 'RequestManager', - 'RequestManagerTandem', -] +__all__ = ['RequestList', 'RequestLoader', 'RequestManager', 'RequestManagerTandem', 'SitemapRequestLoader'] diff --git a/src/crawlee/request_loaders/_sitemap_request_loader.py b/src/crawlee/request_loaders/_sitemap_request_loader.py new file mode 100644 index 0000000000..a16571486e --- /dev/null +++ b/src/crawlee/request_loaders/_sitemap_request_loader.py @@ -0,0 +1,202 @@ +from __future__ import annotations + +import asyncio +from typing import TYPE_CHECKING, Any + +from crawlee import Request +from crawlee._utils.docs import docs_group +from crawlee._utils.globs import Glob +from crawlee._utils.sitemap import ParseSitemapOptions, SitemapSource, SitemapUrl, parse_sitemap +from crawlee.request_loaders._request_loader import RequestLoader + +if TYPE_CHECKING: + import re + from collections.abc import Sequence + + from crawlee.storage_clients.models import ProcessedRequest + + +@docs_group('Classes') +class SitemapRequestLoader(RequestLoader): + """A 
request loader that reads URLs from sitemap(s). + + The loader fetches and parses sitemaps in the background, allowing crawling to start + before all URLs are loaded. It supports filtering URLs using glob and regex patterns. + """ + + def __init__( + self, + sitemap_urls: list[str], + *, + proxy_url: str | None = None, + include: list[re.Pattern[Any] | Glob] | None = None, + exclude: list[re.Pattern[Any] | Glob] | None = None, + max_buffer_size: int = 200, + parse_sitemap_options: ParseSitemapOptions | None = None, + ) -> None: + """Initialize the sitemap request loader. + + Args: + sitemap_urls: Configuration options for the loader. + proxy_url: Optional proxy to use for fetching sitemaps. + include: List of glob or regex patterns to include URLs. + exclude: List of glob or regex patterns to exclude URLs. + max_buffer_size: Maximum number of URLs to buffer in memory. + parse_sitemap_options: Options for parsing sitemaps, such as `SitemapSource` and `max_urls`. + """ + self._sitemap_urls = sitemap_urls + self._include = include + self._exclude = exclude + self._proxy_url = proxy_url + self._parse_sitemap_options = parse_sitemap_options or ParseSitemapOptions() + + self._handled_count = 0 + self._total_count = 0 + + # URL queue and tracking + self._url_queue: asyncio.Queue[str] = asyncio.Queue(maxsize=max_buffer_size) + self._in_progress: set[str] = set() + self._processed_urls: set[str] = set() + + # Loading state + self._loading_task: asyncio.Task[None] | None = None + self._loading_finished = False + self._loading_error: Exception | None = None + + def _check_url_patterns( + self, + target_url: str, + include: Sequence[re.Pattern[Any] | Glob] | None, + exclude: Sequence[re.Pattern[Any] | Glob] | None, + ) -> bool: + """Check if a URL matches configured include/exclude patterns.""" + # If the URL matches any `exclude` pattern, reject it + for pattern in exclude or (): + if isinstance(pattern, Glob): + pattern = pattern.regexp # noqa: PLW2901 + + if 
pattern.match(target_url) is not None: + return False + + # If there are no `include` patterns and the URL passed all `exclude` patterns, accept the URL + if include is None: + return True + + # If the URL matches any `include` pattern, accept it + for pattern in include: + if isinstance(pattern, Glob): + pattern = pattern.regexp # noqa: PLW2901 + + if pattern.match(target_url) is not None: + return True + + # The URL does not match any `include` pattern - reject it + return False + + async def _load_sitemaps(self) -> None: + """Load URLs from sitemaps in the background.""" + try: + # Parse all sitemaps + async for item in parse_sitemap( + [SitemapSource(type='url', url=url) for url in self._sitemap_urls], + proxy_url=self._proxy_url, + options=self._parse_sitemap_options, + ): + # Only process URL items (not nested sitemaps) + if isinstance(item, SitemapUrl): + url = item.loc + + # Skip if already processed + if url in self._processed_urls: + continue + + # Check if URL should be included + if not self._check_url_patterns(url, self._include, self._exclude): + continue + + # Add to queue (will block if full) + await self._url_queue.put(url) + self._processed_urls.add(url) + self._total_count += 1 + + except Exception as e: + self._loading_error = e + raise + finally: + self._loading_finished = True + + async def get_total_count(self) -> int: + """Return the total number of URLs found so far.""" + return self._total_count + + async def is_empty(self) -> bool: + """Check if there are no more URLs to process.""" + return self._url_queue.empty() and self._loading_finished + + async def is_finished(self) -> bool: + """Check if all URLs have been processed.""" + return self._url_queue.empty() and len(self._in_progress) == 0 and self._loading_finished + + async def fetch_next_request(self) -> Request | None: + """Fetch the next request to process.""" + if not self._loading_task: + self._loading_task = asyncio.create_task(self._load_sitemaps()) + + while not 
(self._loading_finished and self._url_queue.empty()): + if self._url_queue.empty(): + await asyncio.sleep(0) + continue + + url = await self._url_queue.get() + + request = Request.from_url(url) + self._in_progress.add(request.id) + return request + + return None + + async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: + """Mark a request as successfully handled.""" + if request.id in self._in_progress: + self._in_progress.remove(request.id) + self._handled_count += 1 + return None + + async def get_handled_count(self) -> int: + """Return the number of handled requests.""" + return self._handled_count + + def is_loading_finished(self) -> bool: + """Check if sitemap loading has finished.""" + return self._loading_finished + + def get_queue_size(self) -> int: + """Get the current number of URLs in the queue.""" + return self._url_queue.qsize() + + async def wait_for_loading(self) -> None: + """Wait for sitemap loading to complete.""" + if self._loading_task: + await self._loading_task + + async def abort_loading(self) -> None: + """Abort the sitemap loading process.""" + if self._loading_task and not self._loading_task.done(): + self._loading_task.cancel() + try: + await self._loading_task + except asyncio.CancelledError: + pass + finally: + self._loading_finished = True + + async def __aiter__(self) -> SitemapRequestLoader: + """Make the loader async iterable.""" + return self + + async def __anext__(self) -> Request: + """Get the next request from the iterator.""" + request = await self.fetch_next_request() + if request is None: + raise StopAsyncIteration + return request From b1910f1037aacb257c74f298b19ab335cb433100 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 3 Jun 2025 18:05:23 +0000 Subject: [PATCH 07/30] add tests --- .../_sitemap_request_loader.py | 42 ++----- .../test_sitemap_request_loader.py | 104 ++++++++++++++++++ 2 files changed, 112 insertions(+), 34 deletions(-) create mode 100644 
tests/unit/request_loaders/test_sitemap_request_loader.py diff --git a/src/crawlee/request_loaders/_sitemap_request_loader.py b/src/crawlee/request_loaders/_sitemap_request_loader.py index a16571486e..5470315c59 100644 --- a/src/crawlee/request_loaders/_sitemap_request_loader.py +++ b/src/crawlee/request_loaders/_sitemap_request_loader.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +from logging import getLogger from typing import TYPE_CHECKING, Any from crawlee import Request @@ -16,6 +17,9 @@ from crawlee.storage_clients.models import ProcessedRequest +logger = getLogger(__name__) + + @docs_group('Classes') class SitemapRequestLoader(RequestLoader): """A request loader that reads URLs from sitemap(s). @@ -59,9 +63,8 @@ def __init__( self._processed_urls: set[str] = set() # Loading state - self._loading_task: asyncio.Task[None] | None = None + self._loading_task = asyncio.create_task(self._load_sitemaps()) self._loading_finished = False - self._loading_error: Exception | None = None def _check_url_patterns( self, @@ -96,7 +99,6 @@ def _check_url_patterns( async def _load_sitemaps(self) -> None: """Load URLs from sitemaps in the background.""" try: - # Parse all sitemaps async for item in parse_sitemap( [SitemapSource(type='url', url=url) for url in self._sitemap_urls], proxy_url=self._proxy_url, @@ -114,13 +116,12 @@ async def _load_sitemaps(self) -> None: if not self._check_url_patterns(url, self._include, self._exclude): continue - # Add to queue (will block if full) await self._url_queue.put(url) self._processed_urls.add(url) self._total_count += 1 - except Exception as e: - self._loading_error = e + except Exception: + logger.exception('Error loading sitemaps') raise finally: self._loading_finished = True @@ -139,12 +140,9 @@ async def is_finished(self) -> bool: async def fetch_next_request(self) -> Request | None: """Fetch the next request to process.""" - if not self._loading_task: - self._loading_task = 
asyncio.create_task(self._load_sitemaps()) - while not (self._loading_finished and self._url_queue.empty()): if self._url_queue.empty(): - await asyncio.sleep(0) + await asyncio.sleep(0.5) continue url = await self._url_queue.get() @@ -166,19 +164,6 @@ async def get_handled_count(self) -> int: """Return the number of handled requests.""" return self._handled_count - def is_loading_finished(self) -> bool: - """Check if sitemap loading has finished.""" - return self._loading_finished - - def get_queue_size(self) -> int: - """Get the current number of URLs in the queue.""" - return self._url_queue.qsize() - - async def wait_for_loading(self) -> None: - """Wait for sitemap loading to complete.""" - if self._loading_task: - await self._loading_task - async def abort_loading(self) -> None: """Abort the sitemap loading process.""" if self._loading_task and not self._loading_task.done(): @@ -189,14 +174,3 @@ async def abort_loading(self) -> None: pass finally: self._loading_finished = True - - async def __aiter__(self) -> SitemapRequestLoader: - """Make the loader async iterable.""" - return self - - async def __anext__(self) -> Request: - """Get the next request from the iterator.""" - request = await self.fetch_next_request() - if request is None: - raise StopAsyncIteration - return request diff --git a/tests/unit/request_loaders/test_sitemap_request_loader.py b/tests/unit/request_loaders/test_sitemap_request_loader.py new file mode 100644 index 0000000000..14295fab22 --- /dev/null +++ b/tests/unit/request_loaders/test_sitemap_request_loader.py @@ -0,0 +1,104 @@ +import base64 +import gzip + +from yarl import URL + +from crawlee.request_loaders._sitemap_request_loader import SitemapRequestLoader + +BASIC_SITEMAP = """ + + + +http://not-exists.com/ +2005-02-03 +monthly +0.8 + + +http://not-exists.com/catalog?item=12&desc=vacation_hawaii +weekly + + +http://not-exists.com/catalog?item=73&desc=vacation_new_zealand +2004-12-23 +weekly + + 
+http://not-exists.com/catalog?item=74&desc=vacation_newfoundland +2004-12-23T18:00:15+00:00 +0.3 + + +http://not-exists.com/catalog?item=83&desc=vacation_usa +2004-11-23 + + +""".strip() + + +def compress_gzip(data: str) -> bytes: + """Compress a string using gzip.""" + return gzip.compress(data.encode()) + + +def encode_base64(data: bytes) -> str: + """Encode bytes to a base64 string.""" + return base64.b64encode(data).decode('utf-8') + + +async def test_sitemap_traversal(server_url: URL) -> None: + sitemap_url = (server_url / 'get_sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + sitemap_loader = SitemapRequestLoader([str(sitemap_url)]) + + while not await sitemap_loader.is_finished(): + item = await sitemap_loader.fetch_next_request() + assert item is not None + + await sitemap_loader.mark_request_as_handled(item) + + assert await sitemap_loader.is_empty() + assert await sitemap_loader.is_finished() + assert await sitemap_loader.get_total_count() == 5 + assert await sitemap_loader.get_handled_count() == 5 + + +async def test_is_empty_does_not_depend_on_fetch_next_request(server_url: URL) -> None: + sitemap_url = (server_url / 'get_sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + sitemap_loader = SitemapRequestLoader([str(sitemap_url)]) + + items = [] + + for _ in range(5): + item = await sitemap_loader.fetch_next_request() + assert item is not None + assert not await sitemap_loader.is_finished() + items.append(item) + + assert await sitemap_loader.is_empty() + assert not await sitemap_loader.is_finished() + + for item in items: + await sitemap_loader.mark_request_as_handled(item) + + assert await sitemap_loader.is_empty() + assert await sitemap_loader.is_finished() + + +async def test_abort_sitemap_loading(server_url: URL) -> None: + sitemap_url = (server_url / 'get_sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + sitemap_loader = SitemapRequestLoader([str(sitemap_url)], max_buffer_size=2) + 
+ item = await sitemap_loader.fetch_next_request() + assert item is not None + await sitemap_loader.mark_request_as_handled(item) + + assert not await sitemap_loader.is_empty() + assert not await sitemap_loader.is_finished() + + await sitemap_loader.abort_loading() + + item = await sitemap_loader.fetch_next_request() + assert item is not None + await sitemap_loader.mark_request_as_handled(item) + + assert await sitemap_loader.is_finished() From 65e4a38876909fcf99913ba01393ec876ffd16f8 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 3 Jun 2025 18:30:35 +0000 Subject: [PATCH 08/30] update docs --- src/crawlee/_utils/robots.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py index ac896e69cf..4a40731a71 100644 --- a/src/crawlee/_utils/robots.py +++ b/src/crawlee/_utils/robots.py @@ -87,12 +87,12 @@ def get_crawl_delay(self, user_agent: str = '*') -> int | None: return int(crawl_delay) if crawl_delay is not None else None async def parse_sitemaps(self) -> Sitemap: - """Parse the sitemaps from the robots.txt file and return a list of `Sitemap` instances.""" + """Parse the sitemaps from the robots.txt file and return a `Sitemap` instance.""" sitemaps = self.get_sitemaps() proxy_url = self._proxy_info.url if self._proxy_info else None return await Sitemap.load(sitemaps, proxy_url) async def parse_urls_from_sitemaps(self) -> list[str]: - """Parse the URLs from the sitemaps in the robots.txt file and return a list of `Sitemap` instances.""" + """Parse the sitemaps in the robots.txt file and return a list URLs.""" sitemap = await self.parse_sitemaps() return sitemap.urls From d432941f17f7680316e2cd280f94f30acc1b17e0 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 3 Jun 2025 18:39:42 +0000 Subject: [PATCH 09/30] fix uvicorn path --- tests/unit/server.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/unit/server.py b/tests/unit/server.py index 
48a97812bd..4c5dff9d36 100644 --- a/tests/unit/server.py +++ b/tests/unit/server.py @@ -117,8 +117,7 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None: 'xml': hello_world_xml, 'robots.txt': robots_txt, } - path_parts = URL(scope['path']).parts - path = path_parts[1] if len(path_parts) > 1 else path_parts[0] + path = URL(scope['path']).parts[1] # Route requests to appropriate handlers if path in paths: path_func = paths[path] From 4d61c12e49cf91cac5e15831496d84b01e8e11ff Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 9 Jun 2025 10:52:11 +0000 Subject: [PATCH 10/30] unification echo_content --- src/crawlee/_utils/sitemap.py | 10 +++--- tests/unit/_utils/test_sitemap.py | 20 ++++++++--- .../test_adaptive_playwright_crawler.py | 4 +-- tests/unit/server.py | 35 +++++++------------ 4 files changed, 36 insertions(+), 33 deletions(-) diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py index e521bb6379..aa2853b39d 100644 --- a/src/crawlee/_utils/sitemap.py +++ b/src/crawlee/_utils/sitemap.py @@ -4,7 +4,7 @@ import zlib from contextlib import suppress from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timedelta from hashlib import sha256 from logging import getLogger from typing import TYPE_CHECKING, Literal, TypedDict @@ -44,7 +44,7 @@ class ParseSitemapOptions(TypedDict, total=False): emit_nested_sitemaps: bool max_depth: int sitemap_retries: int - timeout: float | None + timeout: timedelta | None class SitemapSource(TypedDict): @@ -420,7 +420,9 @@ async def parse_sitemap( emit_nested_sitemaps = options.get('emit_nested_sitemaps', False) max_depth = options.get('max_depth', float('inf')) sitemap_retries = options.get('sitemap_retries', 3) - timeout = options.get('timeout', 30) + timeout = options.get('timeout', timedelta(seconds=30)) + + httpx_timeout = httpx.Timeout(float(timeout.seconds)) if timeout else None # Setup working state sources = list(initial_sources) @@ 
-447,7 +449,7 @@ async def parse_sitemap( # Add to visited set before processing to avoid duplicates visited_sitemap_urls.add(source['url']) - async with httpx.AsyncClient(timeout=httpx.Timeout(timeout), proxy=proxy_url) as client: + async with httpx.AsyncClient(timeout=httpx_timeout, proxy=proxy_url) as client: async for result in _fetch_and_process_sitemap( client, source, diff --git a/tests/unit/_utils/test_sitemap.py b/tests/unit/_utils/test_sitemap.py index 98d9ba9019..085a6d8292 100644 --- a/tests/unit/_utils/test_sitemap.py +++ b/tests/unit/_utils/test_sitemap.py @@ -57,7 +57,9 @@ def encode_base64(data: bytes) -> str: async def test_sitemap(server_url: URL) -> None: """Test loading a basic sitemap.""" - sitemap_url = (server_url / 'get_sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + sitemap_url = (server_url / 'get_sitemap.xml').with_query( + base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/xml; charset=utf-8' + ) sitemap = await Sitemap.load(str(sitemap_url)) assert len(sitemap.urls) == 5 @@ -66,7 +68,9 @@ async def test_sitemap(server_url: URL) -> None: async def test_extract_metadata_sitemap(server_url: URL) -> None: """Test extracting item metadata from a sitemap.""" - sitemap_url = (server_url / 'get_sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + sitemap_url = (server_url / 'get_sitemap.xml').with_query( + base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/xml; charset=utf-8' + ) items = [item async for item in parse_sitemap([{'type': 'url', 'url': str(sitemap_url)}])] assert len(items) == 5 @@ -113,7 +117,9 @@ async def test_gz_sitemap_with_non_gzipped(server_url: URL) -> None: async def test_gzipped_sitemap_with_bad_type(server_url: URL) -> None: """Test loading a gzipped sitemap with bad type and .xml.gz url.""" gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP)) - sitemap_url = (server_url / 'get_sitemap.xml.gz').with_query(base64=gzipped_data) + sitemap_url = 
(server_url / 'get_sitemap.xml.gz').with_query( + base64=gzipped_data, c_type='application/xml; charset=utf-8' + ) sitemap = await Sitemap.load(str(sitemap_url)) assert len(sitemap.urls) == 5 @@ -175,7 +181,9 @@ async def test_cdata_sitemap(server_url: URL) -> None: """.strip() - sitemap_url = (server_url / 'get_sitemap.xml').with_query(base64=encode_base64(cdata_sitemap.encode())) + sitemap_url = (server_url / 'get_sitemap.xml').with_query( + base64=encode_base64(cdata_sitemap.encode()), c_type='application/xml; charset=utf-8' + ) sitemap = await Sitemap.load(str(sitemap_url)) assert len(sitemap.urls) == 1 @@ -222,7 +230,9 @@ async def test_sitemap_pretty(server_url: URL) -> None: """.strip() - sitemap_url = (server_url / 'get_sitemap.xml').with_query(base64=encode_base64(pretty_sitemap.encode())) + sitemap_url = (server_url / 'get_sitemap.xml').with_query( + base64=encode_base64(pretty_sitemap.encode()), c_type='application/xml; charset=utf-8' + ) sitemap = await Sitemap.load(str(sitemap_url)) assert len(sitemap.urls) == 1 diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 63d078e95e..228d4938d4 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -61,8 +61,8 @@ def test_urls(server_url: URL) -> list[str]: """Example pages used in the test are mocked for static requests.""" return [ - str(server_url.with_path('dynamic_content').with_query(content=_PAGE_CONTENT_STATIC)), - str(server_url.with_path('dynamic_content').with_query(id='test2', content=_PAGE_CONTENT_STATIC)), + str(server_url.with_path('echo_content').with_query(content=_PAGE_CONTENT_STATIC)), + str(server_url.with_path('echo_content').with_query(id='test2', content=_PAGE_CONTENT_STATIC)), ] diff --git a/tests/unit/server.py b/tests/unit/server.py index 
4c5dff9d36..3fbb7a8960 100644 --- a/tests/unit/server.py +++ b/tests/unit/server.py @@ -106,12 +106,12 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None: 'status': echo_status, 'headers': echo_headers, 'user-agent': echo_user_agent, - 'get_sitemap.txt': get_sitemap_endpoint, - 'get_sitemap.xml': get_sitemap_endpoint, - 'get_sitemap.xml.gz': get_sitemap_endpoint, + 'get_sitemap.txt': echo_content, + 'get_sitemap.xml': echo_content, + 'get_sitemap.xml.gz': echo_content, 'get': get_echo, 'post': post_echo, - 'dynamic_content': dynamic_content, + 'echo_content': echo_content, 'redirect': redirect_to_url, 'json': hello_world_json, 'xml': hello_world_xml, @@ -359,29 +359,15 @@ async def set_complex_cookies(_scope: dict[str, Any], _receive: Receive, send: S await send({'type': 'http.response.body', 'body': b'Cookies have been set!'}) -async def dynamic_content(scope: dict[str, Any], _receive: Receive, send: Send) -> None: +async def echo_content(scope: dict[str, Any], _receive: Receive, send: Send) -> None: """Handle requests to serve HTML-page with dynamic content received in the request.""" query_params = get_query_params(scope.get('query_string', b'')) content = query_params.get('content', '') + base64_content = query_params.get('base64', '') + c_type = query_params.get('c_type', 'text/html; charset=utf-8') - await send_html_response(send, html_content=content.encode()) - - -async def robots_txt(_scope: dict[str, Any], _receive: Receive, send: Send) -> None: - """Handle requests for the robots.txt file.""" - await send_html_response(send, ROBOTS_TXT) - - -async def get_sitemap_endpoint(scope: dict[str, Any], _receive: Receive, send: Send) -> None: - """Handle requests to serve XML sitemap content received in the request.""" - query_params = get_query_params(scope.get('query_string', b'')) - - in_content = query_params.get('content', '') - in_base64 = query_params.get('base64', '') - c_type = query_params.get('c_type', 'application/xml; 
charset=utf-8') - - out_content = base64.b64decode(in_base64) if in_base64 else in_content.encode() + out_content = base64.b64decode(base64_content) if base64_content else content.encode() await send( { @@ -394,6 +380,11 @@ async def get_sitemap_endpoint(scope: dict[str, Any], _receive: Receive, send: S await send({'type': 'http.response.body', 'body': out_content}) +async def robots_txt(_scope: dict[str, Any], _receive: Receive, send: Send) -> None: + """Handle requests for the robots.txt file.""" + await send_html_response(send, ROBOTS_TXT) + + class TestServer(Server): """A test HTTP server implementation based on Uvicorn Server.""" From 04cd366a6d55c7971ccbb95718534dbc1117581b Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 9 Jun 2025 11:27:32 +0000 Subject: [PATCH 11/30] update endpoints --- tests/unit/_utils/test_sitemap.py | 26 +++++++++---------- .../test_sitemap_request_loader.py | 6 ++--- tests/unit/server.py | 10 +++---- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/tests/unit/_utils/test_sitemap.py b/tests/unit/_utils/test_sitemap.py index 085a6d8292..8e31d436d8 100644 --- a/tests/unit/_utils/test_sitemap.py +++ b/tests/unit/_utils/test_sitemap.py @@ -57,7 +57,7 @@ def encode_base64(data: bytes) -> str: async def test_sitemap(server_url: URL) -> None: """Test loading a basic sitemap.""" - sitemap_url = (server_url / 'get_sitemap.xml').with_query( + sitemap_url = (server_url / 'sitemap.xml').with_query( base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/xml; charset=utf-8' ) sitemap = await Sitemap.load(str(sitemap_url)) @@ -68,7 +68,7 @@ async def test_sitemap(server_url: URL) -> None: async def test_extract_metadata_sitemap(server_url: URL) -> None: """Test extracting item metadata from a sitemap.""" - sitemap_url = (server_url / 'get_sitemap.xml').with_query( + sitemap_url = (server_url / 'sitemap.xml').with_query( base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/xml; charset=utf-8' ) @@ -86,7 
+86,7 @@ async def test_extract_metadata_sitemap(server_url: URL) -> None: async def test_gzipped_sitemap(server_url: URL) -> None: """Test loading a gzipped sitemap with correct type and .xml.gz url.""" gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP)) - sitemap_url = (server_url / 'get_sitemap.xml.gz').with_query(base64=gzipped_data, c_type='application/gzip') + sitemap_url = (server_url / 'sitemap.xml.gz').with_query(base64=gzipped_data, c_type='application/gzip') sitemap = await Sitemap.load(str(sitemap_url)) assert len(sitemap.urls) == 5 assert set(sitemap.urls) == BASIC_RESULTS @@ -96,7 +96,7 @@ async def test_gzipped_sitemap_with_invalid_data(server_url: URL) -> None: """Test loading a invalid gzipped sitemap with correct type and .xml.gz url.""" compress_data = compress_gzip(BASIC_SITEMAP) invalid_gzipped_data = encode_base64(compress_data[:30]) - sitemap_url = (server_url / 'get_sitemap.xml.gz').with_query(base64=invalid_gzipped_data, c_type='application/gzip') + sitemap_url = (server_url / 'sitemap.xml.gz').with_query(base64=invalid_gzipped_data, c_type='application/gzip') sitemap = await Sitemap.load(str(sitemap_url)) assert len(sitemap.urls) == 0 @@ -105,7 +105,7 @@ async def test_gzipped_sitemap_with_invalid_data(server_url: URL) -> None: async def test_gz_sitemap_with_non_gzipped(server_url: URL) -> None: """Test loading a sitemap with gzip type and .xml.gz url, but without gzipped data.""" - sitemap_url = (server_url / 'get_sitemap.xml.gz').with_query( + sitemap_url = (server_url / 'sitemap.xml.gz').with_query( base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/gzip' ) sitemap = await Sitemap.load(str(sitemap_url)) @@ -117,7 +117,7 @@ async def test_gz_sitemap_with_non_gzipped(server_url: URL) -> None: async def test_gzipped_sitemap_with_bad_type(server_url: URL) -> None: """Test loading a gzipped sitemap with bad type and .xml.gz url.""" gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP)) - sitemap_url = (server_url / 
'get_sitemap.xml.gz').with_query( + sitemap_url = (server_url / 'sitemap.xml.gz').with_query( base64=gzipped_data, c_type='application/xml; charset=utf-8' ) sitemap = await Sitemap.load(str(sitemap_url)) @@ -129,7 +129,7 @@ async def test_gzipped_sitemap_with_bad_type(server_url: URL) -> None: async def test_xml_sitemap_with_gzipped_data(server_url: URL) -> None: """Test loading a gzipped sitemap with correct type and .xml url.""" gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP)) - sitemap_url = (server_url / 'get_sitemap.xml').with_query(base64=gzipped_data, c_type='application/gzip') + sitemap_url = (server_url / 'sitemap.xml').with_query(base64=gzipped_data, c_type='application/gzip') sitemap = await Sitemap.load(str(sitemap_url)) assert len(sitemap.urls) == 5 @@ -151,11 +151,11 @@ async def test_parent_sitemap(server_url: URL) -> None: """.strip() - child_sitemap = (server_url / 'get_sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) - child_sitemap_2 = (server_url / 'get_sitemap.xml.gz').with_query(base64=encode_base64(compress_gzip(BASIC_SITEMAP))) + child_sitemap = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + child_sitemap_2 = (server_url / 'sitemap.xml.gz').with_query(base64=encode_base64(compress_gzip(BASIC_SITEMAP))) parent_sitemap_content = parent_sitemap.format(child_sitemap=child_sitemap, child_sitemap_2=child_sitemap_2) encoded_parent_sitemap_content = encode_base64(parent_sitemap_content.encode()) - parent_sitemap_url = (server_url / 'get_sitemap.xml').with_query(base64=encoded_parent_sitemap_content) + parent_sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encoded_parent_sitemap_content) sitemap = await Sitemap.load(str(parent_sitemap_url)) @@ -181,7 +181,7 @@ async def test_cdata_sitemap(server_url: URL) -> None: """.strip() - sitemap_url = (server_url / 'get_sitemap.xml').with_query( + sitemap_url = (server_url / 'sitemap.xml').with_query( 
base64=encode_base64(cdata_sitemap.encode()), c_type='application/xml; charset=utf-8' ) sitemap = await Sitemap.load(str(sitemap_url)) @@ -198,7 +198,7 @@ async def test_txt_sitemap(server_url: URL) -> None: ] txt_sitemap_content = '\n'.join(urls) - sitemap_url = (server_url / 'get_sitemap.txt').with_query(base64=encode_base64(txt_sitemap_content.encode())) + sitemap_url = (server_url / 'sitemap.txt').with_query(base64=encode_base64(txt_sitemap_content.encode())) sitemap = await Sitemap.load(str(sitemap_url)) assert len(sitemap.urls) == 2 @@ -230,7 +230,7 @@ async def test_sitemap_pretty(server_url: URL) -> None: """.strip() - sitemap_url = (server_url / 'get_sitemap.xml').with_query( + sitemap_url = (server_url / 'sitemap.xml').with_query( base64=encode_base64(pretty_sitemap.encode()), c_type='application/xml; charset=utf-8' ) sitemap = await Sitemap.load(str(sitemap_url)) diff --git a/tests/unit/request_loaders/test_sitemap_request_loader.py b/tests/unit/request_loaders/test_sitemap_request_loader.py index 14295fab22..c44e822e85 100644 --- a/tests/unit/request_loaders/test_sitemap_request_loader.py +++ b/tests/unit/request_loaders/test_sitemap_request_loader.py @@ -47,7 +47,7 @@ def encode_base64(data: bytes) -> str: async def test_sitemap_traversal(server_url: URL) -> None: - sitemap_url = (server_url / 'get_sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) sitemap_loader = SitemapRequestLoader([str(sitemap_url)]) while not await sitemap_loader.is_finished(): @@ -63,7 +63,7 @@ async def test_sitemap_traversal(server_url: URL) -> None: async def test_is_empty_does_not_depend_on_fetch_next_request(server_url: URL) -> None: - sitemap_url = (server_url / 'get_sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) sitemap_loader = 
SitemapRequestLoader([str(sitemap_url)]) items = [] @@ -85,7 +85,7 @@ async def test_is_empty_does_not_depend_on_fetch_next_request(server_url: URL) - async def test_abort_sitemap_loading(server_url: URL) -> None: - sitemap_url = (server_url / 'get_sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) sitemap_loader = SitemapRequestLoader([str(sitemap_url)], max_buffer_size=2) item = await sitemap_loader.fetch_next_request() diff --git a/tests/unit/server.py b/tests/unit/server.py index 3fbb7a8960..363bc61186 100644 --- a/tests/unit/server.py +++ b/tests/unit/server.py @@ -106,12 +106,12 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None: 'status': echo_status, 'headers': echo_headers, 'user-agent': echo_user_agent, - 'get_sitemap.txt': echo_content, - 'get_sitemap.xml': echo_content, - 'get_sitemap.xml.gz': echo_content, + 'echo_content': echo_content, + 'sitemap.txt': echo_content, + 'sitemap.xml': echo_content, + 'sitemap.xml.gz': echo_content, 'get': get_echo, 'post': post_echo, - 'echo_content': echo_content, 'redirect': redirect_to_url, 'json': hello_world_json, 'xml': hello_world_xml, @@ -360,7 +360,7 @@ async def set_complex_cookies(_scope: dict[str, Any], _receive: Receive, send: S async def echo_content(scope: dict[str, Any], _receive: Receive, send: Send) -> None: - """Handle requests to serve HTML-page with dynamic content received in the request.""" + """Echo back content (plain text or base64) with specified content-type.""" query_params = get_query_params(scope.get('query_string', b'')) content = query_params.get('content', '') From a446bb1797724904ff2c15004b60549045e15767 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 9 Jun 2025 13:18:53 +0000 Subject: [PATCH 12/30] clear extra property in `SitemapRequestLoader` --- src/crawlee/_utils/sitemap.py | 4 ++-- 
.../request_loaders/_sitemap_request_loader.py | 16 +++++----------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py index aa2853b39d..a432c9bb39 100644 --- a/src/crawlee/_utils/sitemap.py +++ b/src/crawlee/_utils/sitemap.py @@ -172,7 +172,7 @@ async def process_chunk(self, chunk: str) -> AsyncGenerator[_SitemapItem, None]: self._handler.items.clear() except Exception as e: - logger.warning(f'Failed to parse XML data chunk: {e}') + logger.warning(f'Failed to parse XML data chunk: {e}', exc_info=True) async def flush(self) -> AsyncGenerator[_SitemapItem, None]: """Process any remaining data in the buffer, yielding items one by one.""" @@ -422,7 +422,7 @@ async def parse_sitemap( sitemap_retries = options.get('sitemap_retries', 3) timeout = options.get('timeout', timedelta(seconds=30)) - httpx_timeout = httpx.Timeout(float(timeout.seconds)) if timeout else None + httpx_timeout = httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None # Setup working state sources = list(initial_sources) diff --git a/src/crawlee/request_loaders/_sitemap_request_loader.py b/src/crawlee/request_loaders/_sitemap_request_loader.py index 5470315c59..37d0473db5 100644 --- a/src/crawlee/request_loaders/_sitemap_request_loader.py +++ b/src/crawlee/request_loaders/_sitemap_request_loader.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +from contextlib import suppress from logging import getLogger from typing import TYPE_CHECKING, Any @@ -64,7 +65,6 @@ def __init__( # Loading state self._loading_task = asyncio.create_task(self._load_sitemaps()) - self._loading_finished = False def _check_url_patterns( self, @@ -123,8 +123,6 @@ async def _load_sitemaps(self) -> None: except Exception: logger.exception('Error loading sitemaps') raise - finally: - self._loading_finished = True async def get_total_count(self) -> int: """Return the total number of URLs found so far.""" @@ -132,15 
+130,15 @@ async def get_total_count(self) -> int: async def is_empty(self) -> bool: """Check if there are no more URLs to process.""" - return self._url_queue.empty() and self._loading_finished + return self._url_queue.empty() and self._loading_task.done() async def is_finished(self) -> bool: """Check if all URLs have been processed.""" - return self._url_queue.empty() and len(self._in_progress) == 0 and self._loading_finished + return self._url_queue.empty() and len(self._in_progress) == 0 and self._loading_task.done() async def fetch_next_request(self) -> Request | None: """Fetch the next request to process.""" - while not (self._loading_finished and self._url_queue.empty()): + while not (self._loading_task.done() and self._url_queue.empty()): if self._url_queue.empty(): await asyncio.sleep(0.5) continue @@ -168,9 +166,5 @@ async def abort_loading(self) -> None: """Abort the sitemap loading process.""" if self._loading_task and not self._loading_task.done(): self._loading_task.cancel() - try: + with suppress(asyncio.CancelledError): await self._loading_task - except asyncio.CancelledError: - pass - finally: - self._loading_finished = True From a82985a15b97b06141baf431a7f88fd70938781e Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 10 Jun 2025 18:04:30 +0000 Subject: [PATCH 13/30] implimitation stream method --- src/crawlee/crawlers/_basic/_basic_crawler.py | 1 + .../_playwright/_playwright_http_client.py | 22 +++++- src/crawlee/crawlers/_playwright/_types.py | 8 ++ src/crawlee/http_clients/_base.py | 59 ++++++++++++++ src/crawlee/http_clients/_curl_impersonate.py | 54 +++++++++++++ src/crawlee/http_clients/_httpx.py | 79 +++++++++++++++++-- 6 files changed, 214 insertions(+), 9 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 192d34091f..5dd7510b4b 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -670,6 +670,7 @@ 
async def _run_crawler(self) -> None: self._snapshotter, self._statistics, self._session_pool if self._use_session_pool else None, + self._http_client, *self._additional_context_managers, ) if cm and getattr(cm, 'active', False) is False diff --git a/src/crawlee/crawlers/_playwright/_playwright_http_client.py b/src/crawlee/crawlers/_playwright/_playwright_http_client.py index d8a51af40e..5a72b80506 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_http_client.py +++ b/src/crawlee/crawlers/_playwright/_playwright_http_client.py @@ -1,7 +1,7 @@ from __future__ import annotations import contextvars -from contextlib import asynccontextmanager +from contextlib import AbstractAsyncContextManager, asynccontextmanager from typing import TYPE_CHECKING from typing_extensions import override @@ -12,6 +12,7 @@ if TYPE_CHECKING: from collections.abc import AsyncGenerator + from datetime import timedelta from playwright.async_api import Page @@ -48,6 +49,7 @@ class PlaywrightHttpClient(HttpClient): def __init__(self) -> None: """Initialize a new instance.""" + self._active = False @override async def crawl( @@ -89,3 +91,21 @@ async def send_request( ) return await PlaywrightHttpResponse.from_playwright_response(response, protocol='') + + @override + def stream( + self, + url: str, + *, + method: HttpMethod = 'GET', + headers: HttpHeaders | dict[str, str] | None = None, + payload: HttpPayload | None = None, + session: Session | None = None, + proxy_info: ProxyInfo | None = None, + timeout: timedelta | None = None, + ) -> AbstractAsyncContextManager[HttpResponse]: + raise NotImplementedError('The `stream` method should not be used for `PlaywrightHttpClient`') + + async def cleanup(self) -> None: + # The `browser_page_context` is responsible for resource cleanup + return diff --git a/src/crawlee/crawlers/_playwright/_types.py b/src/crawlee/crawlers/_playwright/_types.py index 427c7fe05d..05b0984fe3 100644 --- a/src/crawlee/crawlers/_playwright/_types.py +++ 
b/src/crawlee/crawlers/_playwright/_types.py @@ -7,6 +7,8 @@ from crawlee._utils.docs import docs_group if TYPE_CHECKING: + from collections.abc import AsyncGenerator + from playwright.async_api import APIResponse, Response from typing_extensions import Self @@ -42,6 +44,12 @@ class PlaywrightHttpResponse: def read(self) -> bytes: return self._content + async def iter_bytes(self) -> AsyncGenerator[bytes, None]: + chunk_size = None + chunk_size = len(self._content) if chunk_size is None else chunk_size + for i in range(0, len(self._content), max(chunk_size, 1)): + yield self._content[i : i + chunk_size] + @classmethod async def from_playwright_response(cls, response: Response | APIResponse, protocol: str) -> Self: headers = HttpHeaders(response.headers) diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py index efbd09b88e..c10999bd37 100644 --- a/src/crawlee/http_clients/_base.py +++ b/src/crawlee/http_clients/_base.py @@ -7,6 +7,11 @@ from crawlee._utils.docs import docs_group if TYPE_CHECKING: + from collections.abc import AsyncIterator + from contextlib import AbstractAsyncContextManager + from datetime import timedelta + from types import TracebackType + from crawlee import Request from crawlee._types import HttpHeaders, HttpMethod, HttpPayload from crawlee.proxy_configuration import ProxyInfo @@ -33,6 +38,9 @@ def headers(self) -> HttpHeaders: def read(self) -> bytes: """Read the content of the response body.""" + def iter_bytes(self) -> AsyncIterator[bytes]: + """Iterate over the content of the response body in chunks.""" + @dataclass(frozen=True) @docs_group('Data structures') @@ -64,6 +72,9 @@ def __init__( """ self._persist_cookies_per_session = persist_cookies_per_session + # Flag to indicate the context state. + self._active = False + @abstractmethod async def crawl( self, @@ -119,3 +130,51 @@ async def send_request( Returns: The HTTP response received from the server. 
""" + + @abstractmethod + def stream( + self, + url: str, + *, + method: HttpMethod = 'GET', + headers: HttpHeaders | dict[str, str] | None = None, + payload: HttpPayload | None = None, + session: Session | None = None, + proxy_info: ProxyInfo | None = None, + timeout: timedelta | None = None, + ) -> AbstractAsyncContextManager[HttpResponse]: + """Stream an HTTP request via the client.""" + + @abstractmethod + async def cleanup(self) -> None: + """Clean up resources used by the client. + + This method is called when the client is no longer needed. + It should be overridden in subclasses to perform any necessary cleanup. + """ + + async def __aenter__(self) -> HttpClient: + """Initialize the client when entering the context manager. + + Raises: + RuntimeError: If the context manager is already active. + """ + if self._active: + raise RuntimeError(f'The {self.__class__.__name__} is already active.') + + self._active = True + return self + + async def __aexit__( + self, exc_type: BaseException | None, exc_value: BaseException | None, traceback: TracebackType | None + ) -> None: + """Deinitialize the client and clean up resources when exiting the context manager. + + Raises: + RuntimeError: If the context manager is already active. 
+ """ + if not self._active: + raise RuntimeError(f'The {self.__class__.__name__} is not active.') + + await self.cleanup() + self._active = False diff --git a/src/crawlee/http_clients/_curl_impersonate.py b/src/crawlee/http_clients/_curl_impersonate.py index 847681d5b3..5e04ac868c 100644 --- a/src/crawlee/http_clients/_curl_impersonate.py +++ b/src/crawlee/http_clients/_curl_impersonate.py @@ -1,5 +1,6 @@ from __future__ import annotations +from contextlib import asynccontextmanager from typing import TYPE_CHECKING, Any, Optional from curl_cffi import CurlInfo @@ -19,6 +20,8 @@ from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse if TYPE_CHECKING: + from collections.abc import AsyncGenerator + from datetime import timedelta from http.cookiejar import Cookie from curl_cffi import Curl @@ -85,6 +88,10 @@ def headers(self) -> HttpHeaders: def read(self) -> bytes: return self._response.content + async def iter_bytes(self) -> AsyncGenerator[bytes, None]: + async for chunk in self._response.aiter_content(): # type: ignore[no-untyped-call] + yield chunk + @docs_group('Classes') class CurlImpersonateHttpClient(HttpClient): @@ -198,6 +205,48 @@ async def send_request( return _CurlImpersonateResponse(response) + @asynccontextmanager + @override + async def stream( + self, + url: str, + *, + method: HttpMethod = 'GET', + headers: HttpHeaders | dict[str, str] | None = None, + payload: HttpPayload | None = None, + session: Session | None = None, + proxy_info: ProxyInfo | None = None, + timeout: timedelta | None = None, + ) -> AsyncGenerator[HttpResponse]: + if isinstance(headers, dict) or headers is None: + headers = HttpHeaders(headers or {}) + + proxy_url = proxy_info.url if proxy_info else None + client = self._get_client(proxy_url) + + try: + response = await client.request( + url=url, + method=method.upper(), # type: ignore[arg-type] # curl-cffi requires uppercase method + headers=dict(headers) if headers else None, + data=payload, + 
cookies=session.cookies.jar if session else None, + stream=True, + ) + except CurlRequestError as exc: + if self._is_proxy_error(exc): + raise ProxyError from exc + raise + + if self._persist_cookies_per_session and session and response.curl: + response_cookies = self._get_cookies(response.curl) + session.cookies.store_cookies(response_cookies) + + try: + yield _CurlImpersonateResponse(response) + finally: + await response.aclose() + def _get_client(self, proxy_url: str | None) -> AsyncSession: """Retrieve or create an asynchronous HTTP session for the given proxy URL. @@ -245,3 +294,8 @@ def _get_cookies(curl: Curl) -> list[Cookie]: cookie = curl_morsel.to_cookiejar_cookie() cookies.append(cookie) return cookies + + async def cleanup(self) -> None: + for client in self._client_by_proxy_url.values(): + await client.close() + self._client_by_proxy_url.clear() diff --git a/src/crawlee/http_clients/_httpx.py b/src/crawlee/http_clients/_httpx.py index aebf833a12..aaf1edd76e 100644 --- a/src/crawlee/http_clients/_httpx.py +++ b/src/crawlee/http_clients/_httpx.py @@ -1,5 +1,6 @@ from __future__ import annotations +from contextlib import asynccontextmanager from logging import getLogger from typing import TYPE_CHECKING, Any, Optional, cast @@ -14,6 +15,8 @@ from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse if TYPE_CHECKING: + from collections.abc import AsyncGenerator, AsyncIterator + from datetime import timedelta from ssl import SSLContext from crawlee import Request @@ -46,6 +49,9 @@ def headers(self) -> HttpHeaders: def read(self) -> bytes: return self._response.read() + def iter_bytes(self) -> AsyncIterator[bytes]: + return self._response.aiter_bytes() + class _HttpxTransport(httpx.AsyncHTTPTransport): """HTTP transport adapter that stores response cookies in a `Session`. 
@@ -182,18 +188,15 @@ async def send_request( session: Session | None = None, proxy_info: ProxyInfo | None = None, ) -> HttpResponse: - if isinstance(headers, dict) or headers is None: - headers = HttpHeaders(headers or {}) - client = self._get_client(proxy_info.url if proxy_info else None) - headers = self._combine_headers(headers) - http_request = client.build_request( + http_request = self._build_request( + client=client, url=url, method=method, - headers=dict(headers) if headers else None, - content=payload, - extensions={'crawlee_session': session if self._persist_cookies_per_session else None}, + headers=headers, + payload=payload, + session=session, ) try: @@ -205,6 +208,60 @@ async def send_request( return _HttpxResponse(response) + @asynccontextmanager + @override + async def stream( + self, + url: str, + *, + method: HttpMethod = 'GET', + headers: HttpHeaders | dict[str, str] | None = None, + payload: HttpPayload | None = None, + session: Session | None = None, + proxy_info: ProxyInfo | None = None, + timeout: timedelta | None = None, + ) -> AsyncGenerator[HttpResponse]: + client = self._get_client(proxy_info.url if proxy_info else None) + + http_request = self._build_request( + client=client, + url=url, + method=method, + headers=headers, + payload=payload, + session=session, + ) + + response = await client.send(http_request, stream=True) + + try: + yield _HttpxResponse(response) + finally: + await response.aclose() + + def _build_request( + self, + client: httpx.AsyncClient, + url: str, + method: HttpMethod, + headers: HttpHeaders | dict[str, str] | None, + payload: HttpPayload | None, + session: Session | None = None, + ) -> httpx.Request: + """Build an `httpx.Request` using the provided parameters.""" + if isinstance(headers, dict) or headers is None: + headers = HttpHeaders(headers or {}) + + headers = self._combine_headers(headers) + + return client.build_request( + url=url, + method=method, + headers=dict(headers) if headers else None, + 
content=payload, + extensions={'crawlee_session': session if self._persist_cookies_per_session else None}, + ) + def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient: """Retrieve or create an HTTP client for the given proxy URL. @@ -262,3 +319,9 @@ def _is_proxy_error(error: httpx.TransportError) -> bool: return True return False + + async def cleanup(self) -> None: + for client in self._client_by_proxy_url.values(): + await client.aclose() + self._client_by_proxy_url.clear() + await self._transport.aclose() From 8a7aa4aea15e33529b4464dd698ef0c317a79942 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 10 Jun 2025 21:37:04 +0000 Subject: [PATCH 14/30] add chunk_size parameter for `iter_bytes` --- src/crawlee/crawlers/_playwright/_types.py | 3 +-- src/crawlee/http_clients/_base.py | 2 +- src/crawlee/http_clients/_curl_impersonate.py | 4 ++-- src/crawlee/http_clients/_httpx.py | 4 ++-- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/crawlee/crawlers/_playwright/_types.py b/src/crawlee/crawlers/_playwright/_types.py index 05b0984fe3..a95ca05224 100644 --- a/src/crawlee/crawlers/_playwright/_types.py +++ b/src/crawlee/crawlers/_playwright/_types.py @@ -44,8 +44,7 @@ class PlaywrightHttpResponse: def read(self) -> bytes: return self._content - async def iter_bytes(self) -> AsyncGenerator[bytes, None]: - chunk_size = None + async def iter_bytes(self, chunk_size: int | None = None) -> AsyncGenerator[bytes, None]: chunk_size = len(self._content) if chunk_size is None else chunk_size for i in range(0, len(self._content), max(chunk_size, 1)): yield self._content[i : i + chunk_size] diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py index c10999bd37..561bac24f8 100644 --- a/src/crawlee/http_clients/_base.py +++ b/src/crawlee/http_clients/_base.py @@ -38,7 +38,7 @@ def headers(self) -> HttpHeaders: def read(self) -> bytes: """Read the content of the response body.""" - def iter_bytes(self) -> 
AsyncIterator[bytes]: + def iter_bytes(self, chunk_size: int | None = None) -> AsyncIterator[bytes]: """Iterate over the content of the response body in chunks.""" diff --git a/src/crawlee/http_clients/_curl_impersonate.py b/src/crawlee/http_clients/_curl_impersonate.py index 5e04ac868c..6abeddff6f 100644 --- a/src/crawlee/http_clients/_curl_impersonate.py +++ b/src/crawlee/http_clients/_curl_impersonate.py @@ -88,8 +88,8 @@ def headers(self) -> HttpHeaders: def read(self) -> bytes: return self._response.content - async def iter_bytes(self) -> AsyncGenerator[bytes, None]: - async for chunk in self._response.aiter_content(): # type: ignore[no-untyped-call] + async def iter_bytes(self, chunk_size: int | None = None) -> AsyncGenerator[bytes, None]: + async for chunk in self._response.aiter_content(chunk_size=chunk_size): # type: ignore[no-untyped-call] yield chunk diff --git a/src/crawlee/http_clients/_httpx.py b/src/crawlee/http_clients/_httpx.py index aaf1edd76e..7b133c702b 100644 --- a/src/crawlee/http_clients/_httpx.py +++ b/src/crawlee/http_clients/_httpx.py @@ -49,8 +49,8 @@ def headers(self) -> HttpHeaders: def read(self) -> bytes: return self._response.read() - def iter_bytes(self) -> AsyncIterator[bytes]: - return self._response.aiter_bytes() + def iter_bytes(self, chunk_size: int | None = None) -> AsyncIterator[bytes]: + return self._response.aiter_bytes(chunk_size=chunk_size) class _HttpxTransport(httpx.AsyncHTTPTransport): From 594604f610a1f1fd05e9b80474cd17401864bde9 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 10 Jun 2025 21:48:35 +0000 Subject: [PATCH 15/30] add support timeout for stream --- src/crawlee/http_clients/_curl_impersonate.py | 1 + src/crawlee/http_clients/_httpx.py | 5 +++++ uv.lock | 4 ++-- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/crawlee/http_clients/_curl_impersonate.py b/src/crawlee/http_clients/_curl_impersonate.py index 6abeddff6f..bb9480f194 100644 --- 
a/src/crawlee/http_clients/_curl_impersonate.py +++ b/src/crawlee/http_clients/_curl_impersonate.py @@ -232,6 +232,7 @@ async def stream( data=payload, cookies=session.cookies.jar if session else None, stream=True, + timeout=timeout.total_seconds() if timeout else None, ) except CurlRequestError as exc: if self._is_proxy_error(exc): diff --git a/src/crawlee/http_clients/_httpx.py b/src/crawlee/http_clients/_httpx.py index 7b133c702b..4f87d3f9b5 100644 --- a/src/crawlee/http_clients/_httpx.py +++ b/src/crawlee/http_clients/_httpx.py @@ -230,6 +230,7 @@ async def stream( headers=headers, payload=payload, session=session, + timeout=timeout, ) response = await client.send(http_request, stream=True) @@ -247,6 +248,7 @@ def _build_request( headers: HttpHeaders | dict[str, str] | None, payload: HttpPayload | None, session: Session | None = None, + timeout: timedelta | None = None, ) -> httpx.Request: """Build an `httpx.Request` using the provided parameters.""" if isinstance(headers, dict) or headers is None: @@ -254,12 +256,15 @@ def _build_request( headers = self._combine_headers(headers) + httpx_timeout = httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None + return client.build_request( url=url, method=method, headers=dict(headers) if headers else None, content=payload, extensions={'crawlee_session': session if self._persist_cookies_per_session else None}, + timeout=httpx_timeout, ) def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient: diff --git a/uv.lock b/uv.lock index 7a57e06cc9..a68c7037d1 100644 --- a/uv.lock +++ b/uv.lock @@ -763,7 +763,7 @@ provides-extras = ["all", "adaptive-crawler", "beautifulsoup", "cli", "curl-impe dev = [ { name = "apify-client" }, { name = "build", specifier = "~=1.2.2" }, - { name = "dycw-pytest-only", specifier = ">=2.1.1" }, + { name = "dycw-pytest-only", specifier = "~=2.1.0" }, { name = "mypy", specifier = "~=1.16.0" }, { name = "pre-commit", specifier = "~=4.2.0" }, { name = "proxy-py", specifier 
= "~=2.4.0" }, @@ -771,7 +771,7 @@ dev = [ { name = "pytest", specifier = "~=8.4.0" }, { name = "pytest-asyncio", specifier = "~=1.0.0" }, { name = "pytest-cov", specifier = "~=6.1.0" }, - { name = "pytest-timeout", specifier = ">=2.4.0" }, + { name = "pytest-timeout", specifier = "~=2.4.0" }, { name = "pytest-xdist", specifier = "~=3.7.0" }, { name = "ruff", specifier = "~=0.11.0" }, { name = "setuptools" }, From 0607be6189968adbf207f1cabfc5f71ef5533b90 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 10 Jun 2025 22:17:50 +0000 Subject: [PATCH 16/30] add test --- .../unit/http_clients/test_curl_impersonate.py | 18 ++++++++++++++++++ tests/unit/http_clients/test_httpx.py | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/tests/unit/http_clients/test_curl_impersonate.py b/tests/unit/http_clients/test_curl_impersonate.py index 745b7abf6d..855feec3e0 100644 --- a/tests/unit/http_clients/test_curl_impersonate.py +++ b/tests/unit/http_clients/test_curl_impersonate.py @@ -116,3 +116,21 @@ async def test_send_request_allow_redirects_false(server_url: URL) -> None: assert response.status_code == 302 assert response.headers['Location'] == target_url + + +async def test_stream(http_client: CurlImpersonateHttpClient, server_url: URL) -> None: + check_body = b"""\ + + Hello, world! 
+ + +""" + + content_body: bytes = b'' + + async with http_client.stream(str(server_url)) as response: + assert response.status_code == 200 + async for chunk in response.iter_bytes(): + content_body += chunk + + assert content_body == check_body diff --git a/tests/unit/http_clients/test_httpx.py b/tests/unit/http_clients/test_httpx.py index cca5110cb7..4af07f0644 100644 --- a/tests/unit/http_clients/test_httpx.py +++ b/tests/unit/http_clients/test_httpx.py @@ -128,3 +128,21 @@ async def test_crawl_follow_redirects_false(server_url: URL) -> None: assert crawling_result.http_response.status_code == 302 assert crawling_result.http_response.headers['Location'] == target_url assert request.loaded_url == redirect_url + + +async def test_stream(http_client: HttpxHttpClient, server_url: URL) -> None: + check_body = b"""\ + + Hello, world! + + +""" + + content_body: bytes = b'' + + async with http_client.stream(str(server_url)) as response: + assert response.status_code == 200 + async for chunk in response.iter_bytes(): + content_body += chunk + + assert content_body == check_body From 2be938372a67b5bca4859ff2fec2dbeed46882e2 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 10 Jun 2025 22:25:18 +0000 Subject: [PATCH 17/30] update docsstrings --- src/crawlee/http_clients/_base.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py index 561bac24f8..55d898ceb9 100644 --- a/src/crawlee/http_clients/_base.py +++ b/src/crawlee/http_clients/_base.py @@ -143,14 +143,34 @@ def stream( proxy_info: ProxyInfo | None = None, timeout: timedelta | None = None, ) -> AbstractAsyncContextManager[HttpResponse]: - """Stream an HTTP request via the client.""" + """Stream an HTTP request via the client. + + This method should be used for downloading potentially large data where you need to process + the response body in chunks rather than loading it entirely into memory. 
+ + Args: + url: The URL to send the request to. + method: The HTTP method to use. + headers: The headers to include in the request. + payload: The data to be sent as the request body. + session: The session associated with the request. + proxy_info: The information about the proxy to be used. + timeout: The maximum time to wait for establishing the connection. + + Raises: + ProxyError: Raised if a proxy-related error occurs. + + Returns: + An async context manager yielding the HTTP response with streaming capabilities. + """ @abstractmethod async def cleanup(self) -> None: """Clean up resources used by the client. - This method is called when the client is no longer needed. - It should be overridden in subclasses to perform any necessary cleanup. + This method is called when the client is no longer needed and should be overridden + in subclasses to perform any necessary cleanup such as closing connections, + releasing file handles, or other resource deallocation. """ async def __aenter__(self) -> HttpClient: From 95579fe8d863a863208eeedce43bf4c48f6d7e95 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 11 Jun 2025 13:42:57 +0000 Subject: [PATCH 18/30] remove `chunk_size` --- src/crawlee/crawlers/_playwright/_types.py | 8 ++++---- src/crawlee/http_clients/_base.py | 2 +- src/crawlee/http_clients/_curl_impersonate.py | 4 ++-- src/crawlee/http_clients/_httpx.py | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/crawlee/crawlers/_playwright/_types.py b/src/crawlee/crawlers/_playwright/_types.py index a95ca05224..75fbbdc695 100644 --- a/src/crawlee/crawlers/_playwright/_types.py +++ b/src/crawlee/crawlers/_playwright/_types.py @@ -44,10 +44,10 @@ class PlaywrightHttpResponse: def read(self) -> bytes: return self._content - async def iter_bytes(self, chunk_size: int | None = None) -> AsyncGenerator[bytes, None]: - chunk_size = len(self._content) if chunk_size is None else chunk_size - for i in range(0, len(self._content), max(chunk_size, 
1)): - yield self._content[i : i + chunk_size] + async def iter_bytes(self) -> AsyncGenerator[bytes, None]: + # Playwright not support `streaming` responses. + # This is a workaround to make it compatible with `HttpResponse` protocol. + yield self._content @classmethod async def from_playwright_response(cls, response: Response | APIResponse, protocol: str) -> Self: diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py index 55d898ceb9..b1695172a3 100644 --- a/src/crawlee/http_clients/_base.py +++ b/src/crawlee/http_clients/_base.py @@ -38,7 +38,7 @@ def headers(self) -> HttpHeaders: def read(self) -> bytes: """Read the content of the response body.""" - def iter_bytes(self, chunk_size: int | None = None) -> AsyncIterator[bytes]: + def iter_bytes(self) -> AsyncIterator[bytes]: """Iterate over the content of the response body in chunks.""" diff --git a/src/crawlee/http_clients/_curl_impersonate.py b/src/crawlee/http_clients/_curl_impersonate.py index bb9480f194..cc83ba2910 100644 --- a/src/crawlee/http_clients/_curl_impersonate.py +++ b/src/crawlee/http_clients/_curl_impersonate.py @@ -88,8 +88,8 @@ def headers(self) -> HttpHeaders: def read(self) -> bytes: return self._response.content - async def iter_bytes(self, chunk_size: int | None = None) -> AsyncGenerator[bytes, None]: - async for chunk in self._response.aiter_content(chunk_size=chunk_size): # type: ignore[no-untyped-call] + async def iter_bytes(self) -> AsyncGenerator[bytes, None]: + async for chunk in self._response.aiter_content(): # type: ignore[no-untyped-call] yield chunk diff --git a/src/crawlee/http_clients/_httpx.py b/src/crawlee/http_clients/_httpx.py index 4f87d3f9b5..b8544c9fd7 100644 --- a/src/crawlee/http_clients/_httpx.py +++ b/src/crawlee/http_clients/_httpx.py @@ -49,8 +49,8 @@ def headers(self) -> HttpHeaders: def read(self) -> bytes: return self._response.read() - def iter_bytes(self, chunk_size: int | None = None) -> AsyncIterator[bytes]: - return 
self._response.aiter_bytes(chunk_size=chunk_size) + def iter_bytes(self) -> AsyncIterator[bytes]: + return self._response.aiter_bytes() class _HttpxTransport(httpx.AsyncHTTPTransport): From 19428223ac0a88630e1debbe3e710073fba2c4c0 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 11 Jun 2025 19:02:14 +0000 Subject: [PATCH 19/30] iter_bytesread_stream --- src/crawlee/crawlers/_playwright/_types.py | 2 +- src/crawlee/http_clients/_base.py | 18 +++++-- src/crawlee/http_clients/_curl_impersonate.py | 10 +++- src/crawlee/http_clients/_httpx.py | 10 +++- .../http_clients/test_curl_impersonate.py | 48 ++++++++++++++++++- tests/unit/http_clients/test_httpx.py | 48 ++++++++++++++++++- 6 files changed, 125 insertions(+), 11 deletions(-) diff --git a/src/crawlee/crawlers/_playwright/_types.py b/src/crawlee/crawlers/_playwright/_types.py index 75fbbdc695..a4303ee7ba 100644 --- a/src/crawlee/crawlers/_playwright/_types.py +++ b/src/crawlee/crawlers/_playwright/_types.py @@ -44,7 +44,7 @@ class PlaywrightHttpResponse: def read(self) -> bytes: return self._content - async def iter_bytes(self) -> AsyncGenerator[bytes, None]: + async def read_stream(self) -> AsyncGenerator[bytes, None]: # Playwright not support `streaming` responses. # This is a workaround to make it compatible with `HttpResponse` protocol. yield self._content diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py index b1695172a3..fa0009d713 100644 --- a/src/crawlee/http_clients/_base.py +++ b/src/crawlee/http_clients/_base.py @@ -36,10 +36,22 @@ def headers(self) -> HttpHeaders: """The HTTP headers received in the response.""" def read(self) -> bytes: - """Read the content of the response body.""" + """Read the entire content of the response body. - def iter_bytes(self) -> AsyncIterator[bytes]: - """Iterate over the content of the response body in chunks.""" + This method loads the complete response body into memory at once. 
It should be used + for responses received from regular HTTP requests (via `send_request` or `crawl` methods). + + Raises: + RuntimeError: If called on a response received from the `stream` method. + """ + + def read_stream(self) -> AsyncIterator[bytes]: + """Iterate over the content of the response body in chunks. + + This method should be used for responses received from the `stream` method to process + large response bodies without loading them entirely into memory. It allows for efficient + processing of potentially large data by yielding chunks sequentially. + """ @dataclass(frozen=True) diff --git a/src/crawlee/http_clients/_curl_impersonate.py b/src/crawlee/http_clients/_curl_impersonate.py index cc83ba2910..f5a6e45312 100644 --- a/src/crawlee/http_clients/_curl_impersonate.py +++ b/src/crawlee/http_clients/_curl_impersonate.py @@ -86,9 +86,17 @@ def headers(self) -> HttpHeaders: return HttpHeaders({key: value for key, value in self._response.headers.items() if value}) def read(self) -> bytes: + if self._response.astream_task: + raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method') return self._response.content - async def iter_bytes(self) -> AsyncGenerator[bytes, None]: + async def read_stream(self) -> AsyncGenerator[bytes, None]: + # Calling `aiter_content` again after executing `astream_task` causes a deadlock; + # this will prevent that from happening.
+ if not self._response.astream_task or self._response.astream_task.done(): # type: ignore[attr-defined] + yield b'' + return + async for chunk in self._response.aiter_content(): # type: ignore[no-untyped-call] yield chunk diff --git a/src/crawlee/http_clients/_httpx.py b/src/crawlee/http_clients/_httpx.py index b8544c9fd7..ed2f3ba43e 100644 --- a/src/crawlee/http_clients/_httpx.py +++ b/src/crawlee/http_clients/_httpx.py @@ -47,10 +47,16 @@ def headers(self) -> HttpHeaders: return HttpHeaders(dict(self._response.headers)) def read(self) -> bytes: + if not self._response.is_closed: + raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method') return self._response.read() - def iter_bytes(self) -> AsyncIterator[bytes]: - return self._response.aiter_bytes() + async def read_stream(self) -> AsyncIterator[bytes]: + if self._response.is_stream_consumed: + yield b'' + else: + async for chunk in self._response.aiter_bytes(): + yield chunk class _HttpxTransport(httpx.AsyncHTTPTransport): diff --git a/tests/unit/http_clients/test_curl_impersonate.py b/tests/unit/http_clients/test_curl_impersonate.py index 855feec3e0..0ab4723538 100644 --- a/tests/unit/http_clients/test_curl_impersonate.py +++ b/tests/unit/http_clients/test_curl_impersonate.py @@ -125,12 +125,56 @@ async def test_stream(http_client: CurlImpersonateHttpClient, server_url: URL) - """ - content_body: bytes = b'' async with http_client.stream(str(server_url)) as response: assert response.status_code == 200 - async for chunk in response.iter_bytes(): + async for chunk in response.read_stream(): content_body += chunk assert content_body == check_body + + +async def test_stream_double_read_stream(http_client: CurlImpersonateHttpClient, server_url: URL) -> None: + check_body = b"""\ + + Hello, world! 
+ + +""" + + async with http_client.stream(str(server_url)) as response: + assert response.status_code == 200 + content_body_first: bytes = b'' + async for chunk in response.read_stream(): + content_body_first += chunk + + content_body_second: bytes = b'' + async for chunk in response.read_stream(): + content_body_second += chunk + + assert content_body_first == check_body + assert content_body_second == b'' + + +async def test_stream_error_for_read(http_client: CurlImpersonateHttpClient, server_url: URL) -> None: + async with http_client.stream(str(server_url)) as response: + assert response.status_code == 200 + + with pytest.raises(RuntimeError): + response.read() + + +async def test_send_request_error_for_read_stream(http_client: CurlImpersonateHttpClient, server_url: URL) -> None: + response = await http_client.send_request(str(server_url)) + + assert response.status_code == 200 + assert b''.join([item async for item in response.read_stream()]) == b'' + + +async def test_send_crawl_error_for_read_stream(http_client: CurlImpersonateHttpClient, server_url: URL) -> None: + response = await http_client.crawl(Request.from_url(str(server_url))) + http_response = response.http_response + + assert http_response.status_code == 200 + assert b''.join([item async for item in http_response.read_stream()]) == b'' diff --git a/tests/unit/http_clients/test_httpx.py b/tests/unit/http_clients/test_httpx.py index 4af07f0644..1e378daa3d 100644 --- a/tests/unit/http_clients/test_httpx.py +++ b/tests/unit/http_clients/test_httpx.py @@ -137,12 +137,56 @@ async def test_stream(http_client: HttpxHttpClient, server_url: URL) -> None: """ - content_body: bytes = b'' async with http_client.stream(str(server_url)) as response: assert response.status_code == 200 - async for chunk in response.iter_bytes(): + async for chunk in response.read_stream(): content_body += chunk assert content_body == check_body + + +async def test_stream_double_read_stream(http_client: HttpxHttpClient, server_url: 
URL) -> None: + check_body = b"""\ + + Hello, world! + + +""" + + async with http_client.stream(str(server_url)) as response: + assert response.status_code == 200 + content_body_first: bytes = b'' + async for chunk in response.read_stream(): + content_body_first += chunk + + content_body_second: bytes = b'' + async for chunk in response.read_stream(): + content_body_second += chunk + + assert content_body_first == check_body + assert content_body_second == b'' + + +async def test_stream_error_for_read(http_client: HttpxHttpClient, server_url: URL) -> None: + async with http_client.stream(str(server_url)) as response: + assert response.status_code == 200 + + with pytest.raises(RuntimeError): + response.read() + + +async def test_send_request_error_for_read_stream(http_client: HttpxHttpClient, server_url: URL) -> None: + response = await http_client.send_request(str(server_url)) + + assert response.status_code == 200 + assert b''.join([item async for item in response.read_stream()]) == b'' + + +async def test_send_crawl_error_for_read_stream(http_client: HttpxHttpClient, server_url: URL) -> None: + response = await http_client.crawl(Request.from_url(str(server_url))) + http_response = response.http_response + + assert http_response.status_code == 200 + assert b''.join([item async for item in http_response.read_stream()]) == b'' From 59166e176faead9a176eaf39c98094435a3eb12c Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 11 Jun 2025 19:21:41 +0000 Subject: [PATCH 20/30] update for use `HttpClient` with `stream` --- src/crawlee/_utils/robots.py | 13 ++-- src/crawlee/_utils/sitemap.py | 68 +++++++++++-------- .../_sitemap_request_loader.py | 15 ++-- tests/unit/_utils/test_sitemap.py | 49 ++++++------- .../test_sitemap_request_loader.py | 13 ++-- 5 files changed, 91 insertions(+), 67 deletions(-) diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py index 4a40731a71..5cbe59b5ed 100644 --- a/src/crawlee/_utils/robots.py +++ 
b/src/crawlee/_utils/robots.py @@ -16,9 +16,12 @@ class RobotsTxtFile: - def __init__(self, url: str, robots: Protego, proxy_info: ProxyInfo | None = None) -> None: + def __init__( + self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None + ) -> None: self._robots = robots self._original_url = URL(url).origin() + self._http_client = http_client self._proxy_info = proxy_info @classmethod @@ -58,7 +61,7 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N robots = Protego.parse(body.decode('utf-8')) - return cls(url, robots, proxy_info=proxy_info) + return cls(url, robots, http_client=http_client, proxy_info=proxy_info) def is_allowed(self, url: str, user_agent: str = '*') -> bool: """Check if the given URL is allowed for the given user agent. @@ -89,8 +92,10 @@ def get_crawl_delay(self, user_agent: str = '*') -> int | None: async def parse_sitemaps(self) -> Sitemap: """Parse the sitemaps from the robots.txt file and return a `Sitemap` instance.""" sitemaps = self.get_sitemaps() - proxy_url = self._proxy_info.url if self._proxy_info else None - return await Sitemap.load(sitemaps, proxy_url) + if not self._http_client: + raise ValueError('HTTP client is required to parse sitemaps.') + + return await Sitemap.load(sitemaps, self._http_client, self._proxy_info) async def parse_urls_from_sitemaps(self) -> list[str]: """Parse the sitemaps in the robots.txt file and return a list URLs.""" diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py index a432c9bb39..37e2b7a777 100644 --- a/src/crawlee/_utils/sitemap.py +++ b/src/crawlee/_utils/sitemap.py @@ -11,7 +11,6 @@ from xml.sax.expatreader import ExpatParser from xml.sax.handler import ContentHandler -import httpx from typing_extensions import NotRequired, override from yarl import URL @@ -19,6 +18,9 @@ from collections.abc import AsyncGenerator from xml.sax.xmlreader import AttributesImpl + from crawlee.http_clients 
import HttpClient + from crawlee.proxy_configuration import ProxyInfo + logger = getLogger(__name__) VALID_CHANGE_FREQS = {'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'} @@ -292,13 +294,15 @@ async def _process_raw_source( async def _fetch_and_process_sitemap( - client: httpx.AsyncClient, + http_client: HttpClient, source: SitemapSource, depth: int, visited_sitemap_urls: set[str], sources: list[SitemapSource], retries_left: int, *, + proxy_info: ProxyInfo | None = None, + timeout: timedelta | None = None, emit_nested_sitemaps: bool, ) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]: """Fetch a sitemap from a URL and process its content.""" @@ -310,9 +314,9 @@ async def _fetch_and_process_sitemap( try: while retries_left > 0: retries_left -= 1 - async with client.stream('GET', sitemap_url, headers=SITEMAP_HEADERS) as response: - response.raise_for_status() - + async with http_client.stream( + sitemap_url, method='GET', headers=SITEMAP_HEADERS, proxy_info=proxy_info, timeout=timeout + ) as response: # Determine content type and compression content_type = response.headers.get('content-type', '') @@ -322,7 +326,7 @@ async def _fetch_and_process_sitemap( try: # Process chunks as they arrive first_chunk = True - async for raw_chunk in response.aiter_bytes(chunk_size=8192): + async for raw_chunk in response.iter_bytes(): # Check if the first chunk is a valid gzip header if first_chunk and raw_chunk.startswith(b'\x1f\x8b'): decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16) @@ -373,40 +377,45 @@ def urls(self) -> list[str]: return self._urls @classmethod - async def try_common_names(cls, url: str, proxy_url: str | None = None) -> Sitemap: + async def try_common_names(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Sitemap: base_url = URL(url) sitemap_urls = [str(base_url.with_path('/sitemap.xml')), str(base_url.with_path('/sitemap.txt'))] - return await cls.load(sitemap_urls, proxy_url) + return await 
cls.load(sitemap_urls, http_client, proxy_info) @classmethod async def load( cls, urls: str | list[str], - proxy_url: str | None = None, + http_client: HttpClient, + proxy_info: ProxyInfo | None = None, parse_sitemap_options: ParseSitemapOptions | None = None, ) -> Sitemap: if isinstance(urls, str): urls = [urls] - return await cls.parse([SitemapSource(type='url', url=url) for url in urls], proxy_url, parse_sitemap_options) + return await cls.parse( + [SitemapSource(type='url', url=url) for url in urls], http_client, proxy_info, parse_sitemap_options + ) @classmethod - async def from_xml_string(cls, content: str, proxy_url: str | None = None) -> Sitemap: - return await cls.parse([SitemapSource(type='raw', content=content)], proxy_url) + async def from_xml_string(cls, content: str) -> Sitemap: + return await cls.parse([SitemapSource(type='raw', content=content)]) @classmethod async def parse( cls, sources: list[SitemapSource], - proxy_url: str | None = None, + http_client: HttpClient | None = None, + proxy_info: ProxyInfo | None = None, parse_sitemap_options: ParseSitemapOptions | None = None, ) -> Sitemap: - urls = [item.loc async for item in parse_sitemap(sources, proxy_url, parse_sitemap_options)] + urls = [item.loc async for item in parse_sitemap(sources, http_client, proxy_info, parse_sitemap_options)] return cls(urls) async def parse_sitemap( initial_sources: list[SitemapSource], - proxy_url: str | None = None, + http_client: HttpClient | None = None, + proxy_info: ProxyInfo | None = None, options: ParseSitemapOptions | None = None, ) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]: """Parse sitemap(s) and yield URLs found in them. 
@@ -420,9 +429,6 @@ async def parse_sitemap( emit_nested_sitemaps = options.get('emit_nested_sitemaps', False) max_depth = options.get('max_depth', float('inf')) sitemap_retries = options.get('sitemap_retries', 3) - timeout = options.get('timeout', timedelta(seconds=30)) - - httpx_timeout = httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None # Setup working state sources = list(initial_sources) @@ -447,18 +453,22 @@ async def parse_sitemap( elif source['type'] == 'url' and 'url' in source: # Add to visited set before processing to avoid duplicates + if http_client is None: + raise RuntimeError('HttpClient must be provided for URL-based sitemap sources.') + visited_sitemap_urls.add(source['url']) - async with httpx.AsyncClient(timeout=httpx_timeout, proxy=proxy_url) as client: - async for result in _fetch_and_process_sitemap( - client, - source, - depth, - visited_sitemap_urls, - sources, - sitemap_retries, - emit_nested_sitemaps=emit_nested_sitemaps, - ): - yield result + async for result in _fetch_and_process_sitemap( + http_client, + source, + depth, + visited_sitemap_urls, + sources, + sitemap_retries, + emit_nested_sitemaps=emit_nested_sitemaps, + proxy_info=proxy_info, + timeout=options.get('timeout', timedelta(seconds=30)), + ): + yield result else: logger.warning(f'Invalid source configuration: {source}') diff --git a/src/crawlee/request_loaders/_sitemap_request_loader.py b/src/crawlee/request_loaders/_sitemap_request_loader.py index 37d0473db5..c3dda01938 100644 --- a/src/crawlee/request_loaders/_sitemap_request_loader.py +++ b/src/crawlee/request_loaders/_sitemap_request_loader.py @@ -15,6 +15,8 @@ import re from collections.abc import Sequence + from crawlee.http_clients import HttpClient + from crawlee.proxy_configuration import ProxyInfo from crawlee.storage_clients.models import ProcessedRequest @@ -32,8 +34,9 @@ class SitemapRequestLoader(RequestLoader): def __init__( self, sitemap_urls: list[str], + http_client: HttpClient, *, - 
proxy_url: str | None = None, + proxy_info: ProxyInfo | None = None, include: list[re.Pattern[Any] | Glob] | None = None, exclude: list[re.Pattern[Any] | Glob] | None = None, max_buffer_size: int = 200, @@ -43,16 +46,19 @@ def __init__( Args: sitemap_urls: Configuration options for the loader. - proxy_url: Optional proxy to use for fetching sitemaps. + proxy_info: Optional proxy to use for fetching sitemaps. include: List of glob or regex patterns to include URLs. exclude: List of glob or regex patterns to exclude URLs. max_buffer_size: Maximum number of URLs to buffer in memory. parse_sitemap_options: Options for parsing sitemaps, such as `SitemapSource` and `max_urls`. + http_client: the instance of `HttpClient` to use for fetching sitemaps. """ + self._http_client = http_client + self._sitemap_urls = sitemap_urls self._include = include self._exclude = exclude - self._proxy_url = proxy_url + self._proxy_info = proxy_info self._parse_sitemap_options = parse_sitemap_options or ParseSitemapOptions() self._handled_count = 0 @@ -101,7 +107,8 @@ async def _load_sitemaps(self) -> None: try: async for item in parse_sitemap( [SitemapSource(type='url', url=url) for url in self._sitemap_urls], - proxy_url=self._proxy_url, + self._http_client, + proxy_info=self._proxy_info, options=self._parse_sitemap_options, ): # Only process URL items (not nested sitemaps) diff --git a/tests/unit/_utils/test_sitemap.py b/tests/unit/_utils/test_sitemap.py index 8e31d436d8..807090eaa4 100644 --- a/tests/unit/_utils/test_sitemap.py +++ b/tests/unit/_utils/test_sitemap.py @@ -5,6 +5,7 @@ from yarl import URL from crawlee._utils.sitemap import Sitemap, SitemapUrl, parse_sitemap +from crawlee.http_clients._base import HttpClient BASIC_SITEMAP = """ @@ -55,24 +56,24 @@ def encode_base64(data: bytes) -> str: return base64.b64encode(data).decode('utf-8') -async def test_sitemap(server_url: URL) -> None: +async def test_sitemap(server_url: URL, http_client: HttpClient) -> None: """Test loading a 
basic sitemap.""" sitemap_url = (server_url / 'sitemap.xml').with_query( base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/xml; charset=utf-8' ) - sitemap = await Sitemap.load(str(sitemap_url)) + sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 5 assert set(sitemap.urls) == BASIC_RESULTS -async def test_extract_metadata_sitemap(server_url: URL) -> None: +async def test_extract_metadata_sitemap(server_url: URL, http_client: HttpClient) -> None: """Test extracting item metadata from a sitemap.""" sitemap_url = (server_url / 'sitemap.xml').with_query( base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/xml; charset=utf-8' ) - items = [item async for item in parse_sitemap([{'type': 'url', 'url': str(sitemap_url)}])] + items = [item async for item in parse_sitemap([{'type': 'url', 'url': str(sitemap_url)}], http_client=http_client)] assert len(items) == 5 assert items[0] == SitemapUrl( loc='http://not-exists.com/', @@ -83,60 +84,60 @@ async def test_extract_metadata_sitemap(server_url: URL) -> None: ) -async def test_gzipped_sitemap(server_url: URL) -> None: +async def test_gzipped_sitemap(server_url: URL, http_client: HttpClient) -> None: """Test loading a gzipped sitemap with correct type and .xml.gz url.""" gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP)) sitemap_url = (server_url / 'sitemap.xml.gz').with_query(base64=gzipped_data, c_type='application/gzip') - sitemap = await Sitemap.load(str(sitemap_url)) + sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 5 assert set(sitemap.urls) == BASIC_RESULTS -async def test_gzipped_sitemap_with_invalid_data(server_url: URL) -> None: +async def test_gzipped_sitemap_with_invalid_data(server_url: URL, http_client: HttpClient) -> None: """Test loading a invalid gzipped sitemap with correct type and .xml.gz url.""" compress_data = compress_gzip(BASIC_SITEMAP) invalid_gzipped_data = 
encode_base64(compress_data[:30]) sitemap_url = (server_url / 'sitemap.xml.gz').with_query(base64=invalid_gzipped_data, c_type='application/gzip') - sitemap = await Sitemap.load(str(sitemap_url)) + sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 0 assert sitemap.urls == [] -async def test_gz_sitemap_with_non_gzipped(server_url: URL) -> None: +async def test_gz_sitemap_with_non_gzipped(server_url: URL, http_client: HttpClient) -> None: """Test loading a sitemap with gzip type and .xml.gz url, but without gzipped data.""" sitemap_url = (server_url / 'sitemap.xml.gz').with_query( base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/gzip' ) - sitemap = await Sitemap.load(str(sitemap_url)) + sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 5 assert set(sitemap.urls) == BASIC_RESULTS -async def test_gzipped_sitemap_with_bad_type(server_url: URL) -> None: +async def test_gzipped_sitemap_with_bad_type(server_url: URL, http_client: HttpClient) -> None: """Test loading a gzipped sitemap with bad type and .xml.gz url.""" gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP)) sitemap_url = (server_url / 'sitemap.xml.gz').with_query( base64=gzipped_data, c_type='application/xml; charset=utf-8' ) - sitemap = await Sitemap.load(str(sitemap_url)) + sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 5 assert set(sitemap.urls) == BASIC_RESULTS -async def test_xml_sitemap_with_gzipped_data(server_url: URL) -> None: +async def test_xml_sitemap_with_gzipped_data(server_url: URL, http_client: HttpClient) -> None: """Test loading a gzipped sitemap with correct type and .xml url.""" gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP)) sitemap_url = (server_url / 'sitemap.xml').with_query(base64=gzipped_data, c_type='application/gzip') - sitemap = await Sitemap.load(str(sitemap_url)) + sitemap = await 
Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 5 assert set(sitemap.urls) == BASIC_RESULTS -async def test_parent_sitemap(server_url: URL) -> None: +async def test_parent_sitemap(server_url: URL, http_client: HttpClient) -> None: """Test loading a parent sitemap that references child sitemaps.""" parent_sitemap = """ @@ -157,21 +158,21 @@ async def test_parent_sitemap(server_url: URL) -> None: encoded_parent_sitemap_content = encode_base64(parent_sitemap_content.encode()) parent_sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encoded_parent_sitemap_content) - sitemap = await Sitemap.load(str(parent_sitemap_url)) + sitemap = await Sitemap.load(str(parent_sitemap_url), http_client=http_client) assert len(sitemap.urls) == 10 assert set(sitemap.urls) == BASIC_RESULTS -async def test_non_sitemap_url(server_url: URL) -> None: +async def test_non_sitemap_url(server_url: URL, http_client: HttpClient) -> None: """Test loading a URL that does not point to a sitemap.""" - sitemap = await Sitemap.load(str(server_url)) + sitemap = await Sitemap.load(str(server_url), http_client=http_client) assert len(sitemap.urls) == 0 assert sitemap.urls == [] -async def test_cdata_sitemap(server_url: URL) -> None: +async def test_cdata_sitemap(server_url: URL, http_client: HttpClient) -> None: """Test loading a sitemap with CDATA sections.""" cdata_sitemap = """ @@ -184,13 +185,13 @@ async def test_cdata_sitemap(server_url: URL) -> None: sitemap_url = (server_url / 'sitemap.xml').with_query( base64=encode_base64(cdata_sitemap.encode()), c_type='application/xml; charset=utf-8' ) - sitemap = await Sitemap.load(str(sitemap_url)) + sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 1 assert sitemap.urls == ['http://not-exists.com/catalog'] -async def test_txt_sitemap(server_url: URL) -> None: +async def test_txt_sitemap(server_url: URL, http_client: HttpClient) -> None: """Test loading a plain text 
sitemap.""" urls = [ 'http://not-exists.com/catalog?item=78&desc=vacation_crete', @@ -199,7 +200,7 @@ async def test_txt_sitemap(server_url: URL) -> None: txt_sitemap_content = '\n'.join(urls) sitemap_url = (server_url / 'sitemap.txt').with_query(base64=encode_base64(txt_sitemap_content.encode())) - sitemap = await Sitemap.load(str(sitemap_url)) + sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 2 assert set(sitemap.urls) == { @@ -208,7 +209,7 @@ async def test_txt_sitemap(server_url: URL) -> None: } -async def test_sitemap_pretty(server_url: URL) -> None: +async def test_sitemap_pretty(server_url: URL, http_client: HttpClient) -> None: """Test loading a pretty-printed sitemap.""" pretty_sitemap = """ @@ -233,7 +234,7 @@ async def test_sitemap_pretty(server_url: URL) -> None: sitemap_url = (server_url / 'sitemap.xml').with_query( base64=encode_base64(pretty_sitemap.encode()), c_type='application/xml; charset=utf-8' ) - sitemap = await Sitemap.load(str(sitemap_url)) + sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 1 assert sitemap.urls == ['http://not-exists.com/catalog?item=80&desc=vacation_turkey'] diff --git a/tests/unit/request_loaders/test_sitemap_request_loader.py b/tests/unit/request_loaders/test_sitemap_request_loader.py index c44e822e85..6e73708cb2 100644 --- a/tests/unit/request_loaders/test_sitemap_request_loader.py +++ b/tests/unit/request_loaders/test_sitemap_request_loader.py @@ -3,6 +3,7 @@ from yarl import URL +from crawlee.http_clients._base import HttpClient from crawlee.request_loaders._sitemap_request_loader import SitemapRequestLoader BASIC_SITEMAP = """ @@ -46,9 +47,9 @@ def encode_base64(data: bytes) -> str: return base64.b64encode(data).decode('utf-8') -async def test_sitemap_traversal(server_url: URL) -> None: +async def test_sitemap_traversal(server_url: URL, http_client: HttpClient) -> None: sitemap_url = (server_url / 
'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) - sitemap_loader = SitemapRequestLoader([str(sitemap_url)]) + sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client) while not await sitemap_loader.is_finished(): item = await sitemap_loader.fetch_next_request() @@ -62,9 +63,9 @@ async def test_sitemap_traversal(server_url: URL) -> None: assert await sitemap_loader.get_handled_count() == 5 -async def test_is_empty_does_not_depend_on_fetch_next_request(server_url: URL) -> None: +async def test_is_empty_does_not_depend_on_fetch_next_request(server_url: URL, http_client: HttpClient) -> None: sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) - sitemap_loader = SitemapRequestLoader([str(sitemap_url)]) + sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client) items = [] @@ -84,9 +85,9 @@ async def test_is_empty_does_not_depend_on_fetch_next_request(server_url: URL) - assert await sitemap_loader.is_finished() -async def test_abort_sitemap_loading(server_url: URL) -> None: +async def test_abort_sitemap_loading(server_url: URL, http_client: HttpClient) -> None: sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) - sitemap_loader = SitemapRequestLoader([str(sitemap_url)], max_buffer_size=2) + sitemap_loader = SitemapRequestLoader([str(sitemap_url)], max_buffer_size=2, http_client=http_client) item = await sitemap_loader.fetch_next_request() assert item is not None From eaf185ba0b7799f180b280a2b5f4b468a535d41b Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 11 Jun 2025 19:24:26 +0000 Subject: [PATCH 21/30] update with `read_stream` --- src/crawlee/_utils/sitemap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py index 37e2b7a777..2ebe27f0a0 100644 --- a/src/crawlee/_utils/sitemap.py +++ b/src/crawlee/_utils/sitemap.py 
@@ -326,7 +326,7 @@ async def _fetch_and_process_sitemap( try: # Process chunks as they arrive first_chunk = True - async for raw_chunk in response.iter_bytes(): + async for raw_chunk in response.read_stream(): # Check if the first chunk is a valid gzip header if first_chunk and raw_chunk.startswith(b'\x1f\x8b'): decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16) From 03b03174ec67b21fd36c3de18da63df95f38fe91 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 12 Jun 2025 11:51:48 +0000 Subject: [PATCH 22/30] add activate property --- src/crawlee/http_clients/_base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py index fa0009d713..535c38a9d6 100644 --- a/src/crawlee/http_clients/_base.py +++ b/src/crawlee/http_clients/_base.py @@ -87,6 +87,11 @@ def __init__( # Flag to indicate the context state. self._active = False + @property + def active(self) -> bool: + """Indicate whether the context is active.""" + return self._active + @abstractmethod async def crawl( self, From fac5ca1c14821b78bef08e5385190cc9934555a0 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 12 Jun 2025 11:55:21 +0000 Subject: [PATCH 23/30] add active property --- src/crawlee/http_clients/_base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py index fa0009d713..535c38a9d6 100644 --- a/src/crawlee/http_clients/_base.py +++ b/src/crawlee/http_clients/_base.py @@ -87,6 +87,11 @@ def __init__( # Flag to indicate the context state. 
self._active = False + @property + def active(self) -> bool: + """Indicate whether the context is active.""" + return self._active + @abstractmethod async def crawl( self, From 8abd28ec8de8e644e73716a0fa74bfff38e6126c Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Thu, 12 Jun 2025 16:39:38 +0300 Subject: [PATCH 24/30] Update src/crawlee/crawlers/_playwright/_types.py Co-authored-by: Jan Buchar --- src/crawlee/crawlers/_playwright/_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_playwright/_types.py b/src/crawlee/crawlers/_playwright/_types.py index a4303ee7ba..8d0000adab 100644 --- a/src/crawlee/crawlers/_playwright/_types.py +++ b/src/crawlee/crawlers/_playwright/_types.py @@ -45,7 +45,7 @@ def read(self) -> bytes: return self._content async def read_stream(self) -> AsyncGenerator[bytes, None]: - # Playwright not support `streaming` responses. + # Playwright does not support `streaming` responses. # This is a workaround to make it compatible with `HttpResponse` protocol. 
yield self._content From 3c531694877e775954c89e0fb0cef75391baf514 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 12 Jun 2025 14:51:24 +0000 Subject: [PATCH 25/30] raise Error for `read_stream` if `stream` is consumed --- src/crawlee/http_clients/_base.py | 3 +++ src/crawlee/http_clients/_curl_impersonate.py | 5 +---- src/crawlee/http_clients/_httpx.py | 2 +- tests/unit/http_clients/test_curl_impersonate.py | 14 +++++++------- tests/unit/http_clients/test_httpx.py | 14 +++++++------- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py index 535c38a9d6..46e2029f03 100644 --- a/src/crawlee/http_clients/_base.py +++ b/src/crawlee/http_clients/_base.py @@ -51,6 +51,9 @@ def read_stream(self) -> AsyncIterator[bytes]: This method should be used for responses received from the `stream` method to process large response bodies without loading them entirely into memory. It allows for efficient processing of potentially large data by yielding chunks sequentially. + + Raises: + RuntimeError: If called after the stream has been fully consumed. """ diff --git a/src/crawlee/http_clients/_curl_impersonate.py b/src/crawlee/http_clients/_curl_impersonate.py index f5a6e45312..f35df40929 100644 --- a/src/crawlee/http_clients/_curl_impersonate.py +++ b/src/crawlee/http_clients/_curl_impersonate.py @@ -91,11 +91,8 @@ def read(self) -> bytes: return self._response.content async def read_stream(self) -> AsyncGenerator[bytes, None]: - # Calling `aiter_content` again after executing `astream_task` causes DeadLock - # this will prevent that from happening. 
if not self._response.astream_task or self._response.astream_task.done(): # type: ignore[attr-defined] - yield b'' - return + raise RuntimeError('Stream is already consumed.') async for chunk in self._response.aiter_content(): # type: ignore[no-untyped-call] yield chunk diff --git a/src/crawlee/http_clients/_httpx.py b/src/crawlee/http_clients/_httpx.py index ed2f3ba43e..a80e271f43 100644 --- a/src/crawlee/http_clients/_httpx.py +++ b/src/crawlee/http_clients/_httpx.py @@ -53,7 +53,7 @@ def read(self) -> bytes: async def read_stream(self) -> AsyncIterator[bytes]: if self._response.is_stream_consumed: - yield b'' + raise RuntimeError('Stream is already consumed.') else: async for chunk in self._response.aiter_bytes(): yield chunk diff --git a/tests/unit/http_clients/test_curl_impersonate.py b/tests/unit/http_clients/test_curl_impersonate.py index 0ab4723538..5a0d632db6 100644 --- a/tests/unit/http_clients/test_curl_impersonate.py +++ b/tests/unit/http_clients/test_curl_impersonate.py @@ -135,7 +135,7 @@ async def test_stream(http_client: CurlImpersonateHttpClient, server_url: URL) - assert content_body == check_body -async def test_stream_double_read_stream(http_client: CurlImpersonateHttpClient, server_url: URL) -> None: +async def test_stream_error_double_read_stream(http_client: CurlImpersonateHttpClient, server_url: URL) -> None: check_body = b"""\ Hello, world! 
@@ -149,12 +149,10 @@ async def test_stream_double_read_stream(http_client: CurlImpersonateHttpClient, async for chunk in response.read_stream(): content_body_first += chunk - content_body_second: bytes = b'' - async for chunk in response.read_stream(): - content_body_second += chunk + with pytest.raises(RuntimeError): + [chunk async for chunk in response.read_stream()] assert content_body_first == check_body - assert content_body_second == b'' async def test_stream_error_for_read(http_client: CurlImpersonateHttpClient, server_url: URL) -> None: @@ -169,7 +167,8 @@ async def test_send_request_error_for_read_stream(http_client: CurlImpersonateHt response = await http_client.send_request(str(server_url)) assert response.status_code == 200 - assert b''.join([item async for item in response.read_stream()]) == b'' + with pytest.raises(RuntimeError): + [item async for item in response.read_stream()] async def test_send_crawl_error_for_read_stream(http_client: CurlImpersonateHttpClient, server_url: URL) -> None: @@ -177,4 +176,5 @@ async def test_send_crawl_error_for_read_stream(http_client: CurlImpersonateHttp http_response = response.http_response assert http_response.status_code == 200 - assert b''.join([item async for item in http_response.read_stream()]) == b'' + with pytest.raises(RuntimeError): + [item async for item in http_response.read_stream()] diff --git a/tests/unit/http_clients/test_httpx.py b/tests/unit/http_clients/test_httpx.py index 1e378daa3d..38c28f7ade 100644 --- a/tests/unit/http_clients/test_httpx.py +++ b/tests/unit/http_clients/test_httpx.py @@ -147,7 +147,7 @@ async def test_stream(http_client: HttpxHttpClient, server_url: URL) -> None: assert content_body == check_body -async def test_stream_double_read_stream(http_client: HttpxHttpClient, server_url: URL) -> None: +async def test_stream_error_double_read_stream(http_client: HttpxHttpClient, server_url: URL) -> None: check_body = b"""\ Hello, world! 
@@ -161,12 +161,10 @@ async def test_stream_double_read_stream(http_client: HttpxHttpClient, server_ur async for chunk in response.read_stream(): content_body_first += chunk - content_body_second: bytes = b'' - async for chunk in response.read_stream(): - content_body_second += chunk + with pytest.raises(RuntimeError): + [chunk async for chunk in response.read_stream()] assert content_body_first == check_body - assert content_body_second == b'' async def test_stream_error_for_read(http_client: HttpxHttpClient, server_url: URL) -> None: @@ -181,7 +179,8 @@ async def test_send_request_error_for_read_stream(http_client: HttpxHttpClient, response = await http_client.send_request(str(server_url)) assert response.status_code == 200 - assert b''.join([item async for item in response.read_stream()]) == b'' + with pytest.raises(RuntimeError): + [item async for item in response.read_stream()] async def test_send_crawl_error_for_read_stream(http_client: HttpxHttpClient, server_url: URL) -> None: @@ -189,4 +188,5 @@ async def test_send_crawl_error_for_read_stream(http_client: HttpxHttpClient, se http_response = response.http_response assert http_response.status_code == 200 - assert b''.join([item async for item in http_response.read_stream()]) == b'' + with pytest.raises(RuntimeError): + [item async for item in http_response.read_stream()] From 54ed3f8893436246d48a00b95b1325a6672734d3 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 12 Jun 2025 14:59:58 +0000 Subject: [PATCH 26/30] update Error --- src/crawlee/http_clients/_base.py | 3 ++- src/crawlee/http_clients/_curl_impersonate.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py index 46e2029f03..5cb48759e3 100644 --- a/src/crawlee/http_clients/_base.py +++ b/src/crawlee/http_clients/_base.py @@ -53,7 +53,8 @@ def read_stream(self) -> AsyncIterator[bytes]: processing of potentially large data by yielding chunks sequentially. 
Raises: - RuntimeError: If called after the stream has been fully consumed. + RuntimeError: If the stream has already been consumed or if the response was not obtained from the `stream` + method. """ diff --git a/src/crawlee/http_clients/_curl_impersonate.py b/src/crawlee/http_clients/_curl_impersonate.py index f35df40929..ccfa1bda33 100644 --- a/src/crawlee/http_clients/_curl_impersonate.py +++ b/src/crawlee/http_clients/_curl_impersonate.py @@ -92,7 +92,9 @@ def read(self) -> bytes: async def read_stream(self) -> AsyncGenerator[bytes, None]: if not self._response.astream_task or self._response.astream_task.done(): # type: ignore[attr-defined] - raise RuntimeError('Stream is already consumed.') + raise RuntimeError( + 'Cannot read stream: either already consumed or Response not obtained from `stream` method' + ) async for chunk in self._response.aiter_content(): # type: ignore[no-untyped-call] yield chunk From f9339540ddc18df0e6ddff97f718e241d22ff64c Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 12 Jun 2025 18:29:12 +0000 Subject: [PATCH 27/30] add reuse tests for context manager --- src/crawlee/http_clients/_httpx.py | 27 ++++++++++++------- .../http_clients/test_curl_impersonate.py | 23 ++++++++++++++++ tests/unit/http_clients/test_httpx.py | 23 ++++++++++++++++ 3 files changed, 63 insertions(+), 10 deletions(-) diff --git a/src/crawlee/http_clients/_httpx.py b/src/crawlee/http_clients/_httpx.py index a80e271f43..a580e4ca8b 100644 --- a/src/crawlee/http_clients/_httpx.py +++ b/src/crawlee/http_clients/_httpx.py @@ -134,15 +134,7 @@ def __init__( self._ssl_context = httpx.create_ssl_context(verify=verify) - # Configure connection pool limits and keep-alive connections for transport - limits = async_client_kwargs.get('limits', httpx.Limits(max_connections=1000, max_keepalive_connections=200)) - - self._transport = _HttpxTransport( - http1=http1, - http2=http2, - verify=self._ssl_context, - limits=limits, - ) + self._transport: _HttpxTransport | None = 
None self._client_by_proxy_url = dict[Optional[str], httpx.AsyncClient]() @@ -278,6 +270,19 @@ def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient: If a client for the specified proxy URL does not exist, create and store a new one. """ + if not self._transport: + # Configure connection pool limits and keep-alive connections for transport + limits = self._async_client_kwargs.get( + 'limits', httpx.Limits(max_connections=1000, max_keepalive_connections=200) + ) + + self._transport = _HttpxTransport( + http1=self._http1, + http2=self._http2, + verify=self._ssl_context, + limits=limits, + ) + if proxy_url not in self._client_by_proxy_url: # Prepare a default kwargs for the new client. kwargs: dict[str, Any] = { @@ -335,4 +340,6 @@ async def cleanup(self) -> None: for client in self._client_by_proxy_url.values(): await client.aclose() self._client_by_proxy_url.clear() - await self._transport.aclose() + if self._transport: + await self._transport.aclose() + self._transport = None diff --git a/tests/unit/http_clients/test_curl_impersonate.py b/tests/unit/http_clients/test_curl_impersonate.py index 5a0d632db6..2e6e2cd42c 100644 --- a/tests/unit/http_clients/test_curl_impersonate.py +++ b/tests/unit/http_clients/test_curl_impersonate.py @@ -178,3 +178,26 @@ async def test_send_crawl_error_for_read_stream(http_client: CurlImpersonateHttp assert http_response.status_code == 200 with pytest.raises(RuntimeError): [item async for item in http_response.read_stream()] + + +async def test_reuse_context_manager(http_client: CurlImpersonateHttpClient, server_url: URL) -> None: + async with http_client: + response = await http_client.send_request(str(server_url)) + assert response.status_code == 200 + + # Reusing the context manager should not raise an error + async with http_client: + response = await http_client.send_request(str(server_url)) + assert response.status_code == 200 + + +async def test_work_after_cleanup(http_client: CurlImpersonateHttpClient, server_url: 
URL) -> None: + response = await http_client.send_request(str(server_url)) + assert response.status_code == 200 + + # Cleanup the client + await http_client.cleanup() + + # After cleanup, the client should still work + response = await http_client.send_request(str(server_url)) + assert response.status_code == 200 diff --git a/tests/unit/http_clients/test_httpx.py b/tests/unit/http_clients/test_httpx.py index 38c28f7ade..1f061f4292 100644 --- a/tests/unit/http_clients/test_httpx.py +++ b/tests/unit/http_clients/test_httpx.py @@ -190,3 +190,26 @@ async def test_send_crawl_error_for_read_stream(http_client: HttpxHttpClient, se assert http_response.status_code == 200 with pytest.raises(RuntimeError): [item async for item in http_response.read_stream()] + + +async def test_reuse_context_manager(http_client: HttpxHttpClient, server_url: URL) -> None: + async with http_client: + response = await http_client.send_request(str(server_url)) + assert response.status_code == 200 + + # Reusing the context manager should not raise an error + async with http_client: + response = await http_client.send_request(str(server_url)) + assert response.status_code == 200 + + +async def test_work_after_cleanup(http_client: HttpxHttpClient, server_url: URL) -> None: + response = await http_client.send_request(str(server_url)) + assert response.status_code == 200 + + # Cleanup the client + await http_client.cleanup() + + # After cleanup, the client should still work + response = await http_client.send_request(str(server_url)) + assert response.status_code == 200 From b501739668dd58c072d1d015bf8ef43a0a8f25e0 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 18 Jun 2025 16:01:19 +0000 Subject: [PATCH 28/30] Use context manager in test --- .../http_clients/test_curl_impersonate.py | 41 ++++++++++--------- tests/unit/http_clients/test_httpx.py | 11 +++-- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/tests/unit/http_clients/test_curl_impersonate.py 
b/tests/unit/http_clients/test_curl_impersonate.py index 2e6e2cd42c..77e79d474f 100644 --- a/tests/unit/http_clients/test_curl_impersonate.py +++ b/tests/unit/http_clients/test_curl_impersonate.py @@ -12,14 +12,18 @@ from crawlee.statistics import Statistics if TYPE_CHECKING: + from collections.abc import AsyncGenerator + from yarl import URL + from crawlee.http_clients import HttpClient from crawlee.proxy_configuration import ProxyInfo @pytest.fixture -def http_client() -> CurlImpersonateHttpClient: - return CurlImpersonateHttpClient(http_version=CurlHttpVersion.V1_1) +async def http_client() -> AsyncGenerator[HttpClient]: + async with CurlImpersonateHttpClient(http_version=CurlHttpVersion.V1_1) as client: + yield client @pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows') @@ -84,17 +88,16 @@ async def test_crawl_allow_redirects_by_default(http_client: CurlImpersonateHttp async def test_crawl_allow_redirects_false(server_url: URL) -> None: - http_client = CurlImpersonateHttpClient(allow_redirects=False, http_version=CurlHttpVersion.V1_1) - - target_url = str(server_url / 'status/200') - redirect_url = str((server_url / 'redirect').update_query(url=target_url)) - request = Request.from_url(redirect_url) + async with CurlImpersonateHttpClient(allow_redirects=False, http_version=CurlHttpVersion.V1_1) as http_client: + target_url = str(server_url / 'status/200') + redirect_url = str((server_url / 'redirect').update_query(url=target_url)) + request = Request.from_url(redirect_url) - crawling_result = await http_client.crawl(request) + crawling_result = await http_client.crawl(request) - assert crawling_result.http_response.status_code == 302 - assert crawling_result.http_response.headers['Location'] == target_url - assert request.loaded_url == redirect_url + assert crawling_result.http_response.status_code == 302 + assert crawling_result.http_response.headers['Location'] == target_url + assert request.loaded_url == redirect_url async def 
test_send_request_allow_redirects_by_default(http_client: CurlImpersonateHttpClient, server_url: URL) -> None: @@ -107,15 +110,14 @@ async def test_send_request_allow_redirects_by_default(http_client: CurlImperson async def test_send_request_allow_redirects_false(server_url: URL) -> None: - http_client = CurlImpersonateHttpClient(allow_redirects=False, http_version=CurlHttpVersion.V1_1) + async with CurlImpersonateHttpClient(allow_redirects=False, http_version=CurlHttpVersion.V1_1) as http_client: + target_url = str(server_url / 'status/200') + redirect_url = str((server_url / 'redirect').update_query(url=target_url)) - target_url = str(server_url / 'status/200') - redirect_url = str((server_url / 'redirect').update_query(url=target_url)) - - response = await http_client.send_request(redirect_url) + response = await http_client.send_request(redirect_url) - assert response.status_code == 302 - assert response.headers['Location'] == target_url + assert response.status_code == 302 + assert response.headers['Location'] == target_url async def test_stream(http_client: CurlImpersonateHttpClient, server_url: URL) -> None: @@ -180,7 +182,8 @@ async def test_send_crawl_error_for_read_stream(http_client: CurlImpersonateHttp [item async for item in http_response.read_stream()] -async def test_reuse_context_manager(http_client: CurlImpersonateHttpClient, server_url: URL) -> None: +async def test_reuse_context_manager(server_url: URL) -> None: + http_client = CurlImpersonateHttpClient() async with http_client: response = await http_client.send_request(str(server_url)) assert response.status_code == 200 diff --git a/tests/unit/http_clients/test_httpx.py b/tests/unit/http_clients/test_httpx.py index 1f061f4292..4bee7f9460 100644 --- a/tests/unit/http_clients/test_httpx.py +++ b/tests/unit/http_clients/test_httpx.py @@ -14,14 +14,18 @@ from crawlee.statistics import Statistics if TYPE_CHECKING: + from collections.abc import AsyncGenerator + from yarl import URL + from 
crawlee.http_clients import HttpClient from crawlee.proxy_configuration import ProxyInfo @pytest.fixture -def http_client() -> HttpxHttpClient: - return HttpxHttpClient(http2=False) +async def http_client() -> AsyncGenerator[HttpClient]: + async with HttpxHttpClient(http2=False) as client: + yield client async def test_http_1(server_url: URL) -> None: @@ -192,7 +196,8 @@ async def test_send_crawl_error_for_read_stream(http_client: HttpxHttpClient, se [item async for item in http_response.read_stream()] -async def test_reuse_context_manager(http_client: HttpxHttpClient, server_url: URL) -> None: +async def test_reuse_context_manager(server_url: URL) -> None: + http_client = HttpxHttpClient() async with http_client: response = await http_client.send_request(str(server_url)) assert response.status_code == 200 From 391af8e3d07201d6eab04414f921d7ed3e6f7015 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 24 Jun 2025 18:52:02 +0000 Subject: [PATCH 29/30] add stream decoder for UTF-8 --- src/crawlee/_utils/sitemap.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py index 2ebe27f0a0..6a5b85d710 100644 --- a/src/crawlee/_utils/sitemap.py +++ b/src/crawlee/_utils/sitemap.py @@ -2,6 +2,7 @@ import asyncio import zlib +from codecs import getincrementaldecoder from contextlib import suppress from dataclasses import dataclass from datetime import datetime, timedelta @@ -320,6 +321,8 @@ async def _fetch_and_process_sitemap( # Determine content type and compression content_type = response.headers.get('content-type', '') + decoder = getincrementaldecoder('utf-8')(errors='replace') + # Create appropriate parser parser = _get_parser(content_type, sitemap_url) decompressor = None @@ -333,7 +336,7 @@ async def _fetch_and_process_sitemap( first_chunk = False chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk - text_chunk = chunk.decode('utf-8', errors='replace') + text_chunk 
= decoder.decode(chunk) async for item in parser.process_chunk(text_chunk): async for result in _process_sitemap_item( item, From 592621129fe3df2b8070affe2e3c2c9fed92075e Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Fri, 27 Jun 2025 19:34:16 +0000 Subject: [PATCH 30/30] add `SitemapRequestLoader` in docs guide --- .../request_loaders/sitemap_example.py | 28 +++++++++++++++++++ docs/guides/request_loaders.mdx | 20 +++++++++++-- 2 files changed, 46 insertions(+), 2 deletions(-) create mode 100644 docs/guides/code_examples/request_loaders/sitemap_example.py diff --git a/docs/guides/code_examples/request_loaders/sitemap_example.py b/docs/guides/code_examples/request_loaders/sitemap_example.py new file mode 100644 index 0000000000..2ed2a62e96 --- /dev/null +++ b/docs/guides/code_examples/request_loaders/sitemap_example.py @@ -0,0 +1,28 @@ +import asyncio +import re + +from crawlee.http_clients import HttpxHttpClient +from crawlee.request_loaders import SitemapRequestLoader + + +async def main() -> None: + # Create an HTTP client for fetching sitemaps + async with HttpxHttpClient() as http_client: + # Create a sitemap request loader with URL filtering + sitemap_loader = SitemapRequestLoader( + sitemap_urls=['https://crawlee.dev/sitemap.xml'], + http_client=http_client, + # Exclude all URLs that do not contain 'blog' + exclude=[re.compile(r'^((?!blog).)*$')], + max_buffer_size=500, # Buffer up to 500 URLs in memory + ) + + while request := await sitemap_loader.fetch_next_request(): + # Do something with it... + + # And mark it as handled. 
+ await sitemap_loader.mark_request_as_handled(request) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/request_loaders.mdx b/docs/guides/request_loaders.mdx index 73fe374a62..d42305a623 100644 --- a/docs/guides/request_loaders.mdx +++ b/docs/guides/request_loaders.mdx @@ -10,6 +10,7 @@ import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import RlBasicExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example.py'; +import SitemapExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_example.py'; import TandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/tandem_example.py'; import ExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/tandem_example_explicit.py'; @@ -23,9 +24,10 @@ The [`request_loaders`](https://github.com/apify/crawlee-python/tree/master/src/ - `RequestManager`: Extends `RequestLoader` with write capabilities. - `RequestManagerTandem`: Combines a read-only `RequestLoader` with a writable `RequestManager`. -And one specific request loader: +And specific request loaders: - `RequestList`: A lightweight implementation of request loader for managing a static list of URLs. +- `SitemapRequestLoader`: A request loader that reads URLs from XML sitemaps with filtering capabilities. 
Below is a class diagram that illustrates the relationships between these components and the `RequestQueue`: @@ -83,6 +85,11 @@ class RequestList { _methods_() } +class SitemapRequestLoader { + _attributes_ + _methods_() +} + class RequestManagerTandem { _attributes_ _methods_() @@ -97,6 +104,7 @@ RequestManager <|-- RequestQueue RequestLoader <|-- RequestManager RequestLoader <|-- RequestList +RequestLoader <|-- SitemapRequestLoader RequestManager <|-- RequestManagerTandem ``` @@ -112,6 +120,14 @@ Here is a basic example of working with the `Req {RlBasicExample} +## Sitemap request loader + +The `SitemapRequestLoader` is a specialized request loader that reads URLs from XML sitemaps. It's particularly useful when you want to crawl a website systematically by following its sitemap structure. The loader supports filtering URLs using glob patterns and regular expressions, allowing you to include or exclude specific types of URLs. The `SitemapRequestLoader` provides streaming processing of sitemaps, which ensures efficient memory usage without loading the entire sitemap into memory. + + + {SitemapExample} + + ## Request manager The `RequestManager` extends `RequestLoader` with write capabilities. In addition to reading requests, a request manager can add or reclaim them. This is important for dynamic crawling projects, where new URLs may emerge during the crawl process. Or when certain requests may failed and need to be retried. For more details refer to the `RequestManager` API reference. @@ -139,4 +155,4 @@ This sections describes the combination of the ` ## Conclusion -This guide explained the `request_loaders` sub-package, which extends the functionality of the `RequestQueue` with additional tools for managing URLs. You learned about the `RequestLoader`, `RequestManager`, and `RequestManagerTandem` classes, as well as the `RequestList` class. You also saw examples of how to work with these classes in practice. 
If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! +This guide explained the `request_loaders` sub-package, which extends the functionality of the `RequestQueue` with additional tools for managing URLs. You learned about the `RequestLoader`, `RequestManager`, and `RequestManagerTandem` classes, as well as the `RequestList` and `SitemapRequestLoader` classes. You also saw examples of how to work with these classes in practice. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!