diff --git a/docs/guides/code_examples/request_loaders/sitemap_example.py b/docs/guides/code_examples/request_loaders/sitemap_example.py
new file mode 100644
index 0000000000..2ed2a62e96
--- /dev/null
+++ b/docs/guides/code_examples/request_loaders/sitemap_example.py
@@ -0,0 +1,28 @@
+import asyncio
+import re
+
+from crawlee.http_clients import HttpxHttpClient
+from crawlee.request_loaders import SitemapRequestLoader
+
+
+async def main() -> None:
+ # Create an HTTP client for fetching sitemaps
+ async with HttpxHttpClient() as http_client:
+ # Create a sitemap request loader with URL filtering
+ sitemap_loader = SitemapRequestLoader(
+ sitemap_urls=['https://crawlee.dev/sitemap.xml'],
+ http_client=http_client,
+ # Exclude all URLs that do not contain 'blog'
+ exclude=[re.compile(r'^((?!blog).)*$')],
+ max_buffer_size=500, # Buffer up to 500 URLs in memory
+ )
+
+ while request := await sitemap_loader.fetch_next_request():
+ # Do something with it...
+
+ # And mark it as handled.
+ await sitemap_loader.mark_request_as_handled(request)
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/docs/guides/request_loaders.mdx b/docs/guides/request_loaders.mdx
index 73fe374a62..d42305a623 100644
--- a/docs/guides/request_loaders.mdx
+++ b/docs/guides/request_loaders.mdx
@@ -10,6 +10,7 @@ import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
import RlBasicExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example.py';
+import SitemapExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_example.py';
import TandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/tandem_example.py';
import ExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/tandem_example_explicit.py';
@@ -23,9 +24,10 @@ The [`request_loaders`](https://github.com/apify/crawlee-python/tree/master/src/
- `RequestManager`: Extends `RequestLoader` with write capabilities.
- `RequestManagerTandem`: Combines a read-only `RequestLoader` with a writable `RequestManager`.
-And one specific request loader:
+And specific request loaders:
- `RequestList`: A lightweight implementation of request loader for managing a static list of URLs.
+- `SitemapRequestLoader`: A request loader that reads URLs from XML sitemaps with filtering capabilities.
Below is a class diagram that illustrates the relationships between these components and the `RequestQueue`:
@@ -83,6 +85,11 @@ class RequestList {
_methods_()
}
+class SitemapRequestLoader {
+ _attributes_
+ _methods_()
+}
+
class RequestManagerTandem {
_attributes_
_methods_()
@@ -97,6 +104,7 @@ RequestManager <|-- RequestQueue
RequestLoader <|-- RequestManager
RequestLoader <|-- RequestList
+RequestLoader <|-- SitemapRequestLoader
RequestManager <|-- RequestManagerTandem
```
@@ -112,6 +120,14 @@ Here is a basic example of working with the `Req
{RlBasicExample}
+## Sitemap request loader
+
+The `SitemapRequestLoader` is a specialized request loader that reads URLs from XML sitemaps. It's particularly useful when you want to crawl a website systematically by following its sitemap structure. The loader supports filtering URLs using glob patterns and regular expressions, allowing you to include or exclude specific types of URLs. The `SitemapRequestLoader` provides streaming processing of sitemaps, which ensures efficient memory usage without loading the entire sitemap into memory.
+
+
+ {SitemapExample}
+
+
## Request manager
The `RequestManager` extends `RequestLoader` with write capabilities. In addition to reading requests, a request manager can add or reclaim them. This is important for dynamic crawling projects, where new URLs may emerge during the crawl process. Or when certain requests may failed and need to be retried. For more details refer to the `RequestManager` API reference.
@@ -139,4 +155,4 @@ This sections describes the combination of the `
## Conclusion
-This guide explained the `request_loaders` sub-package, which extends the functionality of the `RequestQueue` with additional tools for managing URLs. You learned about the `RequestLoader`, `RequestManager`, and `RequestManagerTandem` classes, as well as the `RequestList` class. You also saw examples of how to work with these classes in practice. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
+This guide explained the `request_loaders` sub-package, which extends the functionality of the `RequestQueue` with additional tools for managing URLs. You learned about the `RequestLoader`, `RequestManager`, and `RequestManagerTandem` classes, as well as the `RequestList` and `SitemapRequestLoader` classes. You also saw examples of how to work with these classes in practice. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py
index 930ae09431..5cbe59b5ed 100644
--- a/src/crawlee/_utils/robots.py
+++ b/src/crawlee/_utils/robots.py
@@ -5,6 +5,7 @@
from protego import Protego
from yarl import URL
+from crawlee._utils.sitemap import Sitemap
from crawlee._utils.web import is_status_code_client_error
if TYPE_CHECKING:
@@ -15,9 +16,13 @@
class RobotsTxtFile:
- def __init__(self, url: str, robots: Protego) -> None:
+ def __init__(
+ self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
+ ) -> None:
self._robots = robots
self._original_url = URL(url).origin()
+ self._http_client = http_client
+ self._proxy_info = proxy_info
@classmethod
async def from_content(cls, url: str, content: str) -> Self:
@@ -56,7 +61,7 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N
robots = Protego.parse(body.decode('utf-8'))
- return cls(url, robots)
+ return cls(url, robots, http_client=http_client, proxy_info=proxy_info)
def is_allowed(self, url: str, user_agent: str = '*') -> bool:
"""Check if the given URL is allowed for the given user agent.
@@ -83,3 +88,16 @@ def get_crawl_delay(self, user_agent: str = '*') -> int | None:
"""
crawl_delay = self._robots.crawl_delay(user_agent)
return int(crawl_delay) if crawl_delay is not None else None
+
+ async def parse_sitemaps(self) -> Sitemap:
+ """Parse the sitemaps from the robots.txt file and return a `Sitemap` instance."""
+ sitemaps = self.get_sitemaps()
+ if not self._http_client:
+ raise ValueError('HTTP client is required to parse sitemaps.')
+
+ return await Sitemap.load(sitemaps, self._http_client, self._proxy_info)
+
+ async def parse_urls_from_sitemaps(self) -> list[str]:
+ """Parse the sitemaps in the robots.txt file and return a list URLs."""
+ sitemap = await self.parse_sitemaps()
+ return sitemap.urls
diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py
new file mode 100644
index 0000000000..6a5b85d710
--- /dev/null
+++ b/src/crawlee/_utils/sitemap.py
@@ -0,0 +1,477 @@
+from __future__ import annotations
+
+import asyncio
+import zlib
+from codecs import getincrementaldecoder
+from contextlib import suppress
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from hashlib import sha256
+from logging import getLogger
+from typing import TYPE_CHECKING, Literal, TypedDict
+from xml.sax.expatreader import ExpatParser
+from xml.sax.handler import ContentHandler
+
+from typing_extensions import NotRequired, override
+from yarl import URL
+
+if TYPE_CHECKING:
+ from collections.abc import AsyncGenerator
+ from xml.sax.xmlreader import AttributesImpl
+
+ from crawlee.http_clients import HttpClient
+ from crawlee.proxy_configuration import ProxyInfo
+
+logger = getLogger(__name__)
+
+VALID_CHANGE_FREQS = {'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'}
+SITEMAP_HEADERS = {'accept': 'text/plain, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8'}
+
+
+@dataclass()
+class SitemapUrl:
+ loc: str
+ lastmod: datetime | None = None
+ changefreq: str | None = None
+ priority: float | None = None
+ origin_sitemap_url: str | None = None
+
+
+@dataclass()
+class NestedSitemap:
+ loc: str
+ origin_sitemap_url: str | None = None
+
+
+class ParseSitemapOptions(TypedDict, total=False):
+ emit_nested_sitemaps: bool
+ max_depth: int
+ sitemap_retries: int
+ timeout: timedelta | None
+
+
+class SitemapSource(TypedDict):
+ type: Literal['url', 'raw']
+ url: NotRequired[str]
+ content: NotRequired[str]
+ depth: NotRequired[int]
+
+
+class _SitemapItem(TypedDict, total=False):
+ type: Literal['url', 'sitemap_url']
+ loc: str
+ url: str
+ lastmod: datetime | None
+ changefreq: str | None
+ priority: float | None
+
+
+class _XMLSaxSitemapHandler(ContentHandler):
+ def __init__(self) -> None:
+ super().__init__()
+ self._root_tag_name: str | None = None
+ self._current_tag: str | None = None
+ self._current_url: _SitemapItem = {}
+ self._buffer: str = ''
+ self._items: list[_SitemapItem] = []
+
+ @property
+ def items(self) -> list[_SitemapItem]:
+ return self._items
+
+ @override
+ def startElement(self, name: str, attrs: AttributesImpl) -> None:
+ if self._root_tag_name is None and name in ('urlset', 'sitemapindex'):
+ self._root_tag_name = name
+
+ if name in ('loc', 'lastmod', 'changefreq', 'priority'):
+ self._current_tag = name
+ self._buffer = ''
+
+ def characters(self, content: str) -> None:
+ if self._current_tag:
+ self._buffer += content
+
+ @override
+ def endElement(self, name: str) -> None:
+ if name == self._current_tag:
+ text = self._buffer.strip()
+
+ if name == 'loc':
+ if self._root_tag_name == 'sitemapindex':
+ self._items.append({'type': 'sitemap_url', 'url': text})
+ else:
+ self._current_url['loc'] = text
+
+ elif name == 'lastmod' and text:
+ with suppress(ValueError):
+ self._current_url['lastmod'] = datetime.fromisoformat(text.replace('Z', '+00:00'))
+
+ elif name == 'priority' and text:
+ with suppress(ValueError):
+ self._current_url['priority'] = float(text)
+
+ elif name == 'changefreq' and text in VALID_CHANGE_FREQS:
+ self._current_url['changefreq'] = text
+
+ self._current_tag = None
+
+ if name == 'url' and 'loc' in self._current_url:
+ self.items.append({'type': 'url', **self._current_url})
+ self._current_url = {}
+
+
+class _TxtSitemapParser:
+ """Parser for plaintext sitemaps that processes data as a stream."""
+
+ def __init__(self) -> None:
+ self._buffer = ''
+
+ async def process_chunk(self, chunk: str) -> AsyncGenerator[_SitemapItem, None]:
+ """Process a chunk of text data and yield items one by one."""
+ self._buffer += chunk
+
+ # Process complete lines
+ if '\n' in self._buffer:
+ lines = self._buffer.split('\n')
+ # Last element might be incomplete, save for next chunk
+ self._buffer = lines.pop()
+
+ for line in lines:
+ url = line.strip()
+ if url:
+ yield {'type': 'url', 'loc': url}
+
+ async def flush(self) -> AsyncGenerator[_SitemapItem, None]:
+ """Process any remaining data in the buffer, yielding items one by one."""
+ if self._buffer:
+ url = self._buffer.strip()
+ if url:
+ yield {'type': 'url', 'loc': url}
+ self._buffer = ''
+
+ def close(self) -> None:
+ """Clean up resources."""
+ self._buffer = ''
+
+
+class _XmlSitemapParser:
+ """Parser for XML sitemaps using SAX to process data as a stream."""
+
+ def __init__(self) -> None:
+ self._parser = ExpatParser()
+ self._handler = _XMLSaxSitemapHandler()
+ self._parser.setContentHandler(self._handler)
+
+ async def process_chunk(self, chunk: str) -> AsyncGenerator[_SitemapItem, None]:
+ """Process a chunk of XML data and yield items one by one."""
+ try:
+ self._parser.feed(chunk)
+
+ # If we get here, the XML was valid and complete
+ for item in self._handler.items:
+ yield item
+
+ self._handler.items.clear()
+
+ except Exception as e:
+ logger.warning(f'Failed to parse XML data chunk: {e}', exc_info=True)
+
+ async def flush(self) -> AsyncGenerator[_SitemapItem, None]:
+ """Process any remaining data in the buffer, yielding items one by one."""
+ try:
+ self._parser.flush()
+
+ for item in self._handler.items:
+ yield item
+
+ self._handler.items.clear()
+
+ except Exception as e:
+ logger.warning(f'Failed to parse remaining XML data: {e}')
+
+ def close(self) -> None:
+ """Clean up resources."""
+ self._parser.close()
+
+
+def _get_parser(content_type: str = '', url: str | None = None) -> _XmlSitemapParser | _TxtSitemapParser:
+ """Create appropriate parser based on content type and URL."""
+ if 'text/plain' in content_type.lower() or (url and URL(url).path.endswith('.txt')):
+ return _TxtSitemapParser()
+ # Default to XML parser for most cases
+ return _XmlSitemapParser()
+
+
+def _get_origin_url(source: SitemapSource) -> str:
+ """Determine the origin URL for a sitemap source."""
+ if source['type'] == 'url' and 'url' in source:
+ return source['url']
+ if source['type'] == 'raw' and 'content' in source:
+ # For raw content sources, create a consistent identifier
+ return f'raw://{sha256(source["content"].encode()).hexdigest()}'
+ return ''
+
+
+async def _process_sitemap_item(
+ item: _SitemapItem,
+ source: SitemapSource,
+ depth: int,
+ visited_sitemap_urls: set[str],
+ sources: list[SitemapSource],
+ *,
+ emit_nested_sitemaps: bool,
+) -> AsyncGenerator[SitemapUrl | NestedSitemap | None, None]:
+ """Process a sitemap item and yield appropriate results."""
+ item_copy = item.copy() # Work with a copy to avoid modifying the original
+
+ if 'type' not in item_copy:
+ return
+
+ item_type = item_copy.pop('type')
+
+ # Handle sitemap URL references (nested sitemaps)
+ if item_type == 'sitemap_url' and 'url' in item_copy:
+ sitemap_url = item_copy['url']
+ if sitemap_url and sitemap_url not in visited_sitemap_urls:
+ # Add to processing queue
+ sources.append(SitemapSource(type='url', url=sitemap_url, depth=depth + 1))
+
+ # Output the nested sitemap reference if requested
+ if emit_nested_sitemaps:
+ yield NestedSitemap(loc=sitemap_url, origin_sitemap_url=None)
+
+ # Handle individual URL entries
+ elif item_type == 'url' and 'loc' in item_copy:
+ # Determine the origin sitemap URL for tracking purposes
+ origin_url = _get_origin_url(source)
+
+ # Create and yield the sitemap URL object
+ yield SitemapUrl(
+ loc=item_copy['loc'],
+ lastmod=item_copy.get('lastmod'),
+ changefreq=item_copy.get('changefreq'),
+ priority=item_copy.get('priority'),
+ origin_sitemap_url=origin_url,
+ )
+
+
+async def _process_raw_source(
+ source: SitemapSource,
+ depth: int,
+ visited_sitemap_urls: set[str],
+ sources: list[SitemapSource],
+ *,
+ emit_nested_sitemaps: bool,
+) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]:
+ """Process a raw content sitemap source."""
+ if 'content' not in source:
+ logger.warning(f'Raw source missing content: {source}')
+ return
+
+ content = source['content']
+ parser = _get_parser('text/xml')
+
+ try:
+ # Process the content
+ async for item in parser.process_chunk(content):
+ async for result in _process_sitemap_item(
+ item, source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps
+ ):
+ if result:
+ yield result
+
+ # Process any remaining content
+ async for item in parser.flush():
+ async for result in _process_sitemap_item(
+ item, source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps
+ ):
+ if result:
+ yield result
+ except Exception as e:
+ logger.warning(f'Failed to parse raw sitemap content: {e}')
+ finally:
+ parser.close()
+
+
+async def _fetch_and_process_sitemap(
+ http_client: HttpClient,
+ source: SitemapSource,
+ depth: int,
+ visited_sitemap_urls: set[str],
+ sources: list[SitemapSource],
+ retries_left: int,
+ *,
+ proxy_info: ProxyInfo | None = None,
+ timeout: timedelta | None = None,
+ emit_nested_sitemaps: bool,
+) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]:
+ """Fetch a sitemap from a URL and process its content."""
+ if 'url' not in source:
+ return
+
+ sitemap_url = source['url']
+
+ try:
+ while retries_left > 0:
+ retries_left -= 1
+ async with http_client.stream(
+ sitemap_url, method='GET', headers=SITEMAP_HEADERS, proxy_info=proxy_info, timeout=timeout
+ ) as response:
+ # Determine content type and compression
+ content_type = response.headers.get('content-type', '')
+
+ decoder = getincrementaldecoder('utf-8')(errors='replace')
+
+ # Create appropriate parser
+ parser = _get_parser(content_type, sitemap_url)
+ decompressor = None
+ try:
+ # Process chunks as they arrive
+ first_chunk = True
+ async for raw_chunk in response.read_stream():
+ # Check if the first chunk is a valid gzip header
+ if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
+ decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
+ first_chunk = False
+
+ chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk
+ text_chunk = decoder.decode(chunk)
+ async for item in parser.process_chunk(text_chunk):
+ async for result in _process_sitemap_item(
+ item,
+ source,
+ depth,
+ visited_sitemap_urls,
+ sources,
+ emit_nested_sitemaps=emit_nested_sitemaps,
+ ):
+ if result:
+ yield result
+
+ # Process any remaining content
+ async for item in parser.flush():
+ async for result in _process_sitemap_item(
+ item,
+ source,
+ depth,
+ visited_sitemap_urls,
+ sources,
+ emit_nested_sitemaps=emit_nested_sitemaps,
+ ):
+ if result:
+ yield result
+ finally:
+ parser.close()
+ break
+
+ except Exception as e:
+ if retries_left > 0:
+ logger.warning(f'Error fetching sitemap {sitemap_url}: {e}. Retries left: {retries_left}')
+ await asyncio.sleep(1) # Brief pause before retry
+
+
+class Sitemap:
+ def __init__(self, urls: list[str]) -> None:
+ self._urls = urls
+
+ @property
+ def urls(self) -> list[str]:
+ return self._urls
+
+ @classmethod
+ async def try_common_names(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Sitemap:
+ base_url = URL(url)
+ sitemap_urls = [str(base_url.with_path('/sitemap.xml')), str(base_url.with_path('/sitemap.txt'))]
+ return await cls.load(sitemap_urls, http_client, proxy_info)
+
+ @classmethod
+ async def load(
+ cls,
+ urls: str | list[str],
+ http_client: HttpClient,
+ proxy_info: ProxyInfo | None = None,
+ parse_sitemap_options: ParseSitemapOptions | None = None,
+ ) -> Sitemap:
+ if isinstance(urls, str):
+ urls = [urls]
+ return await cls.parse(
+ [SitemapSource(type='url', url=url) for url in urls], http_client, proxy_info, parse_sitemap_options
+ )
+
+ @classmethod
+ async def from_xml_string(cls, content: str) -> Sitemap:
+ return await cls.parse([SitemapSource(type='raw', content=content)])
+
+ @classmethod
+ async def parse(
+ cls,
+ sources: list[SitemapSource],
+ http_client: HttpClient | None = None,
+ proxy_info: ProxyInfo | None = None,
+ parse_sitemap_options: ParseSitemapOptions | None = None,
+ ) -> Sitemap:
+ urls = [item.loc async for item in parse_sitemap(sources, http_client, proxy_info, parse_sitemap_options)]
+ return cls(urls)
+
+
+async def parse_sitemap(
+ initial_sources: list[SitemapSource],
+ http_client: HttpClient | None = None,
+ proxy_info: ProxyInfo | None = None,
+ options: ParseSitemapOptions | None = None,
+) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]:
+ """Parse sitemap(s) and yield URLs found in them.
+
+ This function coordinates the process of fetching and parsing sitemaps,
+ handling both URL-based and raw content sources. It follows nested sitemaps
+ up to the specified maximum depth.
+ """
+ # Set default options
+ options = options or {}
+ emit_nested_sitemaps = options.get('emit_nested_sitemaps', False)
+ max_depth = options.get('max_depth', float('inf'))
+ sitemap_retries = options.get('sitemap_retries', 3)
+
+ # Setup working state
+ sources = list(initial_sources)
+ visited_sitemap_urls: set[str] = set()
+
+ # Process sources until the queue is empty
+ while sources:
+ source = sources.pop(0)
+ depth = source.get('depth', 0)
+
+ # Skip if we've reached max depth
+ if depth > max_depth:
+ logger.debug(f'Skipping sitemap {source.get("url", "")} - exceeded max depth {max_depth}')
+ continue
+
+ # Process based on source type
+ if source['type'] == 'raw':
+ async for result in _process_raw_source(
+ source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps
+ ):
+ yield result
+
+ elif source['type'] == 'url' and 'url' in source:
+ # Add to visited set before processing to avoid duplicates
+ if http_client is None:
+ raise RuntimeError('HttpClient must be provided for URL-based sitemap sources.')
+
+ visited_sitemap_urls.add(source['url'])
+
+ async for result in _fetch_and_process_sitemap(
+ http_client,
+ source,
+ depth,
+ visited_sitemap_urls,
+ sources,
+ sitemap_retries,
+ emit_nested_sitemaps=emit_nested_sitemaps,
+ proxy_info=proxy_info,
+ timeout=options.get('timeout', timedelta(seconds=30)),
+ ):
+ yield result
+ else:
+ logger.warning(f'Invalid source configuration: {source}')
diff --git a/src/crawlee/request_loaders/__init__.py b/src/crawlee/request_loaders/__init__.py
index 57829ec5ce..c04d9aa810 100644
--- a/src/crawlee/request_loaders/__init__.py
+++ b/src/crawlee/request_loaders/__init__.py
@@ -2,10 +2,6 @@
from ._request_loader import RequestLoader
from ._request_manager import RequestManager
from ._request_manager_tandem import RequestManagerTandem
+from ._sitemap_request_loader import SitemapRequestLoader
-__all__ = [
- 'RequestList',
- 'RequestLoader',
- 'RequestManager',
- 'RequestManagerTandem',
-]
+__all__ = ['RequestList', 'RequestLoader', 'RequestManager', 'RequestManagerTandem', 'SitemapRequestLoader']
diff --git a/src/crawlee/request_loaders/_sitemap_request_loader.py b/src/crawlee/request_loaders/_sitemap_request_loader.py
new file mode 100644
index 0000000000..c3dda01938
--- /dev/null
+++ b/src/crawlee/request_loaders/_sitemap_request_loader.py
@@ -0,0 +1,177 @@
+from __future__ import annotations
+
+import asyncio
+from contextlib import suppress
+from logging import getLogger
+from typing import TYPE_CHECKING, Any
+
+from crawlee import Request
+from crawlee._utils.docs import docs_group
+from crawlee._utils.globs import Glob
+from crawlee._utils.sitemap import ParseSitemapOptions, SitemapSource, SitemapUrl, parse_sitemap
+from crawlee.request_loaders._request_loader import RequestLoader
+
+if TYPE_CHECKING:
+ import re
+ from collections.abc import Sequence
+
+ from crawlee.http_clients import HttpClient
+ from crawlee.proxy_configuration import ProxyInfo
+ from crawlee.storage_clients.models import ProcessedRequest
+
+
+logger = getLogger(__name__)
+
+
+@docs_group('Classes')
+class SitemapRequestLoader(RequestLoader):
+ """A request loader that reads URLs from sitemap(s).
+
+ The loader fetches and parses sitemaps in the background, allowing crawling to start
+ before all URLs are loaded. It supports filtering URLs using glob and regex patterns.
+ """
+
+ def __init__(
+ self,
+ sitemap_urls: list[str],
+ http_client: HttpClient,
+ *,
+ proxy_info: ProxyInfo | None = None,
+ include: list[re.Pattern[Any] | Glob] | None = None,
+ exclude: list[re.Pattern[Any] | Glob] | None = None,
+ max_buffer_size: int = 200,
+ parse_sitemap_options: ParseSitemapOptions | None = None,
+ ) -> None:
+ """Initialize the sitemap request loader.
+
+ Args:
+ sitemap_urls: The list of sitemap URLs to load and parse.
+ proxy_info: Optional proxy to use for fetching sitemaps.
+ include: List of glob or regex patterns to include URLs.
+ exclude: List of glob or regex patterns to exclude URLs.
+ max_buffer_size: Maximum number of URLs to buffer in memory.
+ parse_sitemap_options: Options for parsing sitemaps, such as `SitemapSource` and `max_urls`.
+ http_client: The instance of `HttpClient` to use for fetching sitemaps.
+ """
+ self._http_client = http_client
+
+ self._sitemap_urls = sitemap_urls
+ self._include = include
+ self._exclude = exclude
+ self._proxy_info = proxy_info
+ self._parse_sitemap_options = parse_sitemap_options or ParseSitemapOptions()
+
+ self._handled_count = 0
+ self._total_count = 0
+
+ # URL queue and tracking
+ self._url_queue: asyncio.Queue[str] = asyncio.Queue(maxsize=max_buffer_size)
+ self._in_progress: set[str] = set()
+ self._processed_urls: set[str] = set()
+
+ # Loading state
+ self._loading_task = asyncio.create_task(self._load_sitemaps())
+
+ def _check_url_patterns(
+ self,
+ target_url: str,
+ include: Sequence[re.Pattern[Any] | Glob] | None,
+ exclude: Sequence[re.Pattern[Any] | Glob] | None,
+ ) -> bool:
+ """Check if a URL matches configured include/exclude patterns."""
+ # If the URL matches any `exclude` pattern, reject it
+ for pattern in exclude or ():
+ if isinstance(pattern, Glob):
+ pattern = pattern.regexp # noqa: PLW2901
+
+ if pattern.match(target_url) is not None:
+ return False
+
+ # If there are no `include` patterns and the URL passed all `exclude` patterns, accept the URL
+ if include is None:
+ return True
+
+ # If the URL matches any `include` pattern, accept it
+ for pattern in include:
+ if isinstance(pattern, Glob):
+ pattern = pattern.regexp # noqa: PLW2901
+
+ if pattern.match(target_url) is not None:
+ return True
+
+ # The URL does not match any `include` pattern - reject it
+ return False
+
+ async def _load_sitemaps(self) -> None:
+ """Load URLs from sitemaps in the background."""
+ try:
+ async for item in parse_sitemap(
+ [SitemapSource(type='url', url=url) for url in self._sitemap_urls],
+ self._http_client,
+ proxy_info=self._proxy_info,
+ options=self._parse_sitemap_options,
+ ):
+ # Only process URL items (not nested sitemaps)
+ if isinstance(item, SitemapUrl):
+ url = item.loc
+
+ # Skip if already processed
+ if url in self._processed_urls:
+ continue
+
+ # Check if URL should be included
+ if not self._check_url_patterns(url, self._include, self._exclude):
+ continue
+
+ await self._url_queue.put(url)
+ self._processed_urls.add(url)
+ self._total_count += 1
+
+ except Exception:
+ logger.exception('Error loading sitemaps')
+ raise
+
+ async def get_total_count(self) -> int:
+ """Return the total number of URLs found so far."""
+ return self._total_count
+
+ async def is_empty(self) -> bool:
+ """Check if there are no more URLs to process."""
+ return self._url_queue.empty() and self._loading_task.done()
+
+ async def is_finished(self) -> bool:
+ """Check if all URLs have been processed."""
+ return self._url_queue.empty() and len(self._in_progress) == 0 and self._loading_task.done()
+
+ async def fetch_next_request(self) -> Request | None:
+ """Fetch the next request to process."""
+ while not (self._loading_task.done() and self._url_queue.empty()):
+ if self._url_queue.empty():
+ await asyncio.sleep(0.5)
+ continue
+
+ url = await self._url_queue.get()
+
+ request = Request.from_url(url)
+ self._in_progress.add(request.id)
+ return request
+
+ return None
+
+ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
+ """Mark a request as successfully handled."""
+ if request.id in self._in_progress:
+ self._in_progress.remove(request.id)
+ self._handled_count += 1
+ return None
+
+ async def get_handled_count(self) -> int:
+ """Return the number of handled requests."""
+ return self._handled_count
+
+ async def abort_loading(self) -> None:
+ """Abort the sitemap loading process."""
+ if self._loading_task and not self._loading_task.done():
+ self._loading_task.cancel()
+ with suppress(asyncio.CancelledError):
+ await self._loading_task
diff --git a/tests/unit/_utils/test_sitemap.py b/tests/unit/_utils/test_sitemap.py
new file mode 100644
index 0000000000..807090eaa4
--- /dev/null
+++ b/tests/unit/_utils/test_sitemap.py
@@ -0,0 +1,248 @@
+import base64
+import gzip
+from datetime import datetime
+
+from yarl import URL
+
+from crawlee._utils.sitemap import Sitemap, SitemapUrl, parse_sitemap
+from crawlee.http_clients._base import HttpClient
+
+BASIC_SITEMAP = """
+
+
+
+http://not-exists.com/
+2005-02-03
+monthly
+0.8
+
+
+http://not-exists.com/catalog?item=12&desc=vacation_hawaii
+weekly
+
+
+http://not-exists.com/catalog?item=73&desc=vacation_new_zealand
+2004-12-23
+weekly
+
+
+http://not-exists.com/catalog?item=74&desc=vacation_newfoundland
+2004-12-23T18:00:15+00:00
+0.3
+
+
+http://not-exists.com/catalog?item=83&desc=vacation_usa
+2004-11-23
+
+
+""".strip()
+
+BASIC_RESULTS = {
+ 'http://not-exists.com/',
+ 'http://not-exists.com/catalog?item=12&desc=vacation_hawaii',
+ 'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand',
+ 'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland',
+ 'http://not-exists.com/catalog?item=83&desc=vacation_usa',
+}
+
+
+def compress_gzip(data: str) -> bytes:
+ """Compress a string using gzip."""
+ return gzip.compress(data.encode())
+
+
+def encode_base64(data: bytes) -> str:
+ """Encode bytes to a base64 string."""
+ return base64.b64encode(data).decode('utf-8')
+
+
+async def test_sitemap(server_url: URL, http_client: HttpClient) -> None:
+ """Test loading a basic sitemap."""
+ sitemap_url = (server_url / 'sitemap.xml').with_query(
+ base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/xml; charset=utf-8'
+ )
+ sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)
+
+ assert len(sitemap.urls) == 5
+ assert set(sitemap.urls) == BASIC_RESULTS
+
+
+async def test_extract_metadata_sitemap(server_url: URL, http_client: HttpClient) -> None:
+ """Test extracting item metadata from a sitemap."""
+ sitemap_url = (server_url / 'sitemap.xml').with_query(
+ base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/xml; charset=utf-8'
+ )
+
+ items = [item async for item in parse_sitemap([{'type': 'url', 'url': str(sitemap_url)}], http_client=http_client)]
+ assert len(items) == 5
+ assert items[0] == SitemapUrl(
+ loc='http://not-exists.com/',
+ priority=0.8,
+ changefreq='monthly',
+ lastmod=datetime.fromisoformat('2005-02-03'),
+ origin_sitemap_url=str(sitemap_url),
+ )
+
+
+async def test_gzipped_sitemap(server_url: URL, http_client: HttpClient) -> None:
+ """Test loading a gzipped sitemap with correct type and .xml.gz url."""
+ gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP))
+ sitemap_url = (server_url / 'sitemap.xml.gz').with_query(base64=gzipped_data, c_type='application/gzip')
+ sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)
+ assert len(sitemap.urls) == 5
+ assert set(sitemap.urls) == BASIC_RESULTS
+
+
+async def test_gzipped_sitemap_with_invalid_data(server_url: URL, http_client: HttpClient) -> None:
+ """Test loading a invalid gzipped sitemap with correct type and .xml.gz url."""
+ compress_data = compress_gzip(BASIC_SITEMAP)
+ invalid_gzipped_data = encode_base64(compress_data[:30])
+ sitemap_url = (server_url / 'sitemap.xml.gz').with_query(base64=invalid_gzipped_data, c_type='application/gzip')
+ sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)
+
+ assert len(sitemap.urls) == 0
+ assert sitemap.urls == []
+
+
+async def test_gz_sitemap_with_non_gzipped(server_url: URL, http_client: HttpClient) -> None:
+ """Test loading a sitemap with gzip type and .xml.gz url, but without gzipped data."""
+ sitemap_url = (server_url / 'sitemap.xml.gz').with_query(
+ base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/gzip'
+ )
+ sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)
+
+ assert len(sitemap.urls) == 5
+ assert set(sitemap.urls) == BASIC_RESULTS
+
+
+async def test_gzipped_sitemap_with_bad_type(server_url: URL, http_client: HttpClient) -> None:
+ """Test loading a gzipped sitemap with bad type and .xml.gz url."""
+ gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP))
+ sitemap_url = (server_url / 'sitemap.xml.gz').with_query(
+ base64=gzipped_data, c_type='application/xml; charset=utf-8'
+ )
+ sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)
+
+ assert len(sitemap.urls) == 5
+ assert set(sitemap.urls) == BASIC_RESULTS
+
+
+async def test_xml_sitemap_with_gzipped_data(server_url: URL, http_client: HttpClient) -> None:
+ """Test loading a gzipped sitemap with correct type and .xml url."""
+ gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP))
+ sitemap_url = (server_url / 'sitemap.xml').with_query(base64=gzipped_data, c_type='application/gzip')
+ sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)
+
+ assert len(sitemap.urls) == 5
+ assert set(sitemap.urls) == BASIC_RESULTS
+
+
+async def test_parent_sitemap(server_url: URL, http_client: HttpClient) -> None:
+ """Test loading a parent sitemap that references child sitemaps."""
+ parent_sitemap = """
+<?xml version="1.0" encoding="UTF-8"?>
+<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+<sitemap>
+<loc>{child_sitemap}</loc>
+<lastmod>2004-12-23</lastmod>
+</sitemap>
+<sitemap>
+<loc>{child_sitemap_2}</loc>
+<lastmod>2004-12-23</lastmod>
+</sitemap>
+</sitemapindex>
+""".strip()
+ child_sitemap = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
+ child_sitemap_2 = (server_url / 'sitemap.xml.gz').with_query(base64=encode_base64(compress_gzip(BASIC_SITEMAP)))
+ parent_sitemap_content = parent_sitemap.format(child_sitemap=child_sitemap, child_sitemap_2=child_sitemap_2)
+ encoded_parent_sitemap_content = encode_base64(parent_sitemap_content.encode())
+ parent_sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encoded_parent_sitemap_content)
+
+ sitemap = await Sitemap.load(str(parent_sitemap_url), http_client=http_client)
+
+ assert len(sitemap.urls) == 10
+ assert set(sitemap.urls) == BASIC_RESULTS
+
+
+async def test_non_sitemap_url(server_url: URL, http_client: HttpClient) -> None:
+ """Test loading a URL that does not point to a sitemap."""
+ sitemap = await Sitemap.load(str(server_url), http_client=http_client)
+
+ assert len(sitemap.urls) == 0
+ assert sitemap.urls == []
+
+
+async def test_cdata_sitemap(server_url: URL, http_client: HttpClient) -> None:
+ """Test loading a sitemap with CDATA sections."""
+ cdata_sitemap = """
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+<url>
+<loc><![CDATA[http://not-exists.com/catalog]]></loc>
+</url>
+</urlset>
+ """.strip()
+ sitemap_url = (server_url / 'sitemap.xml').with_query(
+ base64=encode_base64(cdata_sitemap.encode()), c_type='application/xml; charset=utf-8'
+ )
+ sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)
+
+ assert len(sitemap.urls) == 1
+ assert sitemap.urls == ['http://not-exists.com/catalog']
+
+
+async def test_txt_sitemap(server_url: URL, http_client: HttpClient) -> None:
+ """Test loading a plain text sitemap."""
+ urls = [
+ 'http://not-exists.com/catalog?item=78&desc=vacation_crete',
+ 'http://not-exists.com/catalog?item=79&desc=vacation_somalia',
+ ]
+ txt_sitemap_content = '\n'.join(urls)
+
+ sitemap_url = (server_url / 'sitemap.txt').with_query(base64=encode_base64(txt_sitemap_content.encode()))
+ sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)
+
+ assert len(sitemap.urls) == 2
+ assert set(sitemap.urls) == {
+ 'http://not-exists.com/catalog?item=78&desc=vacation_crete',
+ 'http://not-exists.com/catalog?item=79&desc=vacation_somalia',
+ }
+
+
+async def test_sitemap_pretty(server_url: URL, http_client: HttpClient) -> None:
+ """Test loading a pretty-printed sitemap."""
+ pretty_sitemap = """
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+<url>
+    <loc>
+        http://not-exists.com/catalog?item=80&amp;desc=vacation_turkey
+    </loc>
+    <lastmod>
+        2005-02-03
+    </lastmod>
+
+    <changefreq>
+        monthly
+    </changefreq>
+    <priority>
+        0.8
+    </priority>
+</url>
+</urlset>
+""".strip()
+ sitemap_url = (server_url / 'sitemap.xml').with_query(
+ base64=encode_base64(pretty_sitemap.encode()), c_type='application/xml; charset=utf-8'
+ )
+ sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)
+
+ assert len(sitemap.urls) == 1
+ assert sitemap.urls == ['http://not-exists.com/catalog?item=80&desc=vacation_turkey']
+
+
+async def test_sitemap_from_string() -> None:
+ """Test creating a Sitemap instance from an XML string."""
+ sitemap = await Sitemap.from_xml_string(BASIC_SITEMAP)
+
+ assert len(sitemap.urls) == 5
+ assert set(sitemap.urls) == BASIC_RESULTS
diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py
index 3ee386324a..873672790a 100644
--- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py
+++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py
@@ -61,8 +61,8 @@
def test_urls(server_url: URL) -> list[str]:
"""Example pages used in the test are mocked for static requests."""
return [
- str(server_url.with_path('dynamic_content').with_query(content=_PAGE_CONTENT_STATIC)),
- str(server_url.with_path('dynamic_content').with_query(id='test2', content=_PAGE_CONTENT_STATIC)),
+ str(server_url.with_path('echo_content').with_query(content=_PAGE_CONTENT_STATIC)),
+ str(server_url.with_path('echo_content').with_query(id='test2', content=_PAGE_CONTENT_STATIC)),
]
diff --git a/tests/unit/request_loaders/test_sitemap_request_loader.py b/tests/unit/request_loaders/test_sitemap_request_loader.py
new file mode 100644
index 0000000000..6e73708cb2
--- /dev/null
+++ b/tests/unit/request_loaders/test_sitemap_request_loader.py
@@ -0,0 +1,105 @@
+import base64
+import gzip
+
+from yarl import URL
+
+from crawlee.http_clients._base import HttpClient
+from crawlee.request_loaders._sitemap_request_loader import SitemapRequestLoader
+
+BASIC_SITEMAP = """
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+<url>
+<loc>http://not-exists.com/</loc>
+<lastmod>2005-02-03</lastmod>
+<changefreq>monthly</changefreq>
+<priority>0.8</priority>
+</url>
+<url>
+<loc>http://not-exists.com/catalog?item=12&amp;desc=vacation_hawaii</loc>
+<changefreq>weekly</changefreq>
+</url>
+<url>
+<loc>http://not-exists.com/catalog?item=73&amp;desc=vacation_new_zealand</loc>
+<lastmod>2004-12-23</lastmod>
+<changefreq>weekly</changefreq>
+</url>
+<url>
+<loc>http://not-exists.com/catalog?item=74&amp;desc=vacation_newfoundland</loc>
+<lastmod>2004-12-23T18:00:15+00:00</lastmod>
+<priority>0.3</priority>
+</url>
+<url>
+<loc>http://not-exists.com/catalog?item=83&amp;desc=vacation_usa</loc>
+<lastmod>2004-11-23</lastmod>
+</url>
+</urlset>
+""".strip()
+
+
+def compress_gzip(data: str) -> bytes:
+ """Compress a string using gzip."""
+ return gzip.compress(data.encode())
+
+
+def encode_base64(data: bytes) -> str:
+ """Encode bytes to a base64 string."""
+ return base64.b64encode(data).decode('utf-8')
+
+
+async def test_sitemap_traversal(server_url: URL, http_client: HttpClient) -> None:
+ sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
+ sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client)
+
+ while not await sitemap_loader.is_finished():
+ item = await sitemap_loader.fetch_next_request()
+ assert item is not None
+
+ await sitemap_loader.mark_request_as_handled(item)
+
+ assert await sitemap_loader.is_empty()
+ assert await sitemap_loader.is_finished()
+ assert await sitemap_loader.get_total_count() == 5
+ assert await sitemap_loader.get_handled_count() == 5
+
+
+async def test_is_empty_does_not_depend_on_fetch_next_request(server_url: URL, http_client: HttpClient) -> None:
+ sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
+ sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client)
+
+ items = []
+
+ for _ in range(5):
+ item = await sitemap_loader.fetch_next_request()
+ assert item is not None
+ assert not await sitemap_loader.is_finished()
+ items.append(item)
+
+ assert await sitemap_loader.is_empty()
+ assert not await sitemap_loader.is_finished()
+
+ for item in items:
+ await sitemap_loader.mark_request_as_handled(item)
+
+ assert await sitemap_loader.is_empty()
+ assert await sitemap_loader.is_finished()
+
+
+async def test_abort_sitemap_loading(server_url: URL, http_client: HttpClient) -> None:
+ sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
+ sitemap_loader = SitemapRequestLoader([str(sitemap_url)], max_buffer_size=2, http_client=http_client)
+
+ item = await sitemap_loader.fetch_next_request()
+ assert item is not None
+ await sitemap_loader.mark_request_as_handled(item)
+
+ assert not await sitemap_loader.is_empty()
+ assert not await sitemap_loader.is_finished()
+
+ await sitemap_loader.abort_loading()
+
+ item = await sitemap_loader.fetch_next_request()
+ assert item is not None
+ await sitemap_loader.mark_request_as_handled(item)
+
+ assert await sitemap_loader.is_finished()
diff --git a/tests/unit/server.py b/tests/unit/server.py
index 21ba01cec8..363bc61186 100644
--- a/tests/unit/server.py
+++ b/tests/unit/server.py
@@ -1,6 +1,7 @@
from __future__ import annotations
import asyncio
+import base64
import json
import threading
import time
@@ -23,9 +24,9 @@
if TYPE_CHECKING:
from socket import socket
-
Receive = Callable[[], Awaitable[dict[str, Any]]]
Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]
+PathHandler = Callable[[dict[str, Any], Receive, Send], Coroutine[None, None, None]]
def get_headers_dict(scope: dict[str, Any]) -> dict[str, str]:
@@ -92,55 +93,47 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None:
send: The ASGI send function.
"""
assert scope['type'] == 'http'
- path = scope['path']
-
+ paths: dict[str, PathHandler] = {
+ 'start_enqueue': start_enqueue_endpoint,
+ 'sub_index': secondary_index_endpoint,
+ 'incapsula': incapsula_endpoint,
+ 'page_1': generic_response_endpoint,
+ 'page_2': generic_response_endpoint,
+ 'page_3': generic_response_endpoint,
+ 'set_cookies': set_cookies,
+ 'set_complex_cookies': set_complex_cookies,
+ 'cookies': get_cookies,
+ 'status': echo_status,
+ 'headers': echo_headers,
+ 'user-agent': echo_user_agent,
+ 'echo_content': echo_content,
+ 'sitemap.txt': echo_content,
+ 'sitemap.xml': echo_content,
+ 'sitemap.xml.gz': echo_content,
+ 'get': get_echo,
+ 'post': post_echo,
+ 'redirect': redirect_to_url,
+ 'json': hello_world_json,
+ 'xml': hello_world_xml,
+ 'robots.txt': robots_txt,
+ }
+ path = URL(scope['path']).parts[1]
# Route requests to appropriate handlers
- if path.startswith('/start_enqueue'):
- await start_enqueue_endpoint(send)
- elif path.startswith('/sub_index'):
- await secondary_index_endpoint(send)
- elif path.startswith('/incapsula'):
- await incapsula_endpoint(send)
- elif path.startswith(('/page_1', '/page_2', '/page_3')):
- await generic_response_endpoint(send)
- elif path.startswith('/set_cookies'):
- await set_cookies(scope, send)
- elif path.startswith('/set_complex_cookies'):
- await set_complex_cookies(send)
- elif path.startswith('/cookies'):
- await get_cookies(scope, send)
- elif path.startswith('/status/'):
- await echo_status(scope, send)
- elif path.startswith('/headers'):
- await echo_headers(scope, send)
- elif path.startswith('/user-agent'):
- await echo_user_agent(scope, send)
- elif path.startswith('/get'):
- await get_echo(scope, send)
- elif path.startswith('/post'):
- await post_echo(scope, receive, send)
- elif path.startswith('/dynamic_content'):
- await dynamic_content(scope, send)
- elif path.startswith('/redirect'):
- await redirect_to_url(scope, send)
- elif path.startswith('/json'):
- await hello_world_json(send)
- elif path.startswith('/xml'):
- await hello_world_xml(send)
- elif path.startswith('/robots.txt'):
- await robots_txt(send)
+ if path in paths:
+ path_func = paths[path]
+ await path_func(scope, receive, send)
else:
- await hello_world(send)
+ await hello_world(scope, receive, send)
-async def get_cookies(scope: dict[str, Any], send: Send) -> None:
+async def get_cookies(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle requests to retrieve cookies sent in the request."""
headers = get_headers_dict(scope)
cookies = get_cookies_from_headers(headers)
await send_json_response(send, {'cookies': cookies})
-async def set_cookies(scope: dict[str, Any], send: Send) -> None:
+async def set_cookies(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle requests to set cookies from query parameters and redirect."""
query_params = get_query_params(scope.get('query_string', b''))
@@ -165,7 +158,7 @@ async def set_cookies(scope: dict[str, Any], send: Send) -> None:
await send({'type': 'http.response.body', 'body': b'Redirecting to get_cookies...'})
-async def hello_world(send: Send) -> None:
+async def hello_world(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle basic requests with a simple HTML response."""
await send_html_response(
send,
@@ -173,7 +166,7 @@ async def hello_world(send: Send) -> None:
)
-async def hello_world_json(send: Send) -> None:
+async def hello_world_json(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle basic requests with a simple JSON response."""
await send_json_response(
send,
@@ -181,7 +174,7 @@ async def hello_world_json(send: Send) -> None:
)
-async def hello_world_xml(send: Send) -> None:
+async def hello_world_xml(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle basic requests with a simple XML response."""
await send_html_response(
send,
@@ -240,7 +233,7 @@ async def post_echo(scope: dict[str, Any], receive: Receive, send: Send) -> None
await send_json_response(send, response)
-async def echo_status(scope: dict[str, Any], send: Send) -> None:
+async def echo_status(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Echo the status code from the URL path."""
status_code = int(scope['path'].replace('/status/', ''))
await send(
@@ -253,13 +246,13 @@ async def echo_status(scope: dict[str, Any], send: Send) -> None:
await send({'type': 'http.response.body', 'body': b''})
-async def echo_headers(scope: dict[str, Any], send: Send) -> None:
+async def echo_headers(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Echo back the request headers as JSON."""
headers = get_headers_dict(scope)
await send_json_response(send, headers)
-async def start_enqueue_endpoint(send: Send) -> None:
+async def start_enqueue_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle requests for the main page with links."""
await send_html_response(
send,
@@ -267,7 +260,7 @@ async def start_enqueue_endpoint(send: Send) -> None:
)
-async def secondary_index_endpoint(send: Send) -> None:
+async def secondary_index_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle requests for the secondary page with links."""
await send_html_response(
send,
@@ -275,7 +268,7 @@ async def secondary_index_endpoint(send: Send) -> None:
)
-async def incapsula_endpoint(send: Send) -> None:
+async def incapsula_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle requests for a page with an incapsula iframe."""
await send_html_response(
send,
@@ -283,7 +276,7 @@ async def incapsula_endpoint(send: Send) -> None:
)
-async def generic_response_endpoint(send: Send) -> None:
+async def generic_response_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle requests with a generic HTML response."""
await send_html_response(
send,
@@ -291,7 +284,7 @@ async def generic_response_endpoint(send: Send) -> None:
)
-async def redirect_to_url(scope: dict[str, Any], send: Send) -> None:
+async def redirect_to_url(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle requests that should redirect to a specified full URL."""
query_params = get_query_params(scope.get('query_string', b''))
@@ -311,14 +304,14 @@ async def redirect_to_url(scope: dict[str, Any], send: Send) -> None:
await send({'type': 'http.response.body', 'body': f'Redirecting to {target_url}...'.encode()})
-async def echo_user_agent(scope: dict[str, Any], send: Send) -> None:
+async def echo_user_agent(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Echo back the user agent header as a response."""
headers = get_headers_dict(scope)
user_agent = headers.get('user-agent', 'Not provided')
await send_json_response(send, {'user-agent': user_agent})
-async def get_echo(scope: dict[str, Any], send: Send) -> None:
+async def get_echo(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Echo back GET request details similar to httpbin.org/get."""
path = scope.get('path', '')
query_string = scope.get('query_string', b'')
@@ -343,7 +336,7 @@ async def get_echo(scope: dict[str, Any], send: Send) -> None:
await send_json_response(send, response)
-async def set_complex_cookies(send: Send) -> None:
+async def set_complex_cookies(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle requests to set specific cookies with various attributes."""
headers = [
@@ -366,16 +359,28 @@ async def set_complex_cookies(send: Send) -> None:
await send({'type': 'http.response.body', 'body': b'Cookies have been set!'})
-async def dynamic_content(scope: dict[str, Any], send: Send) -> None:
- """Handle requests to serve HTML-page with dynamic content received in the request."""
+async def echo_content(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
+ """Echo back content (plain text or base64) with specified content-type."""
query_params = get_query_params(scope.get('query_string', b''))
content = query_params.get('content', '')
+ base64_content = query_params.get('base64', '')
+ c_type = query_params.get('c_type', 'text/html; charset=utf-8')
+
+ out_content = base64.b64decode(base64_content) if base64_content else content.encode()
+
+ await send(
+ {
+ 'type': 'http.response.start',
+ 'status': 200,
+ 'headers': [[b'content-type', c_type.encode()]],
+ }
+ )
- await send_html_response(send, html_content=content.encode())
+ await send({'type': 'http.response.body', 'body': out_content})
-async def robots_txt(send: Send) -> None:
+async def robots_txt(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle requests for the robots.txt file."""
await send_html_response(send, ROBOTS_TXT)