From bad99d92ae2ff33f87557eed2676487687659c7d Mon Sep 17 00:00:00 2001 From: luxaeternati Date: Mon, 14 Apr 2025 13:09:57 +0200 Subject: [PATCH 1/8] Add bookwyrm support Bookwyrm doesn't provide any API access as far as I know. --- catalog/sites/bookwyrm.py | 123 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 catalog/sites/bookwyrm.py diff --git a/catalog/sites/bookwyrm.py b/catalog/sites/bookwyrm.py new file mode 100644 index 000000000..2222ca4d1 --- /dev/null +++ b/catalog/sites/bookwyrm.py @@ -0,0 +1,123 @@ +import re +from urllib.parse import urlparse + +from lxml.html import fromstring + +from catalog.common import * +from catalog.models import Edition +from common.models import detect_language + + +@SiteManager.register +class Bookwyrm(AbstractSite): + SITE_NAME = SiteName.Bookwyrm + ID_TYPE = IdType.Bookwyrm + DEFAULT_MODEL = Edition + URL_PATTERNS = [] + + @classmethod + def id_to_url(cls, id_value): + return id_value + + @classmethod + def url_to_id(cls, url: str): + return url + + @classmethod + def validate_url_fallback(cls, url: str): + probe_url = "https://" + urlparse(url).hostname + "/nodeinfo/2.0" # type: ignore + software = ( + CachedDownloader(probe_url).download().json().get("software").get("name") + ) + if software == "bookwyrm": + return True + else: + return False + + def scrape(self, response=None): + r = BasicDownloader(self.id_value).download() + tree = fromstring(r.text) + data = {} + title = "".join(tree.xpath("//h1[contains(@itemprop,'name')]//text()")).strip() # type: ignore + + author = tree.xpath("//a[contains(@itemprop,'author')]//text()") + isbn = "".join(tree.xpath("//dd[contains(@itemprop,'isbn')]//text()")).replace( # type: ignore + "-", "" + ) + + pub_date = ( + "".join( + map( + str, + tree.xpath("//meta[contains(@itemprop,'datePublished')]/@content"), # type: ignore + ) + ) + .strip() + .split("-") + ) + + pub_house = "".join( + map(str, tree.xpath("//meta[contains(@itemprop,'publisher')]/@content")) # type: ignore + ).strip() + + cover_src = tree.xpath("//img[contains(@class,'book-cover')]/@src")[0] # type: ignore + pages = "".join( + map(str, tree.xpath("//meta[contains(@itemprop,'numberOfPages')]/@content")) # type: ignore + ).strip() + + brief = "".join( + tree.xpath("//div[contains(@itemprop,'abstract')]//text()") # type: ignore + ).strip() + + subtitle = "".join( + map( + str, + tree.xpath( + "//meta[contains(@itemprop,'alternativeHeadline')]/@content" # type: ignore + ), + ) + ).strip() + + series = "".join( + tree.xpath( + "//span[contains(@itemprop,'isPartOf')]//span[contains(@itemprop,'name')]//text()" # type: ignore + ) + ).strip() + + lang = detect_language(title + " " + brief) if brief else detect_language(title) + + book_base = "https://" + urlparse(self.id_value).hostname # type: ignore + if re.compile("^https://").match(cover_src): # type: ignore + data["cover_image_url"] = cover_src + else: + data["cover_image_url"] = book_base + cover_src if cover_src else None # type: ignore + + if len(pub_date) == 3: + data["pub_year"] = pub_date[0] + data["pub_month"] = pub_date[1] + + data["pub_house"] = pub_house if pub_house else None + + data["pages"] = pages if pages else None + + data["isbn"] = isbn if isbn else None + + data["series"] = series if series else None + + data["author"] = author + + data["localized_title"] = [{"lang": lang, "text": title}] + + data["localized_subtitle"] = ( + [{"lang": lang, "text": subtitle}] if subtitle else None + ) + + data["localized_description"] = ( + [{"lang": lang, "text": brief}] if brief else None + ) + + pd = ResourceContent( + metadata=data, + lookup_ids={IdType.ISBN: isbn}, + ) + return pd From b891a41ca4e52f438771f5b271f0ef7e4bd9e7b6 Mon Sep 17 00:00:00 2001 From: luxaeternati Date: Mon, 14 Apr 2025 13:10:41 +0200 Subject: [PATCH 2/8] Update __init__.py --- catalog/sites/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/catalog/sites/__init__.py b/catalog/sites/__init__.py index 9f2acf3f6..6e0d81221 100644 --- a/catalog/sites/__init__.py +++ b/catalog/sites/__init__.py @@ -7,6 +7,7 @@ from .bgg import BoardGameGeek from .bibliotek_dk import BibliotekDK_Edition, BibliotekDK_Work from .bookstw import BooksTW +from .bookwyrm import Bookwyrm from .discogs import DiscogsMaster, DiscogsRelease from .douban_book import DoubanBook from .douban_drama import DoubanDrama From f0c24a0babb58c286152096b0546072f86b7307f Mon Sep 17 00:00:00 2001 From: luxaeternati Date: Mon, 14 Apr 2025 13:11:45 +0200 Subject: [PATCH 3/8] Update models.py --- catalog/common/models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/catalog/common/models.py b/catalog/common/models.py index 93a9b9ff6..3f533d414 100644 --- a/catalog/common/models.py +++ b/catalog/common/models.py @@ -35,6 +35,7 @@ class SiteName(models.TextChoices): Goodreads = "goodreads", _("Goodreads") GoogleBooks = "googlebooks", _("Google Books") BooksTW = "bookstw", _("BooksTW") + Bookwyrm = "bookwyrm", _("Bookwyrm") BibliotekDK = "bibliotekdk", _("Bibliotek.dk") BibliotekDK_eReolen = "eReolen", _("eReolen.dk") IMDB = "imdb", _("IMDb") @@ -82,6 +83,7 @@ class IdType(models.TextChoices): DoubanDrama = "doubandrama", _("Douban Drama") DoubanDramaVersion = "doubandrama_version", _("Douban Drama Version") BooksTW = "bookstw", _("BooksTW Book") + Bookwyrm = "bookwyrm", _("Bookwyrm") BibliotekDK_Edition = "bibliotekdk_edition", _("Bibliotek.dk") BibliotekDK_eReolen = "bibliotekdk_ereolen", _("eReolen.dk") BibliotekDK_Work = "bibliotekdk_work", _("Bibliotek.dk") From 4ecb07fb90db99f183ab64ed35de23cecb6fa732 Mon Sep 17 00:00:00 2001 From: luxaeternati Date: Mon, 14 Apr 2025 13:56:15 +0200 Subject: [PATCH 4/8] fix: add url check --- catalog/sites/bookwyrm.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/catalog/sites/bookwyrm.py b/catalog/sites/bookwyrm.py index 2222ca4d1..d94eeaf29 100644 --- a/catalog/sites/bookwyrm.py +++ b/catalog/sites/bookwyrm.py @@ -25,12 +25,17 @@ def url_to_id(cls, url: str): @classmethod def validate_url_fallback(cls, url: str): - probe_url = "https://" + urlparse(url).hostname + "/nodeinfo/2.0" # type: ignore + parsed = urlparse(url) + probe_url = "https://" + parsed.hostname + "/nodeinfo/2.0" # type: ignore software = ( CachedDownloader(probe_url).download().json().get("software").get("name") ) if software == "bookwyrm": - return True + p = parsed.path + if re.compile("^/book/[0-9]+").match(p): + return True + else: + return False else: return False From 0a643247d094d93c1dc872f966c1bd216817eda7 Mon Sep 17 00:00:00 2001 From: luxaeternati Date: Mon, 14 Apr 2025 17:52:58 +0200 Subject: [PATCH 5/8] fix: actually add the model --- catalog/sites/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/catalog/sites/__init__.py b/catalog/sites/__init__.py index 6e0d81221..291634f22 100644 --- a/catalog/sites/__init__.py +++ b/catalog/sites/__init__.py @@ -38,6 +38,7 @@ "BibliotekDK_Work", "BoardGameGeek", "BooksTW", + "Bookwyrm", "DiscogsMaster", "DiscogsRelease", "DoubanBook", From 6f2638d0eabda85436d432912a500a53348ff323 Mon Sep 17 00:00:00 2001 From: luxaeternati Date: Wed, 16 Apr 2025 00:20:59 +0200 Subject: [PATCH 6/8] handle redirection settle with a fixed id that doesn't vary according to bookwyrm redirection --- catalog/sites/bookwyrm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/catalog/sites/bookwyrm.py b/catalog/sites/bookwyrm.py index d94eeaf29..39c5930a2 100644 --- a/catalog/sites/bookwyrm.py +++ b/catalog/sites/bookwyrm.py @@ -21,8 +21,8 @@ def id_to_url(cls, id_value): @classmethod def url_to_id(cls, url: str): - return url - + return "https://" + urlparse(url).hostname + "/book/" + re.search(r'/book/(\d+)', url).group(1) + @classmethod def validate_url_fallback(cls, url: str): parsed = urlparse(url) From c331c79104b1a6f80d5cb52a1006b5e18d4c7554 Mon Sep 17 00:00:00 2001 From: luxaeternati Date: Wed, 16 Apr 2025 00:25:53 +0200 Subject: [PATCH 7/8] fix: deal with pyright --- catalog/sites/bookwyrm.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/catalog/sites/bookwyrm.py b/catalog/sites/bookwyrm.py index 39c5930a2..245a702bd 100644 --- a/catalog/sites/bookwyrm.py +++ b/catalog/sites/bookwyrm.py @@ -21,7 +21,12 @@ def id_to_url(cls, id_value): @classmethod def url_to_id(cls, url: str): - return "https://" + urlparse(url).hostname + "/book/" + re.search(r'/book/(\d+)', url).group(1) + return ( + "https://" + + urlparse(url).hostname # type: ignore + + "/book/" + + re.search(r"/book/(\d+)", url).group(1) # type: ignore + ) @classmethod def validate_url_fallback(cls, url: str): From 9740891cae05922d2b1d2f842aecfb1b35085e3c Mon Sep 17 00:00:00 2001 From: luxaeternati Date: Wed, 16 Apr 2025 13:54:05 +0200 Subject: [PATCH 8/8] fix: handle exception --- catalog/sites/bookwyrm.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/catalog/sites/bookwyrm.py b/catalog/sites/bookwyrm.py index 245a702bd..de7e75530 100644 --- a/catalog/sites/bookwyrm.py +++ b/catalog/sites/bookwyrm.py @@ -32,9 +32,12 @@ def url_to_id(cls, url: str): def validate_url_fallback(cls, url: str): parsed = urlparse(url) probe_url = "https://" + parsed.hostname + "/nodeinfo/2.0" # type: ignore - software = ( - CachedDownloader(probe_url).download().json().get("software").get("name") - ) + try: + software = ( + CachedDownloader(probe_url).download().json().get("software").get("name") + ) + except Exception: + return False if software == "bookwyrm": p = parsed.path if re.compile("^/book/[0-9]+").match(p):