From bad99d92ae2ff33f87557eed2676487687659c7d Mon Sep 17 00:00:00 2001
From: luxaeternati <luxaeterna@fedora.email>
Date: Mon, 14 Apr 2025 13:09:57 +0200
Subject: [PATCH 1/8] Add bookwyrm support

Bookwyrm doesn't provide any API access as far as I know, so the book metadata is scraped from the HTML book page.
---
 catalog/sites/bookwyrm.py | 123 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)
 create mode 100644 catalog/sites/bookwyrm.py

diff --git a/catalog/sites/bookwyrm.py b/catalog/sites/bookwyrm.py
new file mode 100644
index 000000000..2222ca4d1
--- /dev/null
+++ b/catalog/sites/bookwyrm.py
@@ -0,0 +1,123 @@
+import re
+from urllib.parse import urlparse
+
+from lxml.html import fromstring
+
+from catalog.common import *
+from catalog.models import Edition
+from common.models import detect_language
+
+
+@SiteManager.register
+class Bookwyrm(AbstractSite):
+    SITE_NAME = SiteName.Bookwyrm
+    ID_TYPE = IdType.Bookwyrm
+    DEFAULT_MODEL = Edition
+    URL_PATTERNS = []
+
+    @classmethod
+    def id_to_url(cls, id_value):
+        return id_value
+
+    @classmethod
+    def url_to_id(cls, url: str):
+        return url
+
+    @classmethod
+    def validate_url_fallback(cls, url: str):
+        probe_url = "https://" + urlparse(url).hostname + "/nodeinfo/2.0"  # type: ignore
+        software = (
+            CachedDownloader(probe_url).download().json().get("software").get("name")
+        )
+        if software == "bookwyrm":
+            return True
+        else:
+            return False
+
+    def scrape(self, response=None):
+        """Scrape a BookWyrm book page into a ``ResourceContent``.
+
+        Downloads the page at ``self.id_value``, parses it as HTML and reads
+        fields from its ``itemprop`` markup. Empty optional fields are stored
+        as ``None``; publication year/month are simply omitted.
+
+        :param response: not used by this implementation.
+        :return: ``ResourceContent`` with the metadata dict and, when the page
+            lists one, an ISBN lookup id.
+        """
+        from urllib.parse import urljoin  # module level imports only urlparse
+
+        tree = fromstring(BasicDownloader(self.id_value).download().text)
+
+        def text_of(path):
+            # Joined and stripped text/attribute values selected by ``path``.
+            return "".join(map(str, tree.xpath(path))).strip()  # type: ignore
+
+        title = text_of("//h1[contains(@itemprop,'name')]//text()")
+        author = tree.xpath("//a[contains(@itemprop,'author')]//text()")
+        isbn = text_of("//dd[contains(@itemprop,'isbn')]//text()").replace("-", "")
+        pub_date = text_of(
+            "//meta[contains(@itemprop,'datePublished')]/@content"
+        ).split("-")
+        pub_house = text_of("//meta[contains(@itemprop,'publisher')]/@content")
+        pages = text_of("//meta[contains(@itemprop,'numberOfPages')]/@content")
+        brief = text_of("//div[contains(@itemprop,'abstract')]//text()")
+        subtitle = text_of(
+            "//meta[contains(@itemprop,'alternativeHeadline')]/@content"
+        )
+        series = text_of(
+            "//span[contains(@itemprop,'isPartOf')]"
+            "//span[contains(@itemprop,'name')]//text()"
+        )
+
+        # A page without a cover image must not abort the scrape: indexing
+        # [0] on an empty xpath result raised IndexError.
+        covers = tree.xpath("//img[contains(@class,'book-cover')]/@src")
+        cover_src = str(covers[0]).strip() if covers else ""  # type: ignore
+
+        lang = detect_language(title + " " + brief) if brief else detect_language(title)
+
+        data = {}
+
+        # urljoin keeps absolute URLs (any scheme) as they are and resolves
+        # relative ones against the book page URL.
+        data["cover_image_url"] = (
+            urljoin(self.id_value, cover_src) if cover_src else None
+        )
+
+        # datePublished is split on "-"; only a three-part value is used.
+        if len(pub_date) == 3:
+            data["pub_year"] = pub_date[0]
+            data["pub_month"] = pub_date[1]
+
+        data["pub_house"] = pub_house if pub_house else None
+
+        data["pages"] = pages if pages else None
+
+        data["isbn"] = isbn if isbn else None
+
+        data["series"] = series if series else None
+
+        data["author"] = author
+
+        data["localized_title"] = [{"lang": lang, "text": title}]
+
+        data["localized_subtitle"] = (
+            [{"lang": lang, "text": subtitle}] if subtitle else None
+        )
+
+        data["localized_description"] = (
+            [{"lang": lang, "text": brief}] if brief else None
+        )
+
+        # Only pass an ISBN lookup id when the page has one, so an empty
+        # string is never registered as an id.
+        # NOTE(review): assumes ResourceContent accepts an empty lookup_ids
+        # dict -- confirm against catalog.common.
+        lookup_ids = {IdType.ISBN: isbn} if isbn else {}
+
+        pd = ResourceContent(
+            metadata=data,
+            lookup_ids=lookup_ids,
+        )
+        return pd

From b891a41ca4e52f438771f5b271f0ef7e4bd9e7b6 Mon Sep 17 00:00:00 2001
From: luxaeternati <luxaeterna@fedora.email>
Date: Mon, 14 Apr 2025 13:10:41 +0200
Subject: [PATCH 2/8] Update __init__.py

---
 catalog/sites/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/catalog/sites/__init__.py b/catalog/sites/__init__.py
index 9f2acf3f6..6e0d81221 100644
--- a/catalog/sites/__init__.py
+++ b/catalog/sites/__init__.py
@@ -7,6 +7,7 @@
 from .bgg import BoardGameGeek
 from .bibliotek_dk import BibliotekDK_Edition, BibliotekDK_Work
 from .bookstw import BooksTW
+from .bookwyrm import Bookwyrm
 from .discogs import DiscogsMaster, DiscogsRelease
 from .douban_book import DoubanBook
 from .douban_drama import DoubanDrama

From f0c24a0babb58c286152096b0546072f86b7307f Mon Sep 17 00:00:00 2001
From: luxaeternati <luxaeterna@fedora.email>
Date: Mon, 14 Apr 2025 13:11:45 +0200
Subject: [PATCH 3/8] Update models.py

---
 catalog/common/models.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/catalog/common/models.py b/catalog/common/models.py
index 93a9b9ff6..3f533d414 100644
--- a/catalog/common/models.py
+++ b/catalog/common/models.py
@@ -35,6 +35,7 @@ class SiteName(models.TextChoices):
     Goodreads = "goodreads", _("Goodreads")
     GoogleBooks = "googlebooks", _("Google Books")
     BooksTW = "bookstw", _("BooksTW")
+    Bookwyrm = "bookwyrm", _("Bookwyrm")
     BibliotekDK = "bibliotekdk", _("Bibliotek.dk")
     BibliotekDK_eReolen = "eReolen", _("eReolen.dk")
     IMDB = "imdb", _("IMDb")
@@ -82,6 +83,7 @@ class IdType(models.TextChoices):
     DoubanDrama = "doubandrama", _("Douban Drama")
     DoubanDramaVersion = "doubandrama_version", _("Douban Drama Version")
     BooksTW = "bookstw", _("BooksTW Book")
+    Bookwyrm = "bookwyrm", _("Bookwyrm")
     BibliotekDK_Edition = "bibliotekdk_edition", _("Bibliotek.dk")
     BibliotekDK_eReolen = "bibliotekdk_ereolen", _("eReolen.dk")
     BibliotekDK_Work = "bibliotekdk_work", _("Bibliotek.dk")

From 4ecb07fb90db99f183ab64ed35de23cecb6fa732 Mon Sep 17 00:00:00 2001
From: luxaeternati <luxaeterna@fedora.email>
Date: Mon, 14 Apr 2025 13:56:15 +0200
Subject: [PATCH 4/8] fix: add url check

---
 catalog/sites/bookwyrm.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/catalog/sites/bookwyrm.py b/catalog/sites/bookwyrm.py
index 2222ca4d1..d94eeaf29 100644
--- a/catalog/sites/bookwyrm.py
+++ b/catalog/sites/bookwyrm.py
@@ -25,12 +25,17 @@ def url_to_id(cls, url: str):
 
     @classmethod
     def validate_url_fallback(cls, url: str):
-        probe_url = "https://" + urlparse(url).hostname + "/nodeinfo/2.0"  # type: ignore
+        parsed = urlparse(url)
+        probe_url = "https://" + parsed.hostname + "/nodeinfo/2.0"  # type: ignore
         software = (
             CachedDownloader(probe_url).download().json().get("software").get("name")
         )
         if software == "bookwyrm":
-            return True
+            p = parsed.path
+            if re.compile("^/book/[0-9]+").match(p):
+                return True
+            else:
+                return False
         else:
             return False
 

From 0a643247d094d93c1dc872f966c1bd216817eda7 Mon Sep 17 00:00:00 2001
From: luxaeternati <luxaeterna@fedora.email>
Date: Mon, 14 Apr 2025 17:52:58 +0200
Subject: [PATCH 5/8] fix: actually add the model

---
 catalog/sites/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/catalog/sites/__init__.py b/catalog/sites/__init__.py
index 6e0d81221..291634f22 100644
--- a/catalog/sites/__init__.py
+++ b/catalog/sites/__init__.py
@@ -38,6 +38,7 @@
     "BibliotekDK_Work",
     "BoardGameGeek",
     "BooksTW",
+    "Bookwyrm",
     "DiscogsMaster",
     "DiscogsRelease",
     "DoubanBook",

From 6f2638d0eabda85436d432912a500a53348ff323 Mon Sep 17 00:00:00 2001
From: luxaeternati <luxaeterna@fedora.email>
Date: Wed, 16 Apr 2025 00:20:59 +0200
Subject: [PATCH 6/8] handle redirection

Settle on a fixed id (https://<host>/book/<number>) that does not vary with Bookwyrm's URL redirection.
---
 catalog/sites/bookwyrm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/catalog/sites/bookwyrm.py b/catalog/sites/bookwyrm.py
index d94eeaf29..39c5930a2 100644
--- a/catalog/sites/bookwyrm.py
+++ b/catalog/sites/bookwyrm.py
@@ -21,8 +21,8 @@ def id_to_url(cls, id_value):
 
     @classmethod
     def url_to_id(cls, url: str):
-        return url
-
+        return "https://" + urlparse(url).hostname + "/book/" + re.search(r'/book/(\d+)', url).group(1)
+        
     @classmethod
     def validate_url_fallback(cls, url: str):
         parsed = urlparse(url)

From c331c79104b1a6f80d5cb52a1006b5e18d4c7554 Mon Sep 17 00:00:00 2001
From: luxaeternati <luxaeterna@fedora.email>
Date: Wed, 16 Apr 2025 00:25:53 +0200
Subject: [PATCH 7/8] fix: deal with pyright

---
 catalog/sites/bookwyrm.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/catalog/sites/bookwyrm.py b/catalog/sites/bookwyrm.py
index 39c5930a2..245a702bd 100644
--- a/catalog/sites/bookwyrm.py
+++ b/catalog/sites/bookwyrm.py
@@ -21,7 +21,12 @@ def id_to_url(cls, id_value):
 
     @classmethod
     def url_to_id(cls, url: str):
-        return "https://" + urlparse(url).hostname + "/book/" + re.search(r'/book/(\d+)', url).group(1)
+        return (
+            "https://"
+            + urlparse(url).hostname  # type: ignore
+            + "/book/"
+            + re.search(r"/book/(\d+)", url).group(1)  # type: ignore
+        )
         
     @classmethod
     def validate_url_fallback(cls, url: str):

From 9740891cae05922d2b1d2f842aecfb1b35085e3c Mon Sep 17 00:00:00 2001
From: luxaeternati <luxaeterna@fedora.email>
Date: Wed, 16 Apr 2025 13:54:05 +0200
Subject: [PATCH 8/8] fix: handle exception

---
 catalog/sites/bookwyrm.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/catalog/sites/bookwyrm.py b/catalog/sites/bookwyrm.py
index 245a702bd..de7e75530 100644
--- a/catalog/sites/bookwyrm.py
+++ b/catalog/sites/bookwyrm.py
@@ -32,9 +32,12 @@ def url_to_id(cls, url: str):
     def validate_url_fallback(cls, url: str):
         parsed = urlparse(url)
         probe_url = "https://" + parsed.hostname + "/nodeinfo/2.0"  # type: ignore
-        software = (
-            CachedDownloader(probe_url).download().json().get("software").get("name")
-        )
+        try:
+            software = (
+                CachedDownloader(probe_url).download().json().get("software").get("name")
+            )
+        except Exception:
+            return False
         if software == "bookwyrm":
             p = parsed.path
             if re.compile("^/book/[0-9]+").match(p):