refactor(api): Rewrite PDFGenerator

2025-06-09 14:23:10 +00:00
parent 7c3e02f347
commit 0327a230bb
1 changed files with 204 additions and 0 deletions
@@ -0,0 +1,204 @@
+from base64 import b64encode
+from io import BytesIO
+from pathlib import Path
+from tempfile import NamedTemporaryFile, _TemporaryFileWrapper
+from typing import List, cast
+
+from bs4 import BeautifulSoup, Tag
+from exiftool import ExifTool
+from jinja2 import Template
+from weasyprint import CSS, HTML
+from weasyprint.text.fonts import FontConfiguration
+
+from ..models import Story
+from .types import AbstractGenerator
+
+# Directory next to this module that holds the stylesheet, HTML template and ExifTool config.
+DATA_PATH = Path(__file__).parent.joinpath("pdf")
+# Sub-directory of DATA_PATH containing the image assets referenced by COPYRIGHT_DATA.
+ASSET_PATH = DATA_PATH.joinpath("assets")
+
+# Maps Wattpad Copyright IDs to their corresponding data.
+# Each entry carries a display name, a statement template (formatted with
+# `published_year` and `username`), two explanatory blurbs, and the path of an
+# image asset (None when the license has no image).
+COPYRIGHT_DATA = {
+    1: dict(
+        name="All Rights Reserved",
+        statement="©️ {published_year} by {username}. All Rights Reserved.",
+        freedoms="No reuse, redistribution, or modification without permission.",
+        printing="Not allowed without explicit permission.",
+        asset=None,
+    ),
+    2: dict(
+        name="Public Domain",
+        statement="This work is in the public domain. Originally published in {published_year} by {username}.",
+        freedoms="Free to use for any purpose without permission.",
+        printing="Allowed for personal or commercial purposes.",
+        asset=ASSET_PATH / "cc-zero.png",
+    ),
+    3: dict(
+        name="Creative Commons Attribution (CC-BY)",
+        statement="©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution 4.0 International License.",
+        freedoms="Allows reuse, redistribution, and modification with credit to the author.",
+        printing="Allowed with proper credit.",
+        asset=ASSET_PATH / "by.png",
+    ),
+    4: dict(
+        name="CC Attribution NonCommercial (CC-BY-NC)",
+        statement="©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.",
+        freedoms="Allows reuse and modification for non-commercial purposes with credit.",
+        printing="Allowed for non-commercial purposes with proper credit.",
+        asset=ASSET_PATH / "by-nc.png",
+    ),
+    5: dict(
+        name="CC Attribution NonCommercial NoDerivs (CC-BY-NC-ND)",
+        statement="©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NonCommercial-NoDerivs 4.0 International License.",
+        freedoms="Allows sharing in original form for non-commercial purposes with credit; no modifications allowed.",
+        printing="Allowed for non-commercial purposes in original form with proper credit.",
+        asset=ASSET_PATH / "by-nc-nd.png",
+    ),
+    6: dict(
+        name="CC Attribution NonCommercial ShareAlike (CC-BY-NC-SA)",
+        statement="©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.",
+        freedoms="Allows reuse and modification for non-commercial purposes under the same license, with credit.",
+        printing="Allowed for non-commercial purposes with proper credit under the same license.",
+        asset=ASSET_PATH / "by-nc-sa.png",
+    ),
+    7: dict(
+        name="CC Attribution ShareAlike (CC-BY-SA)",
+        statement="©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.",
+        freedoms="Allows reuse and modification for any purpose under the same license, with credit.",
+        printing="Allowed with proper credit under the same license.",
+        asset=ASSET_PATH / "by-sa.png",
+    ),
+    8: dict(
+        name="CC Attribution NoDerivs (CC-BY-ND)",
+        statement="©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NoDerivs 4.0 International License.",
+        freedoms="Allows sharing in original form for any purpose with credit; no modifications allowed.",
+        printing="Allowed in original form with proper credit.",
+        asset=ASSET_PATH / "by-nd.png",
+    ),
+}
+
+# Both files are read once at import time. The encoding is pinned to UTF-8 so
+# decoding does not depend on the locale default of the host running the API.
+
+# CSS text handed to WeasyPrint in PDFGenerator.generate_pdf.
+STYLESHEET = (DATA_PATH / "stylesheet.css").read_text(encoding="utf-8")
+
+
+# Jinja2 template source rendered in PDFGenerator.populate_template.
+TEMPLATE = (DATA_PATH / "book.html").read_text(encoding="utf-8")
+
+
+class PDFGenerator(AbstractGenerator):
+    """Render a story into a PDF stored in a named temporary file.
+
+    ``compile()`` runs the whole pipeline (chapters -> template -> PDF ->
+    metadata); ``dump()`` then returns the finished file as a ``BytesIO`` and
+    closes the temporary file.
+    """
+
+    def __init__(
+        self,
+        metadata: Story,
+        part_trees: List[BeautifulSoup],
+        cover: bytes,
+        images: List[List[bytes]] | None,
+        author: bytes,
+    ):
+        """Store the inputs and create the temporary output file.
+
+        Args:
+            metadata: Story data; accessed with dict-style lookups in this class.
+            part_trees: One parsed tree per part, paired positionally with
+                ``metadata["parts"]``.
+            cover: Raw cover image bytes; embedded as a base64 data URI.
+            images: Per-part lists of raw image bytes, paired positionally with
+                the ``<img>`` tags of the matching tree. ``None`` (or an empty
+                list) leaves the trees' image sources untouched.
+            author: Raw image bytes passed to the template, base64 encoded, as
+                ``avatar``.
+        """
+        self.story = metadata
+        self.parts = part_trees
+        self.cover = cover
+        self.images = images
+        self.author = author
+
+        self.book: _TemporaryFileWrapper = NamedTemporaryFile(suffix=".pdf")
+        self.content: str = TEMPLATE
+
+    def generate_chapters(self) -> dict[int, str]:
+        """Return a dictionary of part_ids to serialized content trees.
+
+        When images were provided during initialization, the ``src`` of each
+        ``<img>`` tag is replaced in place with a base64 data URI built from
+        the corresponding image bytes.
+        """
+        data: dict[int, str] = {}
+        for idx, (part, tree) in enumerate(zip(self.story["parts"], self.parts)):
+            if self.images:
+                # zip() stops at the shorter side, so surplus tags or surplus
+                # images are left alone.
+                for img_data, img_tag in zip(self.images[idx], tree.find_all("img")):
+                    encoded = b64encode(img_data).decode()
+                    img_tag["src"] = f"data:image/jpg;base64,{encoded}"
+
+            # str() rather than prettify(): prettify() inserts newlines and
+            # indentation around every tag, which shows up as stray spaces
+            # around inline markup once the HTML is laid out.
+            data[part["id"]] = str(tree)
+
+        return data
+
+    def populate_template(self, parts: dict[int, str]):
+        """Populate HTML Template with Story data and store it in self.content.
+
+        Args:
+            parts: Mapping of part id to that part's HTML, as returned by
+                ``generate_chapters``.
+
+        Raises:
+            KeyError: If the story's copyright id is not in ``COPYRIGHT_DATA``.
+        """
+        license_info = COPYRIGHT_DATA[self.story["copyright"]]
+        username = self.story["user"]["username"]
+        # The text before the first "-" of createDate is used as the year.
+        published_year = self.story["createDate"].split("-", 2)[0]
+
+        asset = license_info["asset"]
+        license_image = b64encode(asset.read_bytes()).decode() if asset else ""
+
+        data = {
+            "statement": license_info["statement"].format(
+                username=username,
+                published_year=published_year,
+            ),
+            "author": username,
+            "freedoms": license_info["freedoms"],
+            "printing": license_info["printing"],
+            "book_id": self.story["id"],
+            "book_title": self.story["title"],
+            "cover": f"data:image/jpg;base64,{b64encode(self.cover).decode()}",
+            "username": username,
+            "description": self.story["description"],
+            "avatar": b64encode(self.author).decode(),
+            "copyright": {
+                "data": license_image,
+                "name": license_info["name"],
+            },
+            "parts": parts,
+        }
+
+        # NOTE(review): Template() is built without autoescaping, so title,
+        # description and username reach the HTML verbatim unless book.html
+        # escapes them itself - confirm the template handles markup there.
+        self.content = Template(self.content).render(data)
+
+    def generate_pdf(self):
+        """Generate and write the PDF to a temporary file (self.book)."""
+        # The same FontConfiguration is given to both the stylesheet and
+        # write_pdf.
+        font_config = FontConfiguration()
+        stylesheet_obj = CSS(string=STYLESHEET, font_config=font_config)
+
+        html_obj = HTML(string=self.content)
+        html_obj.write_pdf(
+            self.book.name, stylesheets=[stylesheet_obj], font_config=font_config
+        )
+
+    def add_metadata(self):
+        """Write metadata to generated PDF file at self.book, using ExifTool."""
+
+        # exiftool doesn't parse \ns correctly, they support $/ for the same
+        # instead. `&#xa;` is another option.
+        clean_description = self.story["description"].strip().replace("\n", "$/")
+
+        metadata = {
+            "Author": self.story["user"]["username"],
+            "Title": self.story["title"],
+            "Subject": clean_description,
+            "CreationDate": self.story["createDate"],
+            "ModDate": self.story["modifyDate"],
+            "Keywords": ",".join(self.story["tags"]),
+            "Language": self.story["language"]["name"],
+            "Completed": self.story["completed"],
+            "MatureContent": self.story["mature"],
+            "Producer": "Dhanush Rambhatla (TheOnlyWayUp - https://rambhat.la) and WattpadDownloader",
+        }  # As per https://exiftool.org/TagNames/PDF.html
+
+        tag_args = [f"-{key}={value}" for key, value in metadata.items()]
+
+        with ExifTool(config_file=DATA_PATH / "exiftool.config") as et:
+            # Custom configuration adds Completed and MatureContent tags.
+            # exiftool logger logs executed command
+            #
+            # self.book.name is the path recorded by NamedTemporaryFile (the
+            # same one generate_pdf writes to); the wrapped file object's own
+            # .name is not guaranteed to be a path on every Python version.
+            et.execute(*tag_args, "-overwrite_original", self.book.name)
+
+    def compile(self) -> bool:
+        """Run the full pipeline: chapters, template, PDF, then metadata.
+
+        Returns:
+            Always True; failures surface as exceptions from the steps.
+        """
+        parts = self.generate_chapters()
+        self.populate_template(parts)
+        self.generate_pdf()
+        self.add_metadata()
+        return True
+
+    def dump(self) -> BytesIO:
+        """Return the PDF as an in-memory buffer and close the temporary file.
+
+        The bytes are read from the file's path, not through the handle opened
+        in __init__: ExifTool's -overwrite_original replaces the file by
+        renaming a temporary file over it, so that handle may still refer to
+        the data written before the metadata was added.
+        """
+        try:
+            return BytesIO(Path(self.book.name).read_bytes())
+        finally:
+            # Closing the NamedTemporaryFile also removes it from disk.
+            self.book.close()