feat(api): Use new Parser and Generators

2025-06-09 14:43:18 +00:00
parent a790021057
commit 3853e0d586
6 changed files with 271 additions and 668 deletions
@@ -1,116 +1,108 @@
 from io import BytesIO
-from typing import List
+from typing import Generator, List

-import bs4
-from aiohttp_client_cache.session import CachedSession
+from bs4 import BeautifulSoup
 from ebooklib import epub

 from ..models import Story
-
-headers = {}
+from .types import AbstractGenerator


-class EPUBGenerator:
-    """EPUB Generation utilities"""
-
-    def __init__(self, data: Story, cover: bytes):
-        """Initialize EPUBGenerator. Create epub.EpubBook() and set metadata and cover."""
-        self.epub = epub.EpubBook()
-        self.data = data
+class EPUBGenerator(AbstractGenerator):
+    def __init__(
+        self,
+        metadata: Story,
+        part_trees: List[BeautifulSoup],
+        cover: bytes,
+        images: List[Generator[bytes, None, None]] | None,  # indexed by part; all three params spelled out because Generator[bytes] raises TypeError before Python 3.13
+    ):
+        self.story = metadata
+        self.parts = part_trees
        self.cover = cover
+        self.images = images

-        # set metadata, defined in https://www.dublincore.org/specifications/dublin-core/dcmi-terms/#section-2
-        self.epub.add_author(data["user"]["username"])
+        self.book: epub.EpubBook = epub.EpubBook()

-        self.epub.add_metadata("DC", "title", data["title"])
-        self.epub.add_metadata("DC", "description", data["description"])
-        self.epub.add_metadata("DC", "date", data["createDate"])
-        self.epub.add_metadata("DC", "modified", data["modifyDate"])
-        self.epub.add_metadata("DC", "language", data["language"]["name"])
+    def add_metadata(self):
+        """Add the author, the Dublin Core ("DC") fields and the custom tags/mature/completed meta entries to the epub."""
+        self.book.add_author(self.story["user"]["username"])

-        self.epub.add_metadata(
-            None, "meta", "", {"name": "tags", "content": ", ".join(data["tags"])}
+        self.book.add_metadata("DC", "title", self.story["title"])
+        self.book.add_metadata("DC", "description", self.story["description"])
+        self.book.add_metadata("DC", "date", self.story["createDate"])
+        self.book.add_metadata("DC", "modified", self.story["modifyDate"])
+        self.book.add_metadata("DC", "language", self.story["language"]["name"])
+
+        self.book.add_metadata(
+            None, "meta", "", {"name": "tags", "content": ", ".join(self.story["tags"])}
        )
-        self.epub.add_metadata(
-            None, "meta", "", {"name": "mature", "content": str(int(data["mature"]))}
-        )
-        self.epub.add_metadata(
+        self.book.add_metadata(
            None,
            "meta",
            "",
-            {"name": "completed", "content": str(int(data["completed"]))},
+            {"name": "mature", "content": str(int(self.story["mature"]))},
+        )
+        self.book.add_metadata(
+            None,
+            "meta",
+            "",
+            {"name": "completed", "content": str(int(self.story["completed"]))},
        )

-        # Set cover
-        self.epub.set_cover("cover.jpg", cover)
+    def add_cover(self):
+        """Register self.cover as "cover.jpg" and add a title page (titlepage.xhtml) that displays it."""
+        self.book.set_cover("cover.jpg", self.cover)
        cover_chapter = epub.EpubHtml(
            file_name="titlepage.xhtml",  # Standard for cover page
        )
        cover_chapter.set_content('<img src="cover.jpg">')
-        self.epub.add_item(cover_chapter)
+        self.book.add_item(cover_chapter)

-    async def add_chapters(
-        self, contents: List[bs4.Tag], download_images: bool = False
-    ):
-        """Add chapters to the Epub, downloading images if necessary. Sets the table of contents and spine."""
-        chapters: List[epub.EpubHtml] = []
+    def add_chapters(self):
+        """Add one chapter per part, then set the table of contents and spine; when images were provided at initialization, each <img> src is replaced with the path of the packed image."""
+        chapters = []

-        for cidx, (part, content) in enumerate(zip(self.data["parts"], contents)):
-            title = part["title"]
-
-            # Thanks https://eu17.proxysite.com/process.php?d=5VyWYcoQl%2BVF0BYOuOavtvjOloFUZz2BJ%2Fepiusk6Nz7PV%2B9i8rs7cFviGftrBNll%2B0a3qO7UiDkTt4qwCa0fDES&b=1
+        for idx, (part, tree) in enumerate(zip(self.story["parts"], self.parts)):
            chapter = epub.EpubHtml(
-                title=title,
-                file_name=f"{cidx}_{part['id']}.xhtml",  # See issue #30
-                lang=self.data["language"]["name"],
-                uid=str(part["id"]).encode(),
+                title=part["title"], file_name=f"{idx}_{part['id']}.xhtml"  # .xhtml restored; the replaced code kept it, citing issue #30
            )

-            str_content = content.prettify()
-            if download_images:  # ! TODO : Download images elsewhere
-                soup = content
+            if self.images:
+                for img_idx, (img_data, img_tag) in enumerate(
+                    zip(self.images[idx], tree.find_all("img"))
+                ):
+                    path = f"static/{idx}_{part['id']}/{img_idx}.jpeg"
+                    img = epub.EpubImage(
+                        media_type="image/jpeg", content=img_data, file_name=path
+                    )
+                    self.book.add_item(img)

-                async with CachedSession(
-                    headers=headers, cache=None
-                ) as session:  # Don't cache images.
-                    for idx, image in enumerate(soup.find_all("img")):
-                        if not image["src"]:
-                            continue
-                        # Find all image tags and filter for those with sources
-
-                        async with session.get(image["src"]) as response:
-                            img = epub.EpubImage(
-                                media_type="image/jpeg",
-                                content=await response.read(),
-                                file_name=f"static/{cidx}/{idx}.jpeg",
-                            )
-                            self.epub.add_item(img)
-                            # Fetch image and pack
-
-                            str_content = str_content.replace(
-                                str(image["src"]), f"static/{cidx}/{idx}.jpeg"
-                            )
-
-            chapter.set_content(str_content)
-            self.epub.add_item(chapter)
+                    img_tag["src"] = path

+            chapter.set_content(tree.prettify())
+            self.book.add_item(chapter)
            chapters.append(chapter)

-            yield title
-
-        self.epub.toc = chapters
+        # ! Review, are these needed? #11
+        self.book.toc = chapters

        # Thanks https://github.com/aerkalov/ebooklib/blob/master/samples/09_create_image/create.py
-        self.epub.add_item(epub.EpubNcx())
-        self.epub.add_item(epub.EpubNav())
+        self.book.add_item(epub.EpubNcx())
+        self.book.add_item(epub.EpubNav())

        # create spine
-        self.epub.spine = ["nav"] + chapters
+        self.book.spine = ["nav"] + chapters
+
+    def compile(self):  # Build the book in order: metadata, cover, then chapters; always returns True.
+        build_steps = (self.add_metadata, self.add_cover, self.add_chapters)
+        for build_step in build_steps:
+            build_step()
+        return True

    def dump(self) -> BytesIO:
        # Thanks https://stackoverflow.com/a/75398222
        buffer = BytesIO()
-        epub.write_epub(buffer, self.epub)
+        epub.write_epub(buffer, self.book)

        buffer.seek(0)

@@ -1,109 +0,0 @@
-from io import BytesIO
-from typing import List
-
-from bs4 import BeautifulSoup
-from ebooklib import epub
-
-from ..models import Story
-from .types import AbstractGenerator
-
-
-class EPUBGenerator(AbstractGenerator):
-    def __init__(
-        self,
-        metadata: Story,
-        part_trees: List[BeautifulSoup],
-        cover: bytes,
-        images: List[List[bytes]] | None,
-    ):
-        self.story = metadata
-        self.parts = part_trees
-        self.cover = cover
-        self.images = images
-
-        self.book: epub.EpubBook = epub.EpubBook()
-
-    def add_metadata(self):
-        """Add metadata to epub."""
-        self.book.add_author(self.story["user"]["username"])
-
-        self.book.add_metadata("DC", "title", self.story["title"])
-        self.book.add_metadata("DC", "description", self.story["description"])
-        self.book.add_metadata("DC", "date", self.story["createDate"])
-        self.book.add_metadata("DC", "modified", self.story["modifyDate"])
-        self.book.add_metadata("DC", "language", self.story["language"]["name"])
-
-        self.book.add_metadata(
-            None, "meta", "", {"name": "tags", "content": ", ".join(self.story["tags"])}
-        )
-        self.book.add_metadata(
-            None,
-            "meta",
-            "",
-            {"name": "mature", "content": str(int(self.story["mature"]))},
-        )
-        self.book.add_metadata(
-            None,
-            "meta",
-            "",
-            {"name": "completed", "content": str(int(self.story["completed"]))},
-        )
-
-    def add_cover(self):
-        """Add cover to epub."""
-        self.book.set_cover("cover.jpg", self.cover)
-        cover_chapter = epub.EpubHtml(
-            file_name="titlepage.xhtml",  # Standard for cover page
-        )
-        cover_chapter.set_content('<img src="cover.jpg">')
-        self.book.add_item(cover_chapter)
-
-    def add_chapters(self):
-        """Add chapters to epub, replacing references to image urls to static image paths if images are provided during initialization."""
-        chapters = []
-
-        for idx, (part, tree) in enumerate(zip(self.story["parts"], self.parts)):
-            chapter = epub.EpubHtml(
-                title=part["title"], file_name=f"{idx}_{part['id']}"
-            )
-
-            if self.images:
-                for img_idx, (img_data, img_tag) in enumerate(
-                    zip(self.images[idx], tree.find_all("img"))
-                ):
-                    path = f"static/{idx}_{part['id']}/{img_idx}.jpeg"
-                    img = epub.EpubImage(
-                        media_type="image/jpeg", content=img_data, file_name=path
-                    )
-                    self.book.add_item(img)
-
-                    img_tag["src"] = path
-
-            chapter.set_content(tree.prettify())
-            self.book.add_item(chapter)
-            chapters.append(chapter)
-
-        # ! Review, are these needed? #11
-        self.book.toc = chapters
-
-        # Thanks https://github.com/aerkalov/ebooklib/blob/master/samples/09_create_image/create.py
-        self.book.add_item(epub.EpubNcx())
-        self.book.add_item(epub.EpubNav())
-
-        # create spine
-        self.book.spine = ["nav"] + chapters
-
-    def compile(self):
-        self.add_metadata()
-        self.add_cover()
-        self.add_chapters()
-        return True
-
-    def dump(self) -> BytesIO:
-        # Thanks https://stackoverflow.com/a/75398222
-        buffer = BytesIO()
-        epub.write_epub(buffer, self.book)
-
-        buffer.seek(0)
-
-        return buffer
@@ -1,204 +0,0 @@
-from base64 import b64encode
-from io import BytesIO
-from pathlib import Path
-from tempfile import NamedTemporaryFile, _TemporaryFileWrapper
-from typing import List, cast
-
-from bs4 import BeautifulSoup, Tag
-from exiftool import ExifTool
-from jinja2 import Template
-from weasyprint import CSS, HTML
-from weasyprint.text.fonts import FontConfiguration
-
-from ..models import Story
-from .types import AbstractGenerator
-
-DATA_PATH = Path(__file__).parent / "pdf"
-ASSET_PATH = DATA_PATH / "assets"
-
-COPYRIGHT_DATA = {
-    1: {
-        "name": "All Rights Reserved",
-        "statement": "©️ {published_year} by {username}. All Rights Reserved.",
-        "freedoms": "No reuse, redistribution, or modification without permission.",
-        "printing": "Not allowed without explicit permission.",
-        "asset": None,
-    },
-    2: {
-        "name": "Public Domain",
-        "statement": "This work is in the public domain. Originally published in {published_year} by {username}.",
-        "freedoms": "Free to use for any purpose without permission.",
-        "printing": "Allowed for personal or commercial purposes.",
-        "asset": ASSET_PATH / "cc-zero.png",
-    },
-    3: {
-        "name": "Creative Commons Attribution (CC-BY)",
-        "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution 4.0 International License.",
-        "freedoms": "Allows reuse, redistribution, and modification with credit to the author.",
-        "printing": "Allowed with proper credit.",
-        "asset": ASSET_PATH / "by.png",
-    },
-    4: {
-        "name": "CC Attribution NonCommercial (CC-BY-NC)",
-        "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.",
-        "freedoms": "Allows reuse and modification for non-commercial purposes with credit.",
-        "printing": "Allowed for non-commercial purposes with proper credit.",
-        "asset": ASSET_PATH / "by-nc.png",
-    },
-    5: {
-        "name": "CC Attribution NonCommercial NoDerivs (CC-BY-NC-ND)",
-        "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NonCommercial-NoDerivs 4.0 International License.",
-        "freedoms": "Allows sharing in original form for non-commercial purposes with credit; no modifications allowed.",
-        "printing": "Allowed for non-commercial purposes in original form with proper credit.",
-        "asset": ASSET_PATH / "by-nc-nd.png",
-    },
-    6: {
-        "name": "CC Attribution NonCommercial ShareAlike (CC-BY-NC-SA)",
-        "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.",
-        "freedoms": "Allows reuse and modification for non-commercial purposes under the same license, with credit.",
-        "printing": "Allowed for non-commercial purposes with proper credit under the same license.",
-        "asset": ASSET_PATH / "by-nc-sa.png",
-    },
-    7: {
-        "name": "CC Attribution ShareAlike (CC-BY-SA)",
-        "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.",
-        "freedoms": "Allows reuse and modification for any purpose under the same license, with credit.",
-        "printing": "Allowed with proper credit under the same license.",
-        "asset": ASSET_PATH / "by-sa.png",
-    },
-    8: {
-        "name": "CC Attribution NoDerivs (CC-BY-ND)",
-        "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NoDerivs 4.0 International License.",
-        "freedoms": "Allows sharing in original form for any purpose with credit; no modifications allowed.",
-        "printing": "Allowed in original form with proper credit.",
-        "asset": ASSET_PATH / "by-nd.png",
-    },
-}  # Maps Wattpad Copyright IDs to their corresponding data.
-
-with open(DATA_PATH / "stylesheet.css") as reader:
-    STYLESHEET = reader.read()
-
-
-with open(DATA_PATH / "book.html") as reader:
-    TEMPLATE = reader.read()
-
-
-class PDFGenerator(AbstractGenerator):
-    def __init__(
-        self,
-        metadata: Story,
-        part_trees: List[BeautifulSoup],
-        cover: bytes,
-        images: List[List[bytes]] | None,
-        author: bytes,
-    ):
-        self.story = metadata
-        self.parts = part_trees
-        self.cover = cover
-        self.images = images
-        self.author = author
-
-        self.book: _TemporaryFileWrapper = NamedTemporaryFile(suffix=".pdf")
-        self.content = TEMPLATE
-
-    def generate_chapters(self) -> dict[int, str]:
-        """Return a dictionary of part_ids to content trees, with image URLs replaced with base64 encoded images if provided during initialization."""
-        data: dict[int, str] = {}
-        for idx, (part, tree) in enumerate(zip(self.story["parts"], self.parts)):
-            if self.images:
-                for img_idx, (img_data, img_tag) in enumerate(
-                    zip(self.images[idx], tree.find_all("img"))
-                ):
-                    img_tag[
-                        "src"
-                    ] = f"data:image/jpg;base64,{b64encode(img_data).decode()}"
-
-            data[part["id"]] = tree.prettify()
-
-        return data
-
-    def populate_template(self, parts: dict[int, str]):
-        """Populate HTML Template with Story data."""
-        copyright = COPYRIGHT_DATA[self.story["copyright"]]
-        data = {
-            "statement": copyright["statement"].format(
-                username=self.story["user"]["username"],
-                published_year=self.story["createDate"].split("-", 2)[0],
-            ),
-            "author": self.story["user"]["username"],
-            "freedoms": copyright["freedoms"],
-            "printing": copyright["printing"],
-            "book_id": self.story["id"],
-            "book_title": self.story["title"],
-            "cover": f"data:image/jpg;base64,{b64encode(self.cover).decode()}",
-            "username": self.story["user"]["username"],
-            "description": self.story["description"],
-            "avatar": b64encode(self.author).decode(),
-            "copyright": {
-                "data": b64encode(copyright["asset"].read_bytes()).decode()
-                if copyright["asset"]
-                else "",
-                "name": copyright["name"],
-            },
-            "parts": parts,
-        }
-
-        self.content: str = Template(self.content).render(data)
-
-    def generate_pdf(self):
-        """Generate and write the PDF to a temporary file (self.book)."""
-        font_config = FontConfiguration()
-
-        stylesheet_obj = CSS(string=STYLESHEET, font_config=font_config)
-
-        html_obj = HTML(string=self.content)
-        html_obj.write_pdf(
-            self.book.name, stylesheets=[stylesheet_obj], font_config=font_config
-        )
-
-    def add_metadata(self):
-        """Write metadata to generated PDF file at self.book, using ExifTool."""
-
-        clean_description = (
-            self.story["description"].strip().replace("\n", "$/")
-        )  # exiftool doesn't parse \ns correctly, they support $/ for the same instead. `&#xa;` is another option.
-
-        metadata = {
-            "Author": self.story["user"]["username"],
-            "Title": self.story["title"],
-            "Subject": clean_description,
-            "CreationDate": self.story["createDate"],
-            "ModDate": self.story["modifyDate"],
-            "Keywords": ",".join(self.story["tags"]),
-            "Language": self.story["language"]["name"],
-            "Completed": self.story["completed"],
-            "MatureContent": self.story["mature"],
-            "Producer": "Dhanush Rambhatla (TheOnlyWayUp - https://rambhat.la) and WattpadDownloader",
-        }  # As per https://exiftool.org/TagNames/PDF.html
-
-        with ExifTool(config_file=DATA_PATH / "exiftool.config") as et:
-            # Custom configuration adds Completed and MatureContent tags.
-            # exiftool logger logs executed command
-            et.execute(
-                *(
-                    [f"-{key}={value}" for key, value in metadata.items()]
-                    + [
-                        "-overwrite_original",
-                        self.book.file.name,
-                    ]
-                )
-            )
-
-    def compile(self):
-        parts = self.generate_chapters()
-        self.populate_template(parts)
-        self.generate_pdf()
-        self.add_metadata()
-        return True
-
-    def dump(self) -> BytesIO:
-        self.book.seek(0)
-        buffer = BytesIO(self.book.read())
-        self.book.close()
-
-        return buffer
@@ -1,288 +1,204 @@
-import tempfile
 from base64 import b64encode
 from io import BytesIO
-from typing import List, cast
+from pathlib import Path
+from tempfile import NamedTemporaryFile, _TemporaryFileWrapper
+from typing import Generator, List, cast

-import bs4
-from bs4 import BeautifulSoup
-from eliot import start_action
+from bs4 import BeautifulSoup, Tag
 from exiftool import ExifTool
+from jinja2 import Template
 from weasyprint import CSS, HTML
 from weasyprint.text.fonts import FontConfiguration

-from ..logs import exiftool_logger
 from ..models import Story
-from ..utils import smart_trim
+from .types import AbstractGenerator
+
+DATA_PATH = Path(__file__).parent / "pdf"  # directory next to this module; stylesheet.css and book.html are read from it
+ASSET_PATH = DATA_PATH / "assets"  # license badge images referenced by COPYRIGHT_DATA
+
+COPYRIGHT_DATA = {
+    1: {
+        "name": "All Rights Reserved",
+        "statement": "©️ {published_year} by {username}. All Rights Reserved.",
+        "freedoms": "No reuse, redistribution, or modification without permission.",
+        "printing": "Not allowed without explicit permission.",
+        "asset": None,  # no badge image for this license
+    },
+    2: {
+        "name": "Public Domain",
+        "statement": "This work is in the public domain. Originally published in {published_year} by {username}.",
+        "freedoms": "Free to use for any purpose without permission.",
+        "printing": "Allowed for personal or commercial purposes.",
+        "asset": ASSET_PATH / "cc-zero.png",
+    },
+    3: {
+        "name": "Creative Commons Attribution (CC-BY)",
+        "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution 4.0 International License.",
+        "freedoms": "Allows reuse, redistribution, and modification with credit to the author.",
+        "printing": "Allowed with proper credit.",
+        "asset": ASSET_PATH / "by.png",
+    },
+    4: {
+        "name": "CC Attribution NonCommercial (CC-BY-NC)",
+        "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.",
+        "freedoms": "Allows reuse and modification for non-commercial purposes with credit.",
+        "printing": "Allowed for non-commercial purposes with proper credit.",
+        "asset": ASSET_PATH / "by-nc.png",
+    },
+    5: {
+        "name": "CC Attribution NonCommercial NoDerivs (CC-BY-NC-ND)",
+        "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NonCommercial-NoDerivs 4.0 International License.",
+        "freedoms": "Allows sharing in original form for non-commercial purposes with credit; no modifications allowed.",
+        "printing": "Allowed for non-commercial purposes in original form with proper credit.",
+        "asset": ASSET_PATH / "by-nc-nd.png",
+    },
+    6: {
+        "name": "CC Attribution NonCommercial ShareAlike (CC-BY-NC-SA)",
+        "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.",
+        "freedoms": "Allows reuse and modification for non-commercial purposes under the same license, with credit.",
+        "printing": "Allowed for non-commercial purposes with proper credit under the same license.",
+        "asset": ASSET_PATH / "by-nc-sa.png",
+    },
+    7: {
+        "name": "CC Attribution ShareAlike (CC-BY-SA)",
+        "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.",
+        "freedoms": "Allows reuse and modification for any purpose under the same license, with credit.",
+        "printing": "Allowed with proper credit under the same license.",
+        "asset": ASSET_PATH / "by-sa.png",
+    },
+    8: {
+        "name": "CC Attribution NoDerivs (CC-BY-ND)",
+        "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NoDerivs 4.0 International License.",
+        "freedoms": "Allows sharing in original form for any purpose with credit; no modifications allowed.",
+        "printing": "Allowed in original form with proper credit.",
+        "asset": ASSET_PATH / "by-nd.png",
+    },
+}  # Maps Wattpad Copyright IDs to their corresponding data.
+
+with open(DATA_PATH / "stylesheet.css", encoding="utf-8") as reader:  # explicit encoding: the default is locale-dependent (assumes the file is UTF-8 - TODO confirm)
+    STYLESHEET = reader.read()


-async def fetch_image(*args, **kwargs):
-    # TODO
-    raise NotImplementedError()
+with open(DATA_PATH / "book.html", encoding="utf-8") as reader:  # same reasoning as for the stylesheet
+    TEMPLATE = reader.read()


-class PDFGenerator:
-    """PDF Generation utilities"""
-
-    def __init__(self, data: Story, cover: bytes):
-        """Initialize PDGenerator, create PDF Temporary file."""
-        self.data = data
-        self.file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=True)
+class PDFGenerator(AbstractGenerator):
+    def __init__(
+        self,
+        metadata: Story,
+        part_trees: List[BeautifulSoup],
+        cover: bytes,
+        images: List[Generator[bytes, None, None]] | None,  # indexed by part; all three params spelled out because Generator[bytes] raises TypeError before Python 3.13
+        author_image: bytes,  # stored as self.author and embedded base64-encoded as the template's "avatar"
+    ):
+        self.story = metadata
+        self.parts = part_trees
        self.cover = cover
-        self.content: str = ""
-        self.copyright = {
-            1: {
-                "name": "All Rights Reserved",
-                "statement": "©️ {published_year} by {username}. All Rights Reserved.",
-                "freedoms": "No reuse, redistribution, or modification without permission.",
-                "printing": "Not allowed without explicit permission.",
-                "image_url": None,
-            },
-            2: {
-                "name": "Public Domain",
-                "statement": "This work is in the public domain. Originally published in {published_year} by {username}.",
-                "freedoms": "Free to use for any purpose without permission.",
-                "printing": "Allowed for personal or commercial purposes.",
-                "image_url": "http://mirrors.creativecommons.org/presskit/buttons/88x31/png/cc-zero.png",
-            },
-            3: {
-                "name": "Creative Commons Attribution (CC-BY)",
-                "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution 4.0 International License.",
-                "freedoms": "Allows reuse, redistribution, and modification with credit to the author.",
-                "printing": "Allowed with proper credit.",
-                "image_url": "https://mirrors.creativecommons.org/presskit/buttons/88x31/png/by.png",
-            },
-            4: {
-                "name": "CC Attribution NonCommercial (CC-BY-NC)",
-                "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.",
-                "freedoms": "Allows reuse and modification for non-commercial purposes with credit.",
-                "printing": "Allowed for non-commercial purposes with proper credit.",
-                "image_url": "http://mirrors.creativecommons.org/presskit/buttons/88x31/png/by-nc.png",
-            },
-            5: {
-                "name": "CC Attribution NonCommercial NoDerivs (CC-BY-NC-ND)",
-                "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NonCommercial-NoDerivs 4.0 International License.",
-                "freedoms": "Allows sharing in original form for non-commercial purposes with credit; no modifications allowed.",
-                "printing": "Allowed for non-commercial purposes in original form with proper credit.",
-                "image_url": "http://mirrors.creativecommons.org/presskit/buttons/88x31/png/by-nc-nd.png",
-            },
-            6: {
-                "name": "CC Attribution NonCommercial ShareAlike (CC-BY-NC-SA)",
-                "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.",
-                "freedoms": "Allows reuse and modification for non-commercial purposes under the same license, with credit.",
-                "printing": "Allowed for non-commercial purposes with proper credit under the same license.",
-                "image_url": "http://mirrors.creativecommons.org/presskit/buttons/88x31/png/by-nc-sa.png",
-            },
-            7: {
-                "name": "CC Attribution ShareAlike (CC-BY-SA)",
-                "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.",
-                "freedoms": "Allows reuse and modification for any purpose under the same license, with credit.",
-                "printing": "Allowed with proper credit under the same license.",
-                "image_url": "https://mirrors.creativecommons.org/presskit/buttons/88x31/png/by-sa.png",
-            },
-            8: {
-                "name": "CC Attribution NoDerivs (CC-BY-ND)",
-                "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NoDerivs 4.0 International License.",
-                "freedoms": "Allows sharing in original form for any purpose with credit; no modifications allowed.",
-                "printing": "Allowed in original form with proper credit.",
-                "image_url": "https://mirrors.creativecommons.org/presskit/buttons/88x31/png/by-nd.png",
+        self.images = images
+        self.author = author_image
+
+        self.book: _TemporaryFileWrapper = NamedTemporaryFile(suffix=".pdf")
+        self.content = TEMPLATE
+
+    def generate_chapters(self) -> dict[int, str]:
+        """Return a mapping of part id to prettified part HTML, with each <img> src replaced by a base64 data URI when images were provided during initialization."""
+        data: dict[int, str] = {}
+        for idx, (part, tree) in enumerate(zip(self.story["parts"], self.parts)):
+            if self.images:
+                # Image payloads are paired with <img> tags in the order find_all() returns them;
+                # zip() stops at the shorter side, so any unmatched tag keeps its original src.
+                for img_data, img_tag in zip(self.images[idx], tree.find_all("img")):
+                    img_tag["src"] = (
+                        f"data:image/jpg;base64,{b64encode(img_data).decode()}"
+                    )
+
+            data[part["id"]] = tree.prettify()
+
+        return data
+
+    def populate_template(self, parts: dict[int, str]):
+        """Render the Jinja2 template held in self.content with the story's metadata, copyright details, cover, avatar and the given parts, storing the result back in self.content."""
+        copyright = COPYRIGHT_DATA[self.story["copyright"]]  # KeyError if the id is not a COPYRIGHT_DATA key
+        data = {
+            "statement": copyright["statement"].format(
+                username=self.story["user"]["username"],
+                published_year=self.story["createDate"].split("-", 2)[0],  # text before the first "-"; presumably the year - confirm createDate format
+            ),
+            "author": self.story["user"]["username"],
+            "freedoms": copyright["freedoms"],
+            "printing": copyright["printing"],
+            "book_id": self.story["id"],
+            "book_title": self.story["title"],
+            "cover": f"data:image/jpg;base64,{b64encode(self.cover).decode()}",
+            "username": self.story["user"]["username"],
+            "description": self.story["description"],
+            "avatar": b64encode(self.author).decode(),
+            "copyright": {
+                "data": b64encode(copyright["asset"].read_bytes()).decode()  # badge image read from disk, base64-encoded
+                if copyright["asset"]
+                else "",  # licenses whose "asset" is None get an empty string
+                "name": copyright["name"],
            },
+            "parts": parts,
        }

-        with open("./pdf/stylesheet.css") as reader:
-            self.stylesheet = reader.read()
-        with open("./pdf/book.html") as reader:
-            self.template = reader.read()
+        self.content: str = Template(self.content).render(data)

-    async def generate_cover_and_copyright_html(
-        self,
-    ) -> str:
-        """Generate Cover and Copyright file, fetch copyright image (cached), use self.cover for cover."""
+    def generate_pdf(self):
+        """Generate and write the PDF to a temporary file (self.book).
+
+        Renders self.content as HTML, styled with STYLESHEET, and writes the
+        result to self.book.name.
+        """
+        # One FontConfiguration instance is shared by CSS() and write_pdf() below.
+        font_config = FontConfiguration()

-        copyright_data = self.copyright[self.data["copyright"]]
+        stylesheet_obj = CSS(string=STYLESHEET, font_config=font_config)

-        template = self.template
-        about_copyright = (
-            template.replace(
-                "{statement}",
-                copyright_data["statement"].format(
-                    username=self.data["user"]["username"],
-                    published_year=self.data["createDate"].split("-", 2)[0],
-                ),
-            )
-            .replace("{author}", self.data["user"]["username"])
-            .replace("{freedoms}", copyright_data["freedoms"])
-            .replace(
-                "{printing}",
-                copyright_data["printing"],
-            )
-            .replace("{book_id}", self.data["id"])
-            .replace("{book_title}", self.data["title"])
+        html_obj = HTML(string=self.content)
+        html_obj.write_pdf(
+            self.book.name, stylesheets=[stylesheet_obj], font_config=font_config
        )

-        copyright_image = (
-            await fetch_image(copyright_data["image_url"], should_cache=True)
-            if copyright_data["image_url"]
-            else None
-        )
-        image_block = (
-            """<img src="{image_url}" 
-alt="{name}" 
-width="88" 
-height="31" 
-id="copyright-license-image">""".format(
-                image_url=f"data:image/jpg;base64,{b64encode(copyright_image).decode()}",
-                name=copyright_data["name"],
-            )
-            if copyright_image
-            else ""
-        )
-        about_copyright = (
-            about_copyright.replace(
-                "{copyright_image}",
-                image_block,
-            )
-            if image_block
-            else about_copyright.replace("{copyright_image}", "")
-        )
-        about_copyright = about_copyright.replace(
-            "{cover}", f"data:image/jpg;base64,{b64encode(self.cover).decode()}"
-        )
+    def add_metadata(self):
+        """Write metadata to generated PDF file at self.book, using ExifTool.
+
+        Collects author, title, description, dates, tags, language and the
+        completed/mature values from self.story and passes them to ExifTool
+        as -Key=value arguments together with -overwrite_original.
+        """

-        self.template = about_copyright
-        return about_copyright
+        clean_description = (
+            self.story["description"].strip().replace("\n", "$/")
+        )  # exiftool doesn't parse \ns correctly, they support $/ for the same instead. `&#xa;` is another option.

-    async def generate_about_author_chapter(self) -> str:
-        """Generate About the Author file, fetch avatar."""
-        author_avatar = (
-            await fetch_image(
-                self.data["user"]["avatar"].replace("128", "512")
-            )  # Increase image resolution
-            if self.data["user"]["avatar"]
-            else None
-        )
-        about_author = self.template.replace(
-            "{username}", self.data["user"]["username"]
-        ).replace("{description}", smart_trim(self.data["user"]["description"]))
+        metadata = {
+            "Author": self.story["user"]["username"],
+            "Title": self.story["title"],
+            "Subject": clean_description,
+            "CreationDate": self.story["createDate"],
+            "ModDate": self.story["modifyDate"],
+            "Keywords": ",".join(self.story["tags"]),
+            "Language": self.story["language"]["name"],
+            "Completed": self.story["completed"],
+            "MatureContent": self.story["mature"],
+            "Producer": "Dhanush Rambhatla (TheOnlyWayUp - https://rambhat.la) and WattpadDownloader",
+        }  # As per https://exiftool.org/TagNames/PDF.html

-        about_author = (
-            about_author.replace(
-                "{avatar}",
-                f"""
-                <img src="data:image/jpg;base64,{b64encode(author_avatar).decode()}" alt="Author's profile picture" id="author-profile-picture">""",
-            )
-            if author_avatar
-            else about_author.replace("{avatar}", "")
-        )
-
-        self.template = about_author
-        return about_author
-
-    def generate_toc(self):
-        ids = [part["id"] for part in self.data["parts"]]
-        clean = BeautifulSoup(
-            """
-        <section id="contents" class="toc">
-        <h1>Table of Contents</h1>
-        <ul></ul>
-        </section>
-        """,
-            "html.parser",
-        )  # html.parser doesn't create <html>/<body> tags automatically
-
-        ul = cast(bs4.Tag, clean.find("ul"))
-        for part_id in ids:
-            li = clean.new_tag("li")
-            a = clean.new_tag("a")
-            a["href"] = f"#{part_id}"
-            li.append(a)
-            ul.append(li)
-
-        insert_point = cast(bs4.Tag, self.tree.find("div", {"id": "book"}))
-        insert_point.append(clean)
-        return str(clean)
-
-    async def add_chapters(
-        self, contents: List[bs4.Tag], download_images: bool = False
-    ):
-        """Add chapters to the PDF, downloading images if necessary. Also add Cover, Copyright, and About the Author pages."""
-
-        # # Cover and Copyright Page
-        await self.generate_cover_and_copyright_html()
-        await self.generate_about_author_chapter()
-        self.tree = BeautifulSoup(self.template, "lxml")
-
-        self.generate_toc()
-        for part, content in zip(self.data["parts"], contents):
-            insert_point = cast(bs4.Tag, self.tree.find("div", {"id": "book"}))
-            insert_point.append(content)
-
-            yield part["title"]
-
-        # # About the Author page
-        # about_author_html = await self.generate_about_author_chapter()
-
-        # chapters.insert(0, cover_and_copyright_html)
-        # chapters.append(about_author_html)
-
-        with start_action(
-            action_type="generate_pdf",
-            output_filename=self.file.name,
-            title=self.data["title"],
-        ):
-            # PDF Generation with wkhtmltopdf, written to self.file
-
-            # At this stage, we have a bunch of HTML Files representing all the chapters that need to be generated. PDFKit handles ToC generation, so that's not included.
-
-            font_config = FontConfiguration()
-
-            stylesheet_obj = CSS(string=self.stylesheet, font_config=font_config)
-
-            html_obj = HTML(string=str(self.tree))
-            html_obj.write_pdf(
-                self.file.name, stylesheets=[stylesheet_obj], font_config=font_config
-            )
-
-        with start_action(action_type="add_metadata") as action:
-            # Metadata generation with Exiftool
-            clean_description = (
-                self.data["description"].strip().replace("\n", "$/")
-            )  # exiftool doesn't parse \ns correctly, they support $/ for the same instead. `&#xa;` is another option.
-
-            action.log(f"clean_description: {clean_description}")
-
-            metadata = {
-                "Author": self.data["user"]["username"],
-                "Title": self.data["title"],
-                "Subject": clean_description,
-                "CreationDate": self.data["createDate"],
-                "ModDate": self.data["modifyDate"],
-                "Keywords": ",".join(self.data["tags"]),
-                "Language": self.data["language"]["name"],
-                "Completed": self.data["completed"],
-                "MatureContent": self.data["mature"],
-                "Producer": "Dhanush Rambhatla (TheOnlyWayUp - https://rambhat.la) and WattpadDownloader",
-            }  # As per https://exiftool.org/TagNames/PDF.html
-
-            action.log(f"options: {metadata}")
-
-            with ExifTool(
-                config_file="../exiftool.config", logger=exiftool_logger
-            ) as et:
-                # Custom configuration adds Completed and MatureContent tags.
-                # exiftool logger logs executed command
-                et.execute(
-                    *(
-                        [f"-{key}={value}" for key, value in metadata.items()]
-                        + [
-                            "-overwrite_original",
-                            self.file.file.name,
-                        ]
-                    )
+        with ExifTool(config_file=DATA_PATH / "exiftool.config") as et:
+            # Custom configuration adds Completed and MatureContent tags.
+            # NOTE(review): no logger is passed to ExifTool here (the call this replaces passed one), so command logging may no longer happen -- confirm.
+            et.execute(
+                *(
+                    # Values are interpolated as-is, so non-string values end up in their str() form.
+                    [f"-{key}={value}" for key, value in metadata.items()]
+                    + [
+                        "-overwrite_original",
+                        # NOTE(review): generate_pdf writes to self.book.name while this targets self.book.file.name -- confirm both name the same file.
+                        self.book.file.name,
+                    ]
                )
+            )
+
+    def compile(self):
+        """Build the finished PDF by running every generation step in order.
+
+        Chapters are generated and fed to the template, the rendered template
+        is written out as a PDF, and the PDF is then tagged with metadata.
+
+        Returns:
+            Always True.
+        """
+        self.populate_template(self.generate_chapters())
+        self.generate_pdf()
+        self.add_metadata()
+        return True

    def dump(self) -> BytesIO:
+        """Return the generated file's contents as an in-memory buffer.
+
+        Rewinds self.book, copies everything it holds into a new BytesIO and
+        closes self.book, so the file cannot be read again through it
+        afterwards.
+
+        Returns:
+            A BytesIO holding the full contents of self.book.
+        """
-        self.file.seek(0)
-        buffer = BytesIO(self.file.read())
-        self.file.close()
+        self.book.seek(0)
+        buffer = BytesIO(self.book.read())
+        self.book.close()

        return buffer
@@ -1,6 +1,6 @@
 from io import BytesIO
 from tempfile import _TemporaryFileWrapper
-from typing import List, Literal
+from typing import Generator, List, Literal

 from bs4 import BeautifulSoup
 from ebooklib.epub import EpubBook
@@ -23,7 +23,7 @@ class AbstractGenerator:
        metadata: Story,
        part_trees: List[BeautifulSoup],
        cover: bytes,
-        images: List[List[bytes]] | None,
+        images: List[Generator[bytes]] | None,
    ):
        self.story = metadata
        self.parts = part_trees
@@ -7,6 +7,7 @@ from typing import Optional
 from zipfile import ZipFile

 from aiohttp import ClientResponseError
+from bs4 import BeautifulSoup
 from eliot import start_action
 from fastapi import FastAPI, HTTPException, Request
 from fastapi.responses import (
@@ -19,6 +20,7 @@ from fastapi.staticfiles import StaticFiles

 from create_book import (
    EPUBGenerator,
+    PDFGenerator,
    StoryNotFoundError,
    WattpadError,
    fetch_cookies,
@@ -26,10 +28,10 @@ from create_book import (
    fetch_story,
    fetch_story_content_zip,
    fetch_story_from_partId,
-    generate_clean_part_html,
    logger,
    slugify,
 )
+from create_book.parser import clean_tree, download_tree_images

 app = FastAPI()
 BUILD_PATH = Path(__file__).parent / "build"
@@ -73,7 +75,7 @@ app.add_middleware(RequestCancelledMiddleware)


 class DownloadFormat(Enum):
+    """Output formats that handle_download can build."""
-    # pdf = "pdf"
+    pdf = "pdf"
    epub = "epub"


@@ -169,30 +171,36 @@ async def handle_download(
        if not cover_data:
            raise HTTPException(status_code=422)

-        match format:
-            case DownloadFormat.epub:
-                book = EPUBGenerator(metadata, cover_data)
-                media_type = "application/epub+zip"
-            # case DownloadFormat.pdf:
-            #     book = PDFGenerator(metadata, cover_data)
-            #     media_type = "application/pdf"
-
-        logger.info(f"Retrieved story metadata and cover ({story_id=})")
-
        story_zip = await fetch_story_content_zip(story_id, cookies)
        archive = ZipFile(story_zip, "r")

-        part_contents = [
-            generate_clean_part_html(
-                part, archive.read(str(part["id"])).decode("utf-8")
+        part_trees: list[BeautifulSoup] = [
+            clean_tree(
+                part["title"], part["id"], archive.read(str(part["id"])).decode("utf-8")
            )
            for part in metadata["parts"]
        ]

-        async for title in book.add_chapters(
-            part_contents, download_images=download_images
-        ):
-            ...
+        if download_images:
+            images = [await download_tree_images(tree) for tree in part_trees]
+
+        match format:
+            case DownloadFormat.epub:
+                book = EPUBGenerator(metadata, part_trees, cover_data, images)
+                media_type = "application/epub+zip"
+            case DownloadFormat.pdf:
+                author_image = await fetch_image(
+                    metadata["user"]["avatar"].replace("-256-", "-512-")
+                )
+                if not author_image:
+                    raise HTTPException(status_code=422)
+
+                book = PDFGenerator(
+                    metadata, part_trees, cover_data, images, author_image
+                )
+                media_type = "application/pdf"
+
+        logger.info(f"Retrieved story metadata and cover ({story_id=})")

        book_buffer = book.dump()