feat(api): Add parsing engine (#68 - @theonlywayup, @aaronbendaniel)

Add parsing engine
2025-06-11 13:55:20 +05:30
parent d6095bb122 551e91cb7f
commit d3b06f5d21
38 changed files with 1337 additions and 1657 deletions
@@ -1,8 +1,8 @@
 __pycache__
 venv
 *epub
-*pdf
+*.pdf
-*html
+# *html
 data
 *ipynb
 build
@@ -9,7 +9,7 @@ COPY src/frontend/. .
 RUN npm run build
 # Thanks https://stackoverflow.com/q/76988450
-FROM python:3.10-slim
+FROM python:3.13-slim
 WORKDIR /app
@@ -38,7 +38,7 @@ WORKDIR /app
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 COPY src/api/requirements.txt requirements.txt
-COPY src/api/exiftool.config exiftool.config
+COPY src/api/src/create_book/generators/pdf/exiftool.config exiftool.config
 RUN uv pip install -r requirements.txt --system
 COPY --from=0 /build/build /app/src/build
 COPY src/api/src src
@@ -1 +1 @@
-3.10
+3.13
@@ -3,7 +3,7 @@ name = "api"
 version = "0.1.0"
 description = "Wattpad Downloader API"
 readme = "../../README.md"
-requires-python = ">=3.10"
+requires-python = ">=3.13"
 dependencies = [
    "aiohttp>=3.9.1",
    "rich>=13.9.4",
@@ -19,10 +19,17 @@ dependencies = [
    "uvicorn>=0.32.1",
    "pyexiftool>=0.5.6",
    "weasyprint>=63.0",
    "jinja2>=3.1.6",
 ]
 [tool.ruff.lint]
-ignore = ['E402']
+ignore = ['E402'] # module import not at top of file
 [tool.uv.sources]
-aiohttp-client-cache = { git = "https://github.com/TheOnlyWayUp/aiohttp-client-cache.git", rev = "keydb-ttl" }
+aiohttp-client-cache = { git = "https://github.com/TheOnlyWayUp/aiohttp-client-cache.git", rev = "keydb-ttl" } # Fork which leverages keydb's EXPIREMEMBER feature for TTLs on Hash members.
 [dependency-groups]
 dev = [
    "ipykernel>=6.29.5",
    "ruff>=0.11.12",
 ]
@@ -31,6 +31,7 @@ frozenlist==1.4.1
 h11==0.14.0
 idna==3.6
 itsdangerous==2.2.0
 jinja2==3.1.6
 jmespath==1.0.1
 lxml==5.3.0
 markdown-it-py==3.0.0
@@ -1,782 +0,0 @@
 from __future__ import annotations
 from typing import List, Optional, Tuple, cast
 from typing_extensions import TypedDict
 import re
 import logging
 import tempfile
 import unicodedata
 from os import environ
 from io import BytesIO
 from enum import Enum
 from base64 import b64encode
 import bs4
 import backoff
 from weasyprint import HTML, CSS, default_url_fetcher
 from weasyprint.text.fonts import FontConfiguration
 from ebooklib import epub
 from exiftool import ExifTool
 from eliot import to_file, start_action
 from eliot.stdlib import EliotHandler
 from bs4 import BeautifulSoup
 from dotenv import load_dotenv
 from pydantic import TypeAdapter, model_validator, field_validator
 from pydantic_settings import BaseSettings
 from aiohttp import ClientResponseError
 from aiohttp_client_cache.session import CachedSession
 from aiohttp_client_cache import FileBackend, RedisBackend
 # Module bootstrap: load environment variables and wire stdlib logging into Eliot.
 # override=True is passed so values from the .env file replace already-set variables.
 load_dotenv(override=True)
 # A single Eliot handler instance is shared by every logger configured below.
 handler = EliotHandler()
 logging.getLogger("fastapi").setLevel(logging.INFO)
 logging.getLogger("fastapi").addHandler(handler)
 # Passed to ExifTool(logger=...) by PDFGenerator.
 exiftool_logger = logging.getLogger("exiftool")
 exiftool_logger.addHandler(handler)
 # NOTE(review): this instantiates logging.Logger directly instead of calling
 # logging.getLogger("wpd"), so the logger is not registered with the logging
 # manager. Presumably deliberate (it does not inherit the root level) — confirm.
 logger = logging.Logger("wpd")
 logger.addHandler(handler)
 # Eliot output is written to eliot.log only when DEBUG is set to a non-empty value.
 # The file object is handed to Eliot and is not closed in this module.
 if environ.get("DEBUG"):
    to_file(open("eliot.log", "wb"))
 # --- #
 class CacheTypes(Enum):
    """Cache backends that Config.CACHE_TYPE may select."""

    # Selects aiohttp_client_cache.FileBackend when the module-level cache is built.
    file = "file"
    # Selects aiohttp_client_cache.RedisBackend; requires REDIS_CONNECTION_URL.
    redis = "redis"
 class Config(BaseSettings):
    """Cache-related settings, declared as a pydantic-settings model.

    Empty-string values for USE_CACHE and CACHE_TYPE are replaced by their
    defaults before validation, and the Redis URL must be present exactly
    when the Redis cache type is selected.
    """

    USE_CACHE: bool = True
    CACHE_TYPE: CacheTypes = CacheTypes.file
    REDIS_CONNECTION_URL: str = ""

    @field_validator("USE_CACHE", mode="before")
    def validate_use_cache(cls, value):
        """Fall back to the default (True) when the raw value is an empty string."""
        return True if value == "" else value

    @field_validator("CACHE_TYPE", mode="before")
    def validate_cache_type(cls, value):
        """Fall back to the default ("file") when the raw value is an empty string."""
        # Thanks https://stackoverflow.com/a/78157474
        return "file" if value == "" else value

    @model_validator(mode="after")
    def prevent_mismatched_redis_url(self):
        """Reject a Redis URL without the Redis cache type, and vice versa.

        Raises:
            ValueError: The URL and the selected cache type disagree.
        """
        url_given = bool(self.REDIS_CONNECTION_URL)
        if self.CACHE_TYPE == CacheTypes.file and url_given:
            raise ValueError(
                "REDIS_CONNECTION_URL provided when File cache selected. To use Redis as a cache, set CACHE_TYPE=redis."
            )
        if self.CACHE_TYPE == CacheTypes.redis and not url_given:
            raise ValueError(
                "REDIS_CONNECTION_URL not provided when Redis cache selected. To use File cache, set CACHE_TYPE=file."
            )
        return self
 # Instantiated at import time; Config's validators raise here if the settings are inconsistent.
 config = Config()
 # --- #
 # Default request headers handed to every CachedSession created in this module.
 headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
 }
 # Build the shared response-cache backend once; `cache` stays None when caching is disabled.
 if config.USE_CACHE:
    match config.CACHE_TYPE:
        case CacheTypes.file:
            # use_temp=True presumably keeps the cache under the system temp directory — confirm.
            cache = FileBackend(use_temp=True, expire_after=43200)  # 12 hours
        case CacheTypes.redis:
            cache = RedisBackend(
                cache_name="wpd-aiohttp-cache",
                address=config.REDIS_CONNECTION_URL,
                expire_after=43200,  # 12 hours
            )
 else:
    cache = None
 logger.info(f"Using {cache=}")
 # --- Utilities --- #
 def smart_trim(text: str, max_length: int = 400) -> str:
    """Truncate text at line boundaries and join the kept lines with ``<br />``.

    Non-empty lines are taken in order for as long as the joined output stays
    shorter than ``max_length``; the first line that does not fit ends the
    scan, so the result never stops in the middle of a line.

    Args:
        text: Source text; lines are separated by ``"\\n"`` and blank lines are skipped.
        max_length: Exclusive upper bound on the running length of the output.

    Returns:
        The kept lines joined by ``<br />`` with no trailing separator. An empty
        string is returned when the input has no non-empty lines, or when the
        first line alone is already ``max_length`` characters or longer.
    """
    separator = "<br />"
    kept: list[str] = []
    used = 0  # length of separator.join(kept) so far
    for chunk in text.split("\n"):
        if not chunk:
            continue
        # Every line after the first also pays for the separator placed before it.
        cost = len(chunk) + (len(separator) if kept else 0)
        if used + cost >= max_length:
            break
        kept.append(chunk)
        used += cost
    # Joining (instead of appending a separator and stripping it afterwards)
    # cannot remove characters that belong to the text itself.
    return separator.join(kept)
 def generate_clean_part_html(part: Part, content: str) -> bs4.Tag:
    """Rebuild the HTML of a Part into a clean ``<section>`` element.

    A skeleton ``<section>`` holding an ``<h1>`` title is created, then every
    ``<p>`` of the raw markup is copied into it: text nodes and ``<b>``/``<i>``
    are recreated, ``<br>`` is dropped, ``<img>`` is re-emitted directly under
    the section (followed by a ``<br>``), and any other tag is moved over as-is.

    Args:
        part: Part metadata; ``part["id"]`` and ``part["title"]`` are read.
        content: Raw HTML of the part.

    Returns:
        The populated ``<section>`` tag.

    Raises:
        RuntimeError: The skeleton markup could not be located after parsing.
    """
    chapter_title = part["title"]
    chapter_id = part["id"]
    clean = BeautifulSoup(
        f"""
    <section id="section_{chapter_id}" class="chapitre">
        <h1 id="{chapter_id}" class="chapter-title"></h1>
    </section>
    """,
        "html.parser",
    )  # html.parser doesn't create <html>/<body> tags automatically
    section = cast(bs4.Tag, clean.find("section"))
    heading = cast(bs4.Tag, clean.find("h1"))
    if section is None or heading is None:
        raise RuntimeError("Could not build the section skeleton for a part.")
    # The title is attached as a text node rather than interpolated into the
    # markup, so characters such as "<" or "&" in a title are kept as text
    # instead of being parsed as HTML.
    heading.string = chapter_title

    html = BeautifulSoup(content, "lxml")
    for br in html.find_all("br"):
        # Check if no content after br
        if not br.next_sibling or br.next_sibling.name in ["br", None]:
            br.decompose()

    for child in html.find_all("p"):
        current_paragraph = clean.new_tag("p")
        # Attempt to carry over paragraph styling
        current_paragraph["style"] = child.get("style", "text-align: left;")
        # Iterate over a copy: children are decomposed or moved during the loop.
        for p_child in list(child.children):
            if not p_child:
                continue
            if isinstance(p_child, bs4.element.Tag):
                if p_child.name == "br":
                    p_child.decompose()
                elif p_child.name == "img":
                    # Images go directly under the section, not inside the paragraph.
                    img_tag = clean.new_tag("img")
                    img_tag["src"] = p_child["src"]
                    section.append(img_tag)
                    section.append(clean.new_tag("br"))
                elif p_child.name in ["b", "i"]:
                    # Recreate the tag with its text only; attributes and nested markup are dropped.
                    styled_tag = clean.new_tag(p_child.name)
                    styled_tag.append(clean.new_string(p_child.text))
                    current_paragraph.append(styled_tag)
                else:
                    # Append any other tags as-is
                    current_paragraph.append(p_child)
            elif isinstance(p_child, bs4.element.NavigableString):
                text_node = clean.new_string(p_child)
                current_paragraph.append(text_node)
        if current_paragraph.contents:
            section.append(current_paragraph)
        if not list(child.children):
            # Some p tags only contain brs, once brs are removed, they are empty and can be removed as well.
            child.decompose()
    return section
 def slugify(value, allow_unicode=False) -> str:
    """
    Turn an arbitrary value into a lowercase, dash-separated slug.

    The value is stringified and normalised (NFKC when 'allow_unicode' is
    true, otherwise NFKD followed by dropping every non-ASCII character).
    Anything that is not a word character, whitespace, or a hyphen is removed,
    runs of whitespace and hyphens collapse into one hyphen, and leading or
    trailing hyphens and underscores are stripped.

    Taken from https://github.com/django/django/blob/master/django/utils/text.py
    Thanks https://stackoverflow.com/a/295466.
    """
    text = str(value)
    if not allow_unicode:
        decomposed = unicodedata.normalize("NFKD", text)
        text = decomposed.encode("ascii", "ignore").decode("ascii")
    else:
        text = unicodedata.normalize("NFKC", text)
    lowered = text.lower()
    without_symbols = re.sub(r"[^\w\s-]", "", lowered)
    collapsed = re.sub(r"[-\s]+", "-", without_symbols)
    return collapsed.strip("-_")
 async def fetch_cookies(username: str, password: str) -> dict:
    # source: https://github.com/TheOnlyWayUp/WP-DM-Export/blob/dd4c7c51cb43f2108e0f63fc10a66cd24a740e4e/src/API/src/main.py#L25-L58
    """Log in to Wattpad with user credentials and return the cookies it sets.

    The login request is sent through an uncached session.

    Args:
        username (str): Username; sent lowercased.
        password (str): Password.

    Raises:
        ValueError: The response status is not 204.
        ValueError: The response carried no cookies.

    Returns:
        dict: Cookie names mapped to their values.
    """
    with start_action(action_type="api_fetch_cookies"):
        credentials = {
            "username": username.lower(),  # the username.lower() is for caching
            "password": password,
        }
        async with (
            CachedSession(headers=headers, cache=None) as session,
            session.post(
                "https://www.wattpad.com/auth/login?nextUrl=%2F&_data=routes%2Fauth.login",
                data=credentials,
            ) as response,
        ):
            if response.status != 204:
                raise ValueError("Not a 204.")
            # Thanks https://stackoverflow.com/a/32281245
            cookies = {}
            for name, morsel in response.cookies.items():
                cookies[name] = morsel.value
            if not cookies:
                raise ValueError("No cookies.")
            return cookies
 # --- Models --- #
 class CopyrightData(TypedDict):
    """Licence description with the same keys as the entries of PDFGenerator.copyright."""

    name: str  # human-readable licence name
    statement: str  # template text containing {published_year} and {username} placeholders
    freedoms: str  # summary of what reuse is permitted
    printing: str  # summary of printing permissions
    image_url: Optional[str]  # licence badge URL, or None when there is no badge
 class Language(TypedDict):
    """Language object of a Story; only ``name`` is requested via ``language(name)``."""

    name: str
 class User(TypedDict):
    """Author object of a Story, requested via ``user(username,avatar,description)``."""

    username: str
    avatar: str  # avatar image URL; fetched by PDFGenerator when non-empty
    description: str  # author bio; trimmed with smart_trim for the PDF
 class Part(TypedDict):
    """One part (chapter) of a Story, requested via ``parts(id,title)``."""

    id: int  # used for element ids, anchors and EPUB file names
    title: str
 class Story(TypedDict):
    """Story metadata; the keys mirror the ``fields=`` list sent to the Wattpad v3 API."""

    id: str
    title: str
    createDate: str  # PDFGenerator takes the text before the first "-" as the publication year
    modifyDate: str
    language: Language
    user: User
    description: str
    cover: str
    completed: bool
    tags: List[str]
    mature: bool
    url: str
    parts: List[Part]
    isPaywalled: bool
    copyright: int  # numeric licence code; PDFGenerator.copyright is keyed by it
 # Module-level adapter, built once, that the fetch_* helpers use to validate API payloads as Story.
 story_ta = TypeAdapter(Story)
 # --- Exceptions --- #
 class WattpadError(Exception):
    """Base Exception class for Wattpad related errors.

    StoryNotFoundError and, through it, PartNotFoundError derive from this class.
    """
 class StoryNotFoundError(WattpadError):
    """Raised when Wattpad reports that the requested story does not exist.

    Display the "This story was not found" error to the user.
    """
 class PartNotFoundError(StoryNotFoundError):
    """Raised when Wattpad reports that the requested story part does not exist."""
 # --- API Calls --- #
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
 async def fetch_story_from_partId(
    part_id: int, cookies: Optional[dict] = None
 ) -> Tuple[int, Story]:
    """Fetch Story metadata from a Part ID."""
    with start_action(action_type="api_fetch_storyFromPartId"):
        async with CachedSession(
            headers=headers, cache=None if cookies else cache
        ) as session:  # Don't cache requests with Cookies.
            async with session.get(
                f"https://www.wattpad.com/api/v3/story_parts/{part_id}?fields=groupId,group(tags,id,title,createDate,modifyDate,language(name),description,completed,mature,url,isPaywalled,user(username,avatar,description),parts(id,title),cover,copyright)"
            ) as response:
                body = await response.json()
                if response.status == 400:
                    match body.get("error_code"):
                        case 1020:  # "Story part not found"
                            logger.info(f"{part_id=} not found on Wattpad, returning.")
                            raise PartNotFoundError()
                response.raise_for_status()
        return int(body["groupId"]), story_ta.validate_python(body["group"])
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
async def fetch_story(story_id: int, cookies: Optional[dict] = None) -> Story:
    """Retrieve and validate the metadata of a story by its Story ID.

    Requests that carry cookies bypass the response cache. A 400 response with
    error code 1017 raises StoryNotFoundError; other error statuses raise
    ClientResponseError, which the decorator retries for up to 15 seconds.
    """
    with start_action(action_type="api_fetch_story", story_id=story_id):
        endpoint = f"https://www.wattpad.com/api/v3/stories/{story_id}?fields=tags,id,title,createDate,modifyDate,language(name),description,completed,mature,url,isPaywalled,user(username,avatar,description),parts(id,title),cover,copyright"
        backend = None if cookies else cache
        async with (
            CachedSession(headers=headers, cookies=cookies, cache=backend) as session,
            session.get(endpoint) as response,
        ):
            payload = await response.json()
            # 1017 is Wattpad's "Story not found" error code.
            if response.status == 400 and payload.get("error_code") == 1017:
                logger.info(f"{story_id=} not found on Wattpad, returning.")
                raise StoryNotFoundError()
            response.raise_for_status()
        return story_ta.validate_python(payload)
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
async def fetch_story_content_zip(
    story_id: int, cookies: Optional[dict] = None
) -> BytesIO:
    """Download the zip archive of a story's part contents into memory.

    Requests that carry cookies bypass the response cache. An error status
    raises ClientResponseError, which the decorator retries for up to 15 seconds.

    Returns:
        A BytesIO stream holding the archive bytes.
    """
    with start_action(action_type="api_fetch_storyZip", story_id=story_id):
        archive_url = f"https://www.wattpad.com/apiv2/?m=storytext&group_id={story_id}&output=zip"
        backend = None if cookies else cache
        async with (
            CachedSession(headers=headers, cookies=cookies, cache=backend) as session,
            session.get(archive_url) as response,
        ):
            response.raise_for_status()
            raw_archive = await response.read()
        return BytesIO(raw_archive)
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
async def fetch_image(url: str, should_cache: bool = False) -> bytes:
    """Download an image and return its raw bytes.

    The shared response cache is used only when ``should_cache`` is true;
    by default images are not cached. An error status raises
    ClientResponseError, which the decorator retries for up to 15 seconds.
    """
    with start_action(action_type="api_fetch_image", url=url):
        backend = cache if should_cache else None
        async with (
            CachedSession(headers=headers, cache=backend) as session,
            session.get(url) as response,
        ):
            response.raise_for_status()
            image_bytes = await response.read()
        return image_bytes
 # --- Generation --- #
 class EPUBGenerator:
    """Build an EPUB for a Story with ebooklib.

    Usage, as far as this class shows: construct with the story metadata and
    cover bytes, iterate ``add_chapters`` to completion, then call ``dump``.
    """
    def __init__(self, data: Story, cover: bytes):
        """Initialize EPUBGenerator. Create epub.EpubBook() and set metadata and cover.

        Args:
            data: Story metadata; author, title, description, dates, language,
                tags, mature and completed are written as book metadata.
            cover: Raw bytes of the cover image, stored in the book as "cover.jpg".
        """
        self.epub = epub.EpubBook()
        self.data = data
        self.cover = cover
        # set metadata, defined in https://www.dublincore.org/specifications/dublin-core/dcmi-terms/#section-2
        self.epub.add_author(data["user"]["username"])
        self.epub.add_metadata("DC", "title", data["title"])
        self.epub.add_metadata("DC", "description", data["description"])
        self.epub.add_metadata("DC", "date", data["createDate"])
        self.epub.add_metadata("DC", "modified", data["modifyDate"])
        self.epub.add_metadata("DC", "language", data["language"]["name"])
        # Story-specific values go into generic <meta name=... content=...> entries.
        self.epub.add_metadata(
            None, "meta", "", {"name": "tags", "content": ", ".join(data["tags"])}
        )
        # The booleans are serialised as "0" / "1".
        self.epub.add_metadata(
            None, "meta", "", {"name": "mature", "content": str(int(data["mature"]))}
        )
        self.epub.add_metadata(
            None,
            "meta",
            "",
            {"name": "completed", "content": str(int(data["completed"]))},
        )
        # Set cover
        self.epub.set_cover("cover.jpg", cover)
        cover_chapter = epub.EpubHtml(
            file_name="titlepage.xhtml",  # Standard for cover page
        )
        cover_chapter.set_content('<img src="cover.jpg">')
        self.epub.add_item(cover_chapter)
    async def add_chapters(
        self, contents: List[bs4.Tag], download_images: bool = False
    ):
        """Add chapters to the Epub, downloading images if necessary. Sets the table of contents and spine.

        This is an async generator: the title of each part is yielded right
        after its chapter has been added. The table of contents, NCX/Nav items
        and spine are set only after the loop, so the generator has to be
        iterated to completion before ``dump`` is called.

        Args:
            contents: One cleaned HTML tag per part, paired positionally with
                ``self.data["parts"]`` (zip stops at the shorter of the two).
            download_images: When true, every ``<img>`` with a non-empty src is
                fetched, packed into the book, and its src rewritten to the
                packed file name.

        Yields:
            The title of the part that was just added.
        """
        chapters: List[epub.EpubHtml] = []
        for cidx, (part, content) in enumerate(zip(self.data["parts"], contents)):
            title = part["title"]
            # Thanks https://eu17.proxysite.com/process.php?d=5VyWYcoQl%2BVF0BYOuOavtvjOloFUZz2BJ%2Fepiusk6Nz7PV%2B9i8rs7cFviGftrBNll%2B0a3qO7UiDkTt4qwCa0fDES&b=1
            chapter = epub.EpubHtml(
                title=title,
                file_name=f"{cidx}_{part['id']}.xhtml",  # See issue #30
                lang=self.data["language"]["name"],
                uid=str(part["id"]).encode(),
            )
            str_content = content.prettify()
            if download_images:
                soup = content
                async with CachedSession(
                    headers=headers, cache=None
                ) as session:  # Don't cache images.
                    for idx, image in enumerate(soup.find_all("img")):
                        if not image["src"]:
                            continue
                        # Find all image tags and filter for those with sources
                        # NOTE(review): the response status is not checked, and every
                        # image is stored as image/jpeg regardless of its real type.
                        async with session.get(image["src"]) as response:
                            img = epub.EpubImage(
                                media_type="image/jpeg",
                                content=await response.read(),
                                file_name=f"static/{cidx}/{idx}.jpeg",
                            )
                            self.epub.add_item(img)
                            # Fetch image and pack
                            # Plain string replacement on the serialised chapter markup.
                            str_content = str_content.replace(
                                str(image["src"]), f"static/{cidx}/{idx}.jpeg"
                            )
            chapter.set_content(str_content)
            self.epub.add_item(chapter)
            chapters.append(chapter)
            yield title
        self.epub.toc = chapters
        # Thanks https://github.com/aerkalov/ebooklib/blob/master/samples/09_create_image/create.py
        self.epub.add_item(epub.EpubNcx())
        self.epub.add_item(epub.EpubNav())
        # create spine
        self.epub.spine = ["nav"] + chapters
    def dump(self) -> BytesIO:
        """Write the book into an in-memory buffer and return it rewound to position 0."""
        # Thanks https://stackoverflow.com/a/75398222
        buffer = BytesIO()
        epub.write_epub(buffer, self.epub)
        buffer.seek(0)
        return buffer
 class PDFGenerator:
    """Build a PDF for a Story: HTML template filled in, rendered with WeasyPrint, tagged with ExifTool.

    Usage, as far as this class shows: construct with the story metadata and
    cover bytes, iterate ``add_chapters`` to completion (rendering and metadata
    writing happen after its loop), then call ``dump`` once.
    """
    def __init__(self, data: Story, cover: bytes):
        """Initialize PDFGenerator, create PDF Temporary file.

        Also builds the licence lookup table and reads the stylesheet and HTML
        template from disk.

        Args:
            data: Story metadata.
            cover: Raw bytes of the cover image.
        """
        self.data = data
        # The rendered PDF is written to this file by path; it is removed when closed (delete=True).
        self.file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=True)
        self.cover = cover
        self.content: str = ""
        # Licence table keyed by the numeric code read from data["copyright"].
        # "statement" values are str.format templates taking published_year and username.
        self.copyright = {
            1: {
                "name": "All Rights Reserved",
                "statement": "©️ {published_year} by {username}. All Rights Reserved.",
                "freedoms": "No reuse, redistribution, or modification without permission.",
                "printing": "Not allowed without explicit permission.",
                "image_url": None,
            },
            2: {
                "name": "Public Domain",
                "statement": "This work is in the public domain. Originally published in {published_year} by {username}.",
                "freedoms": "Free to use for any purpose without permission.",
                "printing": "Allowed for personal or commercial purposes.",
                "image_url": "http://mirrors.creativecommons.org/presskit/buttons/88x31/png/cc-zero.png",
            },
            3: {
                "name": "Creative Commons Attribution (CC-BY)",
                "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution 4.0 International License.",
                "freedoms": "Allows reuse, redistribution, and modification with credit to the author.",
                "printing": "Allowed with proper credit.",
                "image_url": "https://mirrors.creativecommons.org/presskit/buttons/88x31/png/by.png",
            },
            4: {
                "name": "CC Attribution NonCommercial (CC-BY-NC)",
                "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.",
                "freedoms": "Allows reuse and modification for non-commercial purposes with credit.",
                "printing": "Allowed for non-commercial purposes with proper credit.",
                "image_url": "http://mirrors.creativecommons.org/presskit/buttons/88x31/png/by-nc.png",
            },
            5: {
                "name": "CC Attribution NonCommercial NoDerivs (CC-BY-NC-ND)",
                "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NonCommercial-NoDerivs 4.0 International License.",
                "freedoms": "Allows sharing in original form for non-commercial purposes with credit; no modifications allowed.",
                "printing": "Allowed for non-commercial purposes in original form with proper credit.",
                "image_url": "http://mirrors.creativecommons.org/presskit/buttons/88x31/png/by-nc-nd.png",
            },
            6: {
                "name": "CC Attribution NonCommercial ShareAlike (CC-BY-NC-SA)",
                "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.",
                "freedoms": "Allows reuse and modification for non-commercial purposes under the same license, with credit.",
                "printing": "Allowed for non-commercial purposes with proper credit under the same license.",
                "image_url": "http://mirrors.creativecommons.org/presskit/buttons/88x31/png/by-nc-sa.png",
            },
            7: {
                "name": "CC Attribution ShareAlike (CC-BY-SA)",
                "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.",
                "freedoms": "Allows reuse and modification for any purpose under the same license, with credit.",
                "printing": "Allowed with proper credit under the same license.",
                "image_url": "https://mirrors.creativecommons.org/presskit/buttons/88x31/png/by-sa.png",
            },
            8: {
                "name": "CC Attribution NoDerivs (CC-BY-ND)",
                "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NoDerivs 4.0 International License.",
                "freedoms": "Allows sharing in original form for any purpose with credit; no modifications allowed.",
                "printing": "Allowed in original form with proper credit.",
                "image_url": "https://mirrors.creativecommons.org/presskit/buttons/88x31/png/by-nd.png",
            },
        }
        # NOTE(review): both paths are relative, so they resolve against the
        # process's current working directory; no encoding is passed to open().
        with open("./pdf/stylesheet.css") as reader:
            self.stylesheet = reader.read()
        with open("./pdf/book.html") as reader:
            self.template = reader.read()
    async def generate_cover_and_copyright_html(
        self,
    ) -> str:
        """Generate Cover and Copyright file, fetch copyright image (cached), use self.cover for cover.

        Fills the {statement}, {author}, {freedoms}, {printing}, {book_id},
        {book_title}, {copyright_image} and {cover} placeholders of the
        template, stores the result back on ``self.template`` and returns it.
        """
        # NOTE(review): raises KeyError when the story's copyright code is not one of 1-8.
        copyright_data = self.copyright[self.data["copyright"]]
        template = self.template
        about_copyright = (
            template.replace(
                "{statement}",
                copyright_data["statement"].format(
                    username=self.data["user"]["username"],
                    # Text before the first "-" of createDate is used as the year.
                    published_year=self.data["createDate"].split("-", 2)[0],
                ),
            )
            .replace("{author}", self.data["user"]["username"])
            .replace("{freedoms}", copyright_data["freedoms"])
            .replace(
                "{printing}",
                copyright_data["printing"],
            )
            .replace("{book_id}", self.data["id"])
            .replace("{book_title}", self.data["title"])
        )
        # The licence badge is the one image fetched with caching enabled.
        copyright_image = (
            await fetch_image(copyright_data["image_url"], should_cache=True)
            if copyright_data["image_url"]
            else None
        )
        # The badge is inlined as a base64 data URI.
        # NOTE(review): the URI is labelled image/jpg although the badge URLs end in .png.
        image_block = (
            """<img src="{image_url}" 
 alt="{name}" 
 width="88" 
 height="31" 
 id="copyright-license-image">""".format(
                image_url=f"data:image/jpg;base64,{b64encode(copyright_image).decode()}",
                name=copyright_data["name"],
            )
            if copyright_image
            else ""
        )
        # Both branches replace the placeholder; without a badge it becomes an empty string.
        about_copyright = (
            about_copyright.replace(
                "{copyright_image}",
                image_block,
            )
            if image_block
            else about_copyright.replace("{copyright_image}", "")
        )
        about_copyright = about_copyright.replace(
            "{cover}", f"data:image/jpg;base64,{b64encode(self.cover).decode()}"
        )
        self.template = about_copyright
        return about_copyright
    async def generate_about_author_chapter(self) -> str:
        """Generate About the Author file, fetch avatar.

        Fills the {username}, {description} and {avatar} placeholders of
        ``self.template``, stores the result back on it and returns it.
        """
        # NOTE(review): replace("128", "512") rewrites every "128" in the URL,
        # not only a size component.
        author_avatar = (
            await fetch_image(
                self.data["user"]["avatar"].replace("128", "512")
            )  # Increase image resolution
            if self.data["user"]["avatar"]
            else None
        )
        about_author = self.template.replace(
            "{username}", self.data["user"]["username"]
        ).replace("{description}", smart_trim(self.data["user"]["description"]))
        about_author = (
            about_author.replace(
                "{avatar}",
                f"""
                <img src="data:image/jpg;base64,{b64encode(author_avatar).decode()}" alt="Author's profile picture" id="author-profile-picture">""",
            )
            if author_avatar
            else about_author.replace("{avatar}", "")
        )
        self.template = about_author
        return about_author
    def generate_toc(self):
        """Append a table-of-contents section to the ``div#book`` element of ``self.tree``.

        One list item with an anchor pointing at ``#<part id>`` is created per
        part. Reads ``self.tree``, which is assigned in ``add_chapters``, so it
        cannot be called before that.

        Returns:
            The table-of-contents markup as a string.
        """
        ids = [part["id"] for part in self.data["parts"]]
        clean = BeautifulSoup(
            """
        <section id="contents" class="toc">
        <h1>Table of Contents</h1>
        <ul></ul>
        </section>
        """,
            "html.parser",
        )  # html.parser doesn't create <html>/<body> tags automatically
        ul = cast(bs4.Tag, clean.find("ul"))
        for part_id in ids:
            li = clean.new_tag("li")
            a = clean.new_tag("a")
            # The anchor gets an href but no text; presumably the stylesheet
            # supplies the visible label — confirm against stylesheet.css.
            a["href"] = f"#{part_id}"
            li.append(a)
            ul.append(li)
        insert_point = cast(bs4.Tag, self.tree.find("div", {"id": "book"}))
        insert_point.append(clean)
        return str(clean)
    async def add_chapters(
        self, contents: List[bs4.Tag], download_images: bool = False
    ):
        """Add chapters to the PDF, downloading images if necessary. Also add Cover, Copyright, and About the Author pages.

        This is an async generator: each part's title is yielded after its
        content has been appended to the document tree. The PDF is rendered
        and its metadata written only after the loop, so the generator has to
        be iterated to completion.

        Args:
            contents: One cleaned HTML tag per part, paired positionally with
                ``self.data["parts"]``.
            download_images: Accepted but not read anywhere in this method.

        Yields:
            The title of the part that was just appended.
        """
        # # Cover and Copyright Page
        await self.generate_cover_and_copyright_html()
        await self.generate_about_author_chapter()
        # The filled-in template becomes the document tree the chapters are appended to.
        self.tree = BeautifulSoup(self.template, "lxml")
        self.generate_toc()
        for part, content in zip(self.data["parts"], contents):
            insert_point = cast(bs4.Tag, self.tree.find("div", {"id": "book"}))
            insert_point.append(content)
            yield part["title"]
        # # About the Author page
        # about_author_html = await self.generate_about_author_chapter()
        # chapters.insert(0, cover_and_copyright_html)
        # chapters.append(about_author_html)
        with start_action(
            action_type="generate_pdf",
            output_filename=self.file.name,
            title=self.data["title"],
        ):
            # PDF generation with WeasyPrint, written to self.file by path.
            # The whole book (cover, copyright, table of contents, chapters) is one HTML tree at this point.
            font_config = FontConfiguration()
            stylesheet_obj = CSS(string=self.stylesheet, font_config=font_config)
            html_obj = HTML(string=str(self.tree))
            html_obj.write_pdf(
                self.file.name, stylesheets=[stylesheet_obj], font_config=font_config
            )
        with start_action(action_type="add_metadata") as action:
            # Metadata generation with Exiftool
            clean_description = (
                self.data["description"].strip().replace("\n", "$/")
            )  # exiftool doesn't parse \ns correctly, they support $/ for the same instead. `&#xa;` is another option.
            action.log(f"clean_description: {clean_description}")
            # The two booleans are formatted by the f-string below, i.e. as "True" / "False".
            metadata = {
                "Author": self.data["user"]["username"],
                "Title": self.data["title"],
                "Subject": clean_description,
                "CreationDate": self.data["createDate"],
                "ModDate": self.data["modifyDate"],
                "Keywords": ",".join(self.data["tags"]),
                "Language": self.data["language"]["name"],
                "Completed": self.data["completed"],
                "MatureContent": self.data["mature"],
                "Producer": "Dhanush Rambhatla (TheOnlyWayUp - https://rambhat.la) and WattpadDownloader",
            }  # As per https://exiftool.org/TagNames/PDF.html
            action.log(f"options: {metadata}")
            # NOTE(review): the config path is relative to the current working directory.
            with ExifTool(
                config_file="../exiftool.config", logger=exiftool_logger
            ) as et:
                # Custom configuration adds Completed and MatureContent tags.
                # exiftool logger logs executed command
                # NOTE(review): the target is self.file.file.name here but self.file.name
                # everywhere else; confirm both give the same path on the targeted Python version.
                et.execute(
                    *(
                        [f"-{key}={value}" for key, value in metadata.items()]
                        + [
                            "-overwrite_original",
                            self.file.file.name,
                        ]
                    )
                )
    def dump(self) -> BytesIO:
        """Return the generated PDF as an in-memory buffer and close the temporary file.

        The file was created with delete=True, so closing it removes it;
        a second call would therefore fail on the closed file.
        """
        self.file.seek(0)
        buffer = BytesIO(self.file.read())
        self.file.close()
        return buffer
 # ------ #
@@ -0,0 +1,13 @@
 # ruff: noqa: F401
 from .create_book import (
    fetch_cookies,
    fetch_story,
    fetch_story_content_zip,
    fetch_story_from_partId,
 )
 from .exceptions import PartNotFoundError, StoryNotFoundError, WattpadError
 from .generators import EPUBGenerator, PDFGenerator
 from .logs import logger
 from .parser import fetch_image
 from .utils import slugify
@@ -0,0 +1,46 @@
 from enum import Enum
 from pydantic import field_validator, model_validator
 from pydantic_settings import BaseSettings
 class CacheTypes(Enum):
    """Cache backends that Config.CACHE_TYPE may select."""

    file = "file"
    redis = "redis"  # Config requires REDIS_CONNECTION_URL when this is selected
 class Config(BaseSettings):
    """Cache-related settings, declared as a pydantic-settings model.

    Empty-string values for USE_CACHE and CACHE_TYPE are replaced by their
    defaults before validation, and the Redis URL must be present exactly
    when the Redis cache type is selected.
    """

    # Values can be overridden by environment variables.
    USE_CACHE: bool = True
    CACHE_TYPE: CacheTypes = CacheTypes.file
    REDIS_CONNECTION_URL: str = ""

    @field_validator("USE_CACHE", mode="before")
    def validate_use_cache(cls, value):
        """Fall back to the default (True) when the raw value is an empty string."""
        return True if value == "" else value

    @field_validator("CACHE_TYPE", mode="before")
    def validate_cache_type(cls, value):
        """Fall back to the default ("file") when the raw value is an empty string."""
        # Thanks https://stackoverflow.com/a/78157474
        return "file" if value == "" else value

    @model_validator(mode="after")
    def prevent_mismatched_redis_url(self):
        """Reject a Redis URL without the Redis cache type, and vice versa.

        Raises:
            ValueError: The URL and the selected cache type disagree.
        """
        url_given = bool(self.REDIS_CONNECTION_URL)
        if self.CACHE_TYPE == CacheTypes.file and url_given:
            raise ValueError(
                "REDIS_CONNECTION_URL provided when File cache selected. To use Redis as a cache, set CACHE_TYPE=redis."
            )
        if self.CACHE_TYPE == CacheTypes.redis and not url_given:
            raise ValueError(
                "REDIS_CONNECTION_URL not provided when Redis cache selected. To use File cache, set CACHE_TYPE=file."
            )
        return self
@@ -0,0 +1,129 @@
 from __future__ import annotations
 from io import BytesIO
 from typing import Optional
 import backoff
 from aiohttp import ClientResponseError
 from aiohttp_client_cache.session import CachedSession
 from eliot import start_action
 from pydantic import TypeAdapter
 from .exceptions import PartNotFoundError, StoryNotFoundError
 from .logs import logger
 from .models import Story
 from .vars import cache, headers
 story_ta = TypeAdapter(Story)
 # --- #
 async def fetch_cookies(username: str, password: str) -> dict:
    # source: https://github.com/TheOnlyWayUp/WP-DM-Export/blob/dd4c7c51cb43f2108e0f63fc10a66cd24a740e4e/src/API/src/main.py#L25-L58
    """Log in to Wattpad with user credentials and return the authorization cookies.

    The request is sent through a session created with cache=None.

    Args:
        username (str): Username.
        password (str): Password.

    Raises:
        ValueError: The login response status was not 204.
        ValueError: The login response carried no cookies.

    Returns:
        dict: Cookie names mapped to their values.
    """
    with start_action(action_type="api_fetch_cookies"):
        login_url = "https://www.wattpad.com/auth/login?nextUrl=%2F&_data=routes%2Fauth.login"
        # The username is lower-cased before sending (noted upstream as being for caching).
        credentials = {"username": username.lower(), "password": password}
        async with CachedSession(headers=headers, cache=None) as session:
            async with session.post(login_url, data=credentials) as response:
                if response.status != 204:
                    raise ValueError("Not a 204.")
                # Thanks https://stackoverflow.com/a/32281245
                cookies = {
                    name: morsel.value for name, morsel in response.cookies.items()
                }
                if not cookies:
                    raise ValueError("No cookies.")
                return cookies
 # --- API Calls --- #
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
 async def fetch_story_from_partId(
    part_id: int, cookies: Optional[dict] = None
 ) -> tuple[int, Story]:
    """Fetch Story metadata from a Part ID."""
    with start_action(action_type="api_fetch_storyFromPartId"):
        async with CachedSession(
            headers=headers, cache=None if cookies else cache
        ) as session:  # Don't cache requests with Cookies.
            async with session.get(
                f"https://www.wattpad.com/api/v3/story_parts/{part_id}?fields=groupId,group(tags,id,title,createDate,modifyDate,language(name),description,completed,mature,url,isPaywalled,user(username,avatar,description),parts(id,title),cover,copyright)"
            ) as response:
                body = await response.json()
                if response.status == 400:
                    match body.get("error_code"):
                        case 1020:  # "Story part not found"
                            logger.info(f"{part_id=} not found on Wattpad, returning.")
                            raise PartNotFoundError()
                response.raise_for_status()
        return int(body["groupId"]), story_ta.validate_python(body["group"])
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
async def fetch_story(story_id: int, cookies: Optional[dict] = None) -> Story:
    """Retrieve and validate a Story's metadata, looked up by Story ID.

    The shared cache is used only when no cookies are supplied. A 400 response
    carrying error code 1017 raises StoryNotFoundError; any other error status
    raises through raise_for_status().
    """
    with start_action(action_type="api_fetch_story", story_id=story_id):
        url = f"https://www.wattpad.com/api/v3/stories/{story_id}?fields=tags,id,title,createDate,modifyDate,language(name),description,completed,mature,url,isPaywalled,user(username,avatar,description),parts(id,title),cover,copyright"
        session_cache = None if cookies else cache

        async with CachedSession(
            headers=headers, cookies=cookies, cache=session_cache
        ) as session:
            async with session.get(url) as response:
                body = await response.json()

                # 1017 is Wattpad's "Story not found" error code.
                if response.status == 400 and body.get("error_code") == 1017:
                    logger.info(f"{story_id=} not found on Wattpad, returning.")
                    raise StoryNotFoundError()

                response.raise_for_status()

        return story_ta.validate_python(body)
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
async def fetch_story_content_zip(
    story_id: int, cookies: Optional[dict] = None
) -> BytesIO:
    """Download the zip archive of a Story's part contents as an in-memory stream.

    The shared cache is used only when no cookies are supplied. Error statuses
    raise through raise_for_status().
    """
    with start_action(action_type="api_fetch_storyZip", story_id=story_id):
        archive_url = f"https://www.wattpad.com/apiv2/?m=storytext&group_id={story_id}&output=zip"
        session_cache = None if cookies else cache

        async with CachedSession(
            headers=headers, cookies=cookies, cache=session_cache
        ) as session:
            async with session.get(archive_url) as response:
                response.raise_for_status()
                payload = await response.read()

        return BytesIO(payload)
@@ -0,0 +1,12 @@
 class WattpadError(Exception):
    """Root of the exception hierarchy for Wattpad related errors."""


 class StoryNotFoundError(WattpadError):
    """Display the "This story was not found" error to the user."""


 class PartNotFoundError(StoryNotFoundError):
    """A story part could not be found; handled like StoryNotFoundError by inheritance."""
@@ -0,0 +1,4 @@
 # ruff: noqa: F401
 from .epub import EPUBGenerator
 from .pdf import PDFGenerator
@@ -0,0 +1,108 @@
 from io import BytesIO
 from bs4 import BeautifulSoup
 from ebooklib import epub
 from ..models import Story
 from .types import AbstractGenerator
 class EPUBGenerator(AbstractGenerator):
    """Compile parsed part trees into an in-memory EPUB (self.book).

    Args:
        metadata (Story): Story Metadata.
        part_trees (list[BeautifulSoup]): Parsed part trees, in part order.
        cover (bytes): Cover image.
        images (list[list[bytes | None]]): Downloaded images per part, aligned
            with each tree's <img> tags. Empty when images were not downloaded;
            a None entry marks an image that could not be fetched.
    """

    def __init__(
        self,
        metadata: Story,
        part_trees: list[BeautifulSoup],
        cover: bytes,
        images: list[list[bytes | None]],
    ):
        self.story = metadata
        self.parts = part_trees
        self.cover = cover
        self.images = images
        self.book: epub.EpubBook = epub.EpubBook()

    def add_metadata(self):
        """Add metadata to epub."""
        self.book.add_author(self.story["user"]["username"])
        self.book.add_metadata("DC", "title", self.story["title"])
        self.book.add_metadata("DC", "description", self.story["description"])
        self.book.add_metadata("DC", "date", self.story["createDate"])
        self.book.add_metadata("DC", "modified", self.story["modifyDate"])
        self.book.add_metadata("DC", "language", self.story["language"]["name"])
        self.book.add_metadata(
            None, "meta", "", {"name": "tags", "content": ", ".join(self.story["tags"])}
        )
        # Booleans are stored as "0"/"1".
        self.book.add_metadata(
            None,
            "meta",
            "",
            {"name": "mature", "content": str(int(self.story["mature"]))},
        )
        self.book.add_metadata(
            None,
            "meta",
            "",
            {"name": "completed", "content": str(int(self.story["completed"]))},
        )

    def add_cover(self):
        """Add cover to epub."""
        self.book.set_cover("cover.jpg", self.cover)
        cover_chapter = epub.EpubHtml(
            file_name="titlepage.xhtml",  # Standard for cover page
        )
        cover_chapter.set_content('<img src="cover.jpg">')
        self.book.add_item(cover_chapter)

    def add_chapters(self):
        """Add chapters to epub, replacing references to image urls to static image paths if images are provided during initialization.

        Images whose download failed (None entries) are skipped and their
        <img> tags keep the original URL, matching PDFGenerator's behaviour.
        """
        chapters = []
        for idx, (part, tree) in enumerate(zip(self.story["parts"], self.parts)):
            chapter = epub.EpubHtml(
                title=part["title"], file_name=f"{idx}_{part['id']}.xhtml"
            )
            if self.images:
                for img_idx, (img_data, img_tag) in enumerate(
                    zip(self.images[idx], tree.find_all("img"))
                ):
                    if not img_data:
                        # An EpubImage without content cannot be written to the archive.
                        continue
                    path = f"static/{idx}_{part['id']}/{img_idx}.jpeg"
                    img = epub.EpubImage(
                        media_type="image/jpeg", content=img_data, file_name=path
                    )
                    self.book.add_item(img)
                    img_tag["src"] = path
            chapter.set_content(tree.prettify())
            self.book.add_item(chapter)
            chapters.append(chapter)
        # ! Review, are these needed? #11
        self.book.toc = chapters
        # Thanks https://github.com/aerkalov/ebooklib/blob/master/samples/09_create_image/create.py
        self.book.add_item(epub.EpubNcx())
        self.book.add_item(epub.EpubNav())
        # create spine
        self.book.spine = ["nav"] + chapters

    def compile(self):
        """Populate self.book with metadata, cover and chapters, in that order.

        Returns:
            Literal[True]: Compiled successfully.
        """
        self.add_metadata()
        self.add_cover()
        self.add_chapters()
        return True

    def dump(self) -> BytesIO:
        """Return the compiled EPUB as a buffer positioned at its start."""
        # Thanks https://stackoverflow.com/a/75398222
        buffer = BytesIO()
        epub.write_epub(buffer, self.book)
        buffer.seek(0)
        return buffer
@@ -0,0 +1,208 @@
 from base64 import b64encode
 from io import BytesIO
 from pathlib import Path
 from tempfile import NamedTemporaryFile, _TemporaryFileWrapper
 from bs4 import BeautifulSoup
 from exiftool import ExifTool
 from jinja2 import Template
 from weasyprint import CSS, HTML
 from weasyprint.text.fonts import FontConfiguration
 from ..models import Story
 from .types import AbstractGenerator
 # Directory named "pdf" next to this module; the stylesheet, HTML template and
 # exiftool configuration are read from it.
 DATA_PATH = Path(__file__).parent / "pdf"
 # License badge images referenced by COPYRIGHT_DATA.
 ASSET_PATH = DATA_PATH / "assets"
 # Each entry provides:
 #   name      - license name (used as the badge's alt text in the template)
 #   statement - str.format template taking {published_year} and {username}
 #   freedoms  - summary of what the license permits
 #   printing  - printing terms
 #   asset     - Path to the license badge image, or None when there is none
 COPYRIGHT_DATA = {
    1: {
        "name": "All Rights Reserved",
        "statement": "©️ {published_year} by {username}. All Rights Reserved.",
        "freedoms": "No reuse, redistribution, or modification without permission.",
        "printing": "Not allowed without explicit permission.",
        "asset": None,
    },
    2: {
        "name": "Public Domain",
        "statement": "This work is in the public domain. Originally published in {published_year} by {username}.",
        "freedoms": "Free to use for any purpose without permission.",
        "printing": "Allowed for personal or commercial purposes.",
        "asset": ASSET_PATH / "cc-zero.png",
    },
    3: {
        "name": "Creative Commons Attribution (CC-BY)",
        "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution 4.0 International License.",
        "freedoms": "Allows reuse, redistribution, and modification with credit to the author.",
        "printing": "Allowed with proper credit.",
        "asset": ASSET_PATH / "by.png",
    },
    4: {
        "name": "CC Attribution NonCommercial (CC-BY-NC)",
        "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.",
        "freedoms": "Allows reuse and modification for non-commercial purposes with credit.",
        "printing": "Allowed for non-commercial purposes with proper credit.",
        "asset": ASSET_PATH / "by-nc.png",
    },
    5: {
        "name": "CC Attribution NonCommercial NoDerivs (CC-BY-NC-ND)",
        "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NonCommercial-NoDerivs 4.0 International License.",
        "freedoms": "Allows sharing in original form for non-commercial purposes with credit; no modifications allowed.",
        "printing": "Allowed for non-commercial purposes in original form with proper credit.",
        "asset": ASSET_PATH / "by-nc-nd.png",
    },
    6: {
        "name": "CC Attribution NonCommercial ShareAlike (CC-BY-NC-SA)",
        "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.",
        "freedoms": "Allows reuse and modification for non-commercial purposes under the same license, with credit.",
        "printing": "Allowed for non-commercial purposes with proper credit under the same license.",
        "asset": ASSET_PATH / "by-nc-sa.png",
    },
    7: {
        "name": "CC Attribution ShareAlike (CC-BY-SA)",
        "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.",
        "freedoms": "Allows reuse and modification for any purpose under the same license, with credit.",
        "printing": "Allowed with proper credit under the same license.",
        "asset": ASSET_PATH / "by-sa.png",
    },
    8: {
        "name": "CC Attribution NoDerivs (CC-BY-ND)",
        "statement": "©️ {published_year} by {username}. This work is licensed under a Creative Commons Attribution-NoDerivs 4.0 International License.",
        "freedoms": "Allows sharing in original form for any purpose with credit; no modifications allowed.",
        "printing": "Allowed in original form with proper credit.",
        "asset": ASSET_PATH / "by-nd.png",
    },
 }  # Maps Wattpad Copyright IDs to their corresponding data.
 # Read the PDF stylesheet and HTML template once at import time. The encoding
 # is pinned to UTF-8 (the template declares charset UTF-8) so the result does
 # not depend on the process locale.
 STYLESHEET = (DATA_PATH / "stylesheet.css").read_text(encoding="utf-8")
 TEMPLATE = (DATA_PATH / "book.html").read_text(encoding="utf-8")
 class PDFGenerator(AbstractGenerator):
    """Compile parsed part trees into a PDF written to a named temporary file (self.book).

    Args:
        metadata (Story): Story Metadata.
        part_trees (list[BeautifulSoup]): Parsed part trees, in part order.
        cover (bytes): Cover image.
        images (list[list[bytes | None]]): Downloaded images per part, aligned
            with each tree's <img> tags. Empty when images were not downloaded;
            a None entry marks an image that could not be fetched.
        author_image (bytes): The author's avatar image.
    """

    def __init__(
        self,
        metadata: Story,
        part_trees: list[BeautifulSoup],
        cover: bytes,
        images: list[list[bytes | None]],
        author_image: bytes,
    ):
        self.story = metadata
        self.parts = part_trees
        self.cover = cover
        self.images = images
        self.author = author_image
        self.book: _TemporaryFileWrapper = NamedTemporaryFile(suffix=".pdf")
        self.content = TEMPLATE

    def generate_chapters(self) -> dict[int, str]:
        """Return a dictionary of part_ids to prettified part HTML, with image URLs replaced with base64 encoded images if provided during initialization."""
        data: dict[int, str] = {}
        for idx, (part, tree) in enumerate(zip(self.story["parts"], self.parts)):
            if self.images:
                for img_data, img_tag in zip(self.images[idx], tree.find_all("img")):
                    if not img_data:
                        # Download failed; the tag keeps its original URL.
                        continue
                    img_tag["src"] = (
                        f"data:image/jpg;base64,{b64encode(img_data).decode()}"
                    )
            data[part["id"]] = tree.prettify()
        return data

    def populate_template(self, parts: dict[int, str]):
        """Populate HTML Template with Story data.

        Args:
            parts (dict[int, str]): Part IDs mapped to their HTML, as returned
                by generate_chapters().

        Raises:
            KeyError: The story's copyright ID is not present in COPYRIGHT_DATA.
        """
        copyright = COPYRIGHT_DATA[self.story["copyright"]]
        data = {
            "statement": copyright["statement"].format(
                username=self.story["user"]["username"],
                # Text before the first "-" of createDate.
                published_year=self.story["createDate"].split("-", 2)[0],
            ),
            "author": self.story["user"]["username"],
            "freedoms": copyright["freedoms"],
            "printing": copyright["printing"],
            "book_id": self.story["id"],
            "book_title": self.story["title"],
            "cover": f"data:image/jpg;base64,{b64encode(self.cover).decode()}",
            "username": self.story["user"]["username"],
            "description": self.story["description"],
            "avatar": b64encode(self.author).decode(),
            "copyright": {
                "data": (
                    b64encode(copyright["asset"].read_bytes()).decode()
                    if copyright["asset"]
                    else ""
                ),
                "name": copyright["name"],
            },
            "parts": parts,
        }
        # Autoescape so story-supplied text (title, description, username) is
        # rendered as text rather than parsed as markup. The template marks the
        # chapter HTML with "| safe", so it is still inserted verbatim.
        self.content: str = Template(self.content, autoescape=True).render(data)

    def generate_pdf(self):
        """Generate and write the PDF to a temporary file (self.book)."""
        font_config = FontConfiguration()
        stylesheet_obj = CSS(string=STYLESHEET, font_config=font_config)
        html_obj = HTML(string=self.content)
        html_obj.write_pdf(
            self.book.name, stylesheets=[stylesheet_obj], font_config=font_config
        )

    def add_metadata(self):
        """Write metadata to generated PDF file at self.book, using ExifTool."""
        clean_description = (
            self.story["description"].strip().replace("\n", "$/")
        )  # exiftool doesn't parse \ns correctly, they support $/ for the same instead. `&#xa;` is another option.
        metadata = {
            "Author": self.story["user"]["username"],
            "Title": self.story["title"],
            "Subject": clean_description,
            "CreationDate": self.story["createDate"],
            "ModDate": self.story["modifyDate"],
            "Keywords": ",".join(self.story["tags"]),
            "Language": self.story["language"]["name"],
            "Completed": self.story["completed"],
            "MatureContent": self.story["mature"],
            "Producer": "Dhanush Rambhatla (TheOnlyWayUp - https://rambhat.la) and WattpadDownloader",
        }  # As per https://exiftool.org/TagNames/PDF.html
        with ExifTool(config_file=DATA_PATH / "exiftool.config") as et:
            # Custom configuration adds Completed and MatureContent tags.
            # exiftool logger logs executed command
            et.execute(
                *(
                    [f"-{key}={value}" for key, value in metadata.items()]
                    + [
                        "-overwrite_original",
                        self.book.name,
                    ]
                )
            )

    def compile(self):
        """Render the chapters and template, write the PDF, then tag it with metadata.

        Returns:
            Literal[True]: Compiled successfully.
        """
        parts = self.generate_chapters()
        self.populate_template(parts)
        self.generate_pdf()
        self.add_metadata()
        return True

    def dump(self) -> BytesIO:
        """Return the compiled PDF as a buffer and close (delete) the temporary file.

        The file is read back by path rather than through the handle opened in
        __init__: ExifTool's -overwrite_original replaces the file by renaming
        a temporary copy over it, so the original handle can still refer to the
        PDF as it was before the metadata was written.
        """
        try:
            return BytesIO(Path(self.book.name).read_bytes())
        finally:
            # Always release the temporary file, even if reading it failed.
            self.book.close()
@@ -0,0 +1,73 @@
 <!DOCTYPE html>
 <html lang="{{ langcode }}">
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{{ book_title }}</title>
    <section class="fullpage">
        <img src="{{ cover }}" alt="Cover">
    </section>
    <div id="copyright-container">
        <h1 id="copyright-notice">Copyright Notice</h1>
        <h2 id="copyright-title">{{ book_title }}</h2>
        <p id="copyright-author">By {{ author }}</p>
        <div id="copyright-separator"></div>
        <p id="copyright-ex-libris">Ex Libris Sapientiae</p>
        <div id="copyright-separator"></div>
        {% if copyright.data %}
        <img src="data:image/jpg;base64,{{copyright.data}}" 
 alt="{{copyright.name}}" 
 width="88" 
 height="31" 
 id="copyright-license-image">
        {% endif %}
        <p id="copyright-copyright">{{ statement }}</p>
        <p id="copyright-rights">{{ freedoms }}</p>
        <p id="copyright-printing">Printing: {{ printing }}</p>
        <p id="book-link">
            ID: {{ book_id }}.
            <a href="https://wattpad.com/story/{{ book_id }}" target="_blank" id="copyright-link">View this Book Online</a>
        </p>
    </div>
    <div id="book">
        <section id="contents" class="toc">
            <h1>Table of Contents</h1>
            <ul>
                {% for part_id in parts %}
                    <li><a href="#{{part_id}}"></a></li>
                {% endfor %}
            </ul>
            </section>
            {% for part_id in parts %}
            {{parts[part_id] | safe}}
        {% endfor %}
    </div>
    <h1>About the Author</h1>
    <div id="author-container">
        <div id="author-about">
            <img src="data:image/jpg;base64,{{avatar}}" alt="{{author}}'s profile picture" id="author-profile-picture">
            <h2 id="author-name">
                <a href="https://wattpad.com/user/{{ username }}" id="author-link">{{ username }}</a>
            </h2>
            <hr id="author-divider">
            <p id="author-bio">
                {{ description }}
            </p>
        </div>
    </div>
 </html>
@@ -205,6 +205,8 @@ section {
 #contents a {
  color: inherit;
  text-decoration: none;
  display: flex;
  justify-content: space-between;
 }
 #contents a::before {
  content: target-counter(attr(href), h2-counter) '. ' target-text(attr(href));
@@ -389,6 +391,14 @@ a:hover {
 }
 #book-link {
  font-size: 14px;
  color: #666;
  margin: 8px 0;
  text-align: center;
 }
 #copyright-separator {
  width: 100%;
  max-width: 400px;
@@ -0,0 +1,47 @@
 from io import BytesIO
 from tempfile import _TemporaryFileWrapper
 from typing import Literal
 from bs4 import BeautifulSoup
 from ebooklib.epub import EpubBook
 from ..models import Story
 class AbstractGenerator:
    """Base class for generators that turn parsed part trees into a book file.

    Args:
        metadata (Story): Story Metadata.
        part_trees (List[BeautifulSoup]): Parsed part trees.
        cover (bytes): Cover image.
        images (List[List[bytes | None]]): An array of images for each chapter, if images have been downloaded.
    """

    def __init__(
        self,
        metadata: Story,
        part_trees: list[BeautifulSoup],
        cover: bytes,
        images: list[list[bytes | None]],
    ):
        self.story, self.parts = metadata, part_trees
        self.cover, self.images = cover, images
        # Placeholder; this base class never assigns a real book object.
        self.book: EpubBook | _TemporaryFileWrapper = None  # type: ignore

    def compile(self) -> Literal[True]:
        """Build the in-memory representation of the book for this generator's format.

        Returns:
            Literal[True]: Compiled successfully.
        """
        return True

    def dump(self) -> BytesIO:
        """Return a Buffer of the compiled file (empty in this base class)."""
        return BytesIO()
@@ -0,0 +1,19 @@
 import logging
 from os import environ
 from eliot import to_file
 from eliot.stdlib import EliotHandler
 # One shared handler that passes standard-library log records on to Eliot.
 handler = EliotHandler()
 logging.getLogger("fastapi").setLevel(logging.INFO)
 logging.getLogger("fastapi").addHandler(handler)
 # Records from the "exiftool" logger go to the same handler.
 exiftool_logger = logging.getLogger("exiftool")
 exiftool_logger.addHandler(handler)
 # Package logger. NOTE(review): it is instantiated directly instead of through
 # logging.getLogger, so it is not registered in the logging hierarchy; confirm
 # this is intended before changing it.
 logger = logging.Logger("wpd")
 logger.addHandler(handler)
 # Eliot output is written to eliot.log only when the DEBUG environment
 # variable is set to a non-empty value.
 if environ.get("DEBUG"):
    to_file(open("eliot.log", "wb"))
@@ -0,0 +1,42 @@
 from typing import Optional, TypedDict
 class CopyrightData(TypedDict):
    """Typed shape for one copyright/license entry.

    NOTE(review): the PDF generator's COPYRIGHT_DATA entries carry an "asset"
    key rather than "image_url"; confirm whether this type is still in sync.
    """

    name: str
    statement: str
    freedoms: str
    printing: str
    image_url: Optional[str]
 class Language(TypedDict):
    """Language of a Story; only the name is held."""

    name: str
 class User(TypedDict):
    """Author details attached to a Story."""

    username: str
    avatar: str  # presumably an image URL; confirm against the API response
    description: str
 class Part(TypedDict):
    """A single part (chapter) of a Story, identified by its ID."""

    id: int
    title: str
 class Story(TypedDict):
    """Story metadata; the fields mirror those requested from the Wattpad API."""

    id: str
    title: str
    createDate: str
    modifyDate: str
    language: Language
    user: User
    description: str
    cover: str  # presumably the cover image URL; confirm against the API response
    completed: bool
    tags: list[str]
    mature: bool
    url: str
    parts: list[Part]
    isPaywalled: bool
    copyright: int  # Wattpad copyright ID
@@ -0,0 +1,86 @@
 import asyncio
 from itertools import batched
 from typing import cast
 from aiohttp import ClientSession
 from bs4 import BeautifulSoup, Tag
 from eliot import start_action
 from .vars import headers
 def clean_tree(title: str, id: int, body: str) -> BeautifulSoup:
    """Rebuild a part's raw HTML as a minimal tree: a titled heading plus a section of cleaned content.

    Only top-level <p> elements of the source body are considered. A paragraph
    holding text (optionally bold/italic/underlined) is moved over with all
    attributes except "style" removed; enclosed images and line breaks are
    re-created as bare <img>/<br> tags that inherit the paragraph's style.

    Args:
        title (str): Part title, inserted as the heading's text.
        id (int): Part ID, used as the heading's id attribute.
        body (str): Raw HTML of the part.

    Returns:
        BeautifulSoup: The cleaned tree. The section is empty when the source
        has no <body> (e.g. empty part content).
    """
    original_soup = BeautifulSoup(body, features="lxml")
    new_soup = BeautifulSoup(
        '<h1 class="chapter-title"></h1>\n'
        '<section class="chapter-body"></section>\n',
        features="html.parser",  # head/body tags aren't generated
    )

    # Set the id and title through the tree API instead of interpolating them
    # into markup, so characters such as "<" or "&" in a title stay text.
    heading = cast(Tag, new_soup.find("h1"))
    heading["id"] = str(id)
    heading.string = title

    insert_at = cast(Tag, new_soup.find("section"))
    source_body = original_soup.find("body")
    if source_body is None:
        # Nothing was parsed, so there is no content to copy.
        return new_soup

    # Snapshot the children first: moving a <p> into the new tree detaches it
    # from the tree being iterated.
    for tag in cast(list[Tag], list(cast(Tag, source_body).children)):
        if tag.name != "p":  # Casted to lower
            continue
        style = tag.attrs.get("style")
        for child in cast(list[Tag], tag.children):
            # tag is a <p> enclosing either text, media, or a break
            if child.name in [None, "b", "i", "u", "strong", "em"]:
                # text is enclosed, can be italic, bold, underlined, or a mix
                tag.attrs = {}
                p_tag = tag
                if style:
                    p_tag["style"] = style
                insert_at.append(p_tag)
                break
            elif child.name == "img":
                # image is enclosed
                img_tag = Tag(name="img")
                dimensions = {
                    "height": child.attrs.get("data-original-height"),
                    "width": child.attrs.get("data-original-width"),
                }
                # Leave out dimensions the source does not provide rather than
                # emitting valueless attributes.
                img_tag.attrs = {
                    key: value for key, value in dimensions.items() if value is not None
                }
                img_tag["src"] = child["src"]
                if style:
                    img_tag["style"] = style
                insert_at.append(img_tag)
            elif child.name == "br":
                # br tag is enclosed
                br_tag = Tag(name="br", can_be_empty_element=True)
                if style:
                    br_tag["style"] = style
                insert_at.append(br_tag)
    return new_soup
 async def fetch_image(url: str) -> bytes | None:
    """Download an image and return its bytes, or None when the response is not OK."""
    with start_action(action_type="api_fetch_image", url=url):
        # A plain ClientSession is used here: images are not cached.
        async with ClientSession(headers=headers) as session:
            async with session.get(url) as response:
                return (await response.read()) if response.ok else None
 async def fetch_tree_images(tree: BeautifulSoup) -> list[bytes | None]:
    """Download every image referenced in the tree, at most three concurrently.

    Args:
        tree (BeautifulSoup): Parsed part tree whose <img> tags carry a src.

    Returns:
        list[bytes | None]: Image data in the same order as
        tree.find_all("img"); None marks an image that could not be fetched.
    """
    image_urls = [img["src"] for img in tree.find_all("img")]
    images: list[bytes | None] = []
    for chunk in batched(image_urls, 3):
        # gather() returns results in argument order, so alignment is preserved.
        images.extend(await asyncio.gather(*(fetch_image(url) for url in chunk)))
    return images
@@ -0,0 +1,25 @@
 import re
 import unicodedata
 def slugify(value, allow_unicode=False) -> str:
    """
    Turn any value into a slug.

    Taken from https://github.com/django/django/blob/master/django/utils/text.py
    Thanks https://stackoverflow.com/a/295466.

    The value is stringified and Unicode-normalised; unless 'allow_unicode' is
    True it is also reduced to ASCII. The text is lower-cased, characters other
    than alphanumerics, underscores, hyphens and whitespace are removed, runs
    of whitespace and hyphens become a single hyphen, and leading/trailing
    hyphens and underscores are stripped.
    """
    text = unicodedata.normalize("NFKC" if allow_unicode else "NFKD", str(value))
    if not allow_unicode:
        # Drop whatever cannot be represented in ASCII after decomposition.
        text = text.encode("ascii", "ignore").decode("ascii")
    cleaned = re.sub(r"[^\w\s-]", "", text.lower())
    hyphenated = re.sub(r"[-\s]+", "-", cleaned)
    return hyphenated.strip("-_")
@@ -0,0 +1,28 @@
 from aiohttp_client_cache import FileBackend, RedisBackend
 from dotenv import load_dotenv
 from .config import CacheTypes, Config
 from .logs import logger
 # Default request headers: a desktop-browser User-Agent string.
 headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
 }
 # Load variables from a .env file (override=True lets them replace variables
 # that are already set) before the settings are read.
 load_dotenv(override=True)
 config = Config()
 # Build the shared response cache backend, or None when caching is disabled.
 if config.USE_CACHE:
    match config.CACHE_TYPE:
        case CacheTypes.file:
            cache = FileBackend(use_temp=True, expire_after=43200)  # 12 hours
        case CacheTypes.redis:
            cache = RedisBackend(
                cache_name="wpd-aiohttp-cache",
                address=config.REDIS_CONNECTION_URL,
                expire_after=43200,  # 12 hours
            )
 else:
    cache = None
 logger.info(f"Using {cache=}")
@@ -1,13 +1,15 @@
 """WattpadDownloader API Server."""
 from typing import Optional
 import asyncio
 from pathlib import Path
 from enum import Enum
 from pathlib import Path
 from typing import Optional
 from zipfile import ZipFile
-from eliot import start_action
+
 from aiohttp import ClientResponseError
-from fastapi import FastAPI, Request
+from bs4 import BeautifulSoup
 from eliot import start_action
 from fastapi import FastAPI, HTTPException, Request
 from fastapi.responses import (
    FileResponse,
    HTMLResponse,
@@ -15,29 +17,25 @@ from fastapi.responses import (
    StreamingResponse,
 )
 from fastapi.staticfiles import StaticFiles
 from create_book import (
    EPUBGenerator,
    PDFGenerator,
    fetch_story,
    fetch_story_from_partId,
    fetch_story_content_zip,
    fetch_image,
    fetch_cookies,
    WattpadError,
    StoryNotFoundError,
-    generate_clean_part_html,
+    WattpadError,
-    slugify,
+    fetch_cookies,
    fetch_image,
    fetch_story,
    fetch_story_content_zip,
    fetch_story_from_partId,
    logger,
    slugify,
 )
-
+from create_book.parser import clean_tree, fetch_tree_images
 app = FastAPI()
 BUILD_PATH = Path(__file__).parent / "build"
 headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
 }
 class RequestCancelledMiddleware:
    # Thanks https://github.com/fastapi/fastapi/discussions/11360#discussion-6427734
@@ -77,7 +75,7 @@ app.add_middleware(RequestCancelledMiddleware)
 class DownloadFormat(Enum):
-    # pdf = "pdf"
+    pdf = "pdf"
    epub = "epub"
@@ -170,31 +168,44 @@ async def handle_download(
        cover_data = await fetch_image(
            metadata["cover"].replace("-256-", "-512-")
        )  # Increase resolution
-
+        if not cover_data:
-        match format:
+            raise HTTPException(status_code=422)
            case DownloadFormat.epub:
                book = EPUBGenerator(metadata, cover_data)
                media_type = "application/epub+zip"
            # case DownloadFormat.pdf:
            #     book = PDFGenerator(metadata, cover_data)
            #     media_type = "application/pdf"
        logger.info(f"Retrieved story metadata and cover ({story_id=})")
        story_zip = await fetch_story_content_zip(story_id, cookies)
        archive = ZipFile(story_zip, "r")
-        part_contents = [
+        part_trees: list[BeautifulSoup] = [
-            generate_clean_part_html(
+            clean_tree(
-                part, archive.read(str(part["id"])).decode("utf-8")
+                part["title"], part["id"], archive.read(str(part["id"])).decode("utf-8")
            )
            for part in metadata["parts"]
        ]
-        async for title in book.add_chapters(
+        images = (
-            part_contents, download_images=download_images
+            [await fetch_tree_images(tree) for tree in part_trees]
-        ):
+            if download_images
-            ...
+            else []
        )
        match format:
            case DownloadFormat.epub:
                book = EPUBGenerator(metadata, part_trees, cover_data, images)
                media_type = "application/epub+zip"
            case DownloadFormat.pdf:
                author_image = await fetch_image(
                    metadata["user"]["avatar"].replace("-256-", "-512-")
                )
                if not author_image:
                    raise HTTPException(status_code=422)
                book = PDFGenerator(
                    metadata, part_trees, cover_data, images, author_image
                )
                media_type = "application/pdf"
        logger.info(f"Retrieved story metadata and cover ({story_id=})")
        book.compile()
        book_buffer = book.dump()
@@ -1,54 +0,0 @@
 <!DOCTYPE html>
 <html lang="{langcode}">
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{book_title}</title>
    <section class="fullpage">
        <img src="{cover}" alt="Cover">
      </section>
      <div id="copyright-container">
        <h1 id="copyright-notice">Copyright Notice</h1>
        <h2 id="copyright-title">{book_title}</h2>
        <p id="copyright-author">By {author}</p>
        <div id="copyright-separator"></div>
        <p id="copyright-ex-libris">Ex Libris Sapientiae</p>
        <div id="copyright-separator"></div>
        {copyright_image}
        <p id="copyright-copyright">{statement}</p>
        <p id="copyright-rights">{freedoms}</p>
        <p id="copyright-printing">Printing: {printing}</p>
        <p id="copyright-printing">ID: {book_id}. <a href="https://wattpad.com/story/{book_id}" target="_blank" id="copyright-link">View this Book Online</a></p>
    </div>
    <div id="book">
    </div>
    <h1>About the Author</h1>
    <div id="author-container">
        <div id="author-about">
            {avatar}
            <h2 id="author-name"><a href="https://wattpad.com/user/{username}" id="author-link">{username}</a></h2>
            <hr id="author-divider">
            <p id="author-bio">
                {description}
            </p>
        </div>
    </div>
 </html>
@@ -1,120 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <xsl:stylesheet version="2.0"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                xmlns:outline="http://wkhtmltopdf.org/outline"
                xmlns="http://www.w3.org/1999/xhtml">
  <!--
    Table of contents stylesheet.
    Transforms an outline document (elements in the
    http://wkhtmltopdf.org/outline namespace) into an XHTML page headed
    "Table of Contents", with one list row per titled outline item.
  -->
  <!--
    NOTE(review): doctype-public names XHTML 1.0 Strict while doctype-system
    points at the transitional DTD. Left as is because the consuming
    renderer may rely on this exact output; confirm before changing.
  -->
  <xsl:output doctype-public="-//W3C//DTD XHTML 1.0 Strict//EN"
              doctype-system="http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
              indent="yes" />
  <!--
    Root template: matches the outline element and emits the page skeleton
    (embedded CSS, heading, top-level list). The list is filled from
    outline:item/outline:item, i.e. items nested one level below the first
    level of items; the first-level items themselves get no row.
  -->
  <xsl:template match="outline:outline">
    <html>
      <head>
        <!--
          Embedded styles: the four PT Serif faces loaded from the relative
          ./fonts directory, helper classes for each face, the heading style,
          and the list/row styling used by the entries below.
        -->
 <style>
@font-face {
  font-family: 'PT Serif';
  src: url('./fonts/PTSerif-Regular.ttf') format('truetype');
  font-weight: 400;
  font-style: normal;
 }
@font-face {
  font-family: 'PT Serif';
  src: url('./fonts/PTSerif-Bold.ttf') format('truetype');
  font-weight: 700;
  font-style: normal;
 }
@font-face {
  font-family: 'PT Serif';
  src: url('./fonts/PTSerif-Italic.ttf') format('truetype');
  font-weight: 400;
  font-style: italic;
 }
@font-face {
  font-family: 'PT Serif';
  src: url('./fonts/PTSerif-BoldItalic.ttf') format('truetype');
  font-weight: 700;
  font-style: italic;
 }
 .pt-serif-regular {
  font-family: "PT Serif", serif;
  font-weight: 400;
  font-style: normal;
 }
 .pt-serif-bold {
  font-family: "PT Serif", serif;
  font-weight: 700;
  font-style: normal;
 }
 .pt-serif-regular-italic {
  font-family: "PT Serif", serif;
  font-weight: 400;
  font-style: italic;
 }
 .pt-serif-bold-italic {
  font-family: "PT Serif", serif;
  font-weight: 700;
  font-style: italic;
 }
 h1 {
  text-align: center;
  font-family: "PT Serif", serif !important;
  font-weight: 700 !important;
  font-style: normal !important;
  font-size: 36px !important; /* Uniform size */
  margin-bottom: 20px; /* Space below the heading */
  border-bottom: 4px solid black; /* Black line */
  padding-bottom: 10px; /* Space between text and line */
 }
          div {border-bottom: 1px dashed rgb(100,000,100);
          padding-top: 5px;}
          span {float: right;}
          li {list-style: none;}
          ul {
            font-size: 22px;
            font-family: arial;
          }
          ul ul {font-size: 80%; }
          ul {padding-left: 0em;}
          ul ul {padding-left: 1em;}
          a {text-decoration:none; color: black;}
        </style>
      </head>
      <body>
        <h1>Table of Contents</h1>
        <ul><xsl:apply-templates select="outline:item/outline:item"/></ul>
      </body>
    </html>
  </xsl:template>
  <!--
    Item template: renders one outline item as a list entry.
    A row (title link on the left, page number floated right by the span
    rule) is emitted only when the title attribute is non-empty. The nested
    list of child items is emitted either way, so children of an untitled
    item still appear.
  -->
  <xsl:template match="outline:item">
    <li>
      <xsl:if test="@title!=''">
        <div>
          <a class="pt-serif-regular">
            <!-- The link attribute, when present, becomes the anchor target. -->
            <xsl:if test="@link">
              <xsl:attribute name="href"><xsl:value-of select="@link"/></xsl:attribute>
            </xsl:if>
            <!-- The backLink attribute, when present, becomes the anchor name. -->
            <xsl:if test="@backLink">
              <xsl:attribute name="name"><xsl:value-of select="@backLink"/></xsl:attribute>
            </xsl:if>
            <xsl:value-of select="@title" /> 
          </a>
          <span> <xsl:value-of select="@page" /> </span>
        </div>
      </xsl:if>
      <ul>
        <xsl:comment>added to prevent self-closing tags in QtXmlPatterns</xsl:comment>
        <xsl:apply-templates select="outline:item"/>
      </ul>
    </li>
  </xsl:template>
 </xsl:stylesheet>