Merge branch 'feature/#31-zip-downloading' into feature/#29-pdf-downloads

2024-12-22 10:58:28 +00:00
parent 8dc7d16578 a1191b2600
commit 5215689836
2 changed files with 107 additions and 112 deletions
@@ -2,7 +2,6 @@ from __future__ import annotations
 from typing import List, Optional, Tuple, cast
 from typing_extensions import TypedDict
 import re
 import json
 import logging
 import tempfile
 import unicodedata
@@ -127,16 +126,77 @@ def smart_trim(text: str, max_length: int = 400) -> str:
    return to_return
def generate_clean_part_html(part: Part, content: str) -> bs4.Tag:
    """Build a clean ``<section>`` element for one part from its raw HTML.

    The section starts with an ``<h1>`` carrying the part title, followed by
    one element per direct child of every ``<p>`` found in ``content``:
    text nodes, ``<b>`` and ``<i>`` each become their own ``<p>``, and
    ``<img>`` becomes a bare ``<img>`` followed by a ``<br>``.

    Args:
        part: Part metadata; ``part["id"]`` and ``part["title"]`` are read.
        content: Raw HTML of the part.

    Returns:
        The populated ``<section>`` tag.

    Raises:
        RuntimeError: If the section skeleton cannot be located (should not
            happen, the template is static).
    """
    chapter_id = part["id"]

    clean = BeautifulSoup(
        f"""
    <section id="section_{chapter_id}" class="chapitre">
        <h1 id="{chapter_id}" class="chapter-title"></h1>
    </section>
    """,
        "html.parser",
    )  # html.parser doesn't create <html>/<body> tags automatically

    section = cast(bs4.Tag, clean.find("section"))
    heading = cast(bs4.Tag, clean.find("h1"))
    if section is None or heading is None:
        raise RuntimeError("Chapter template is missing its <section>/<h1> skeleton")

    # The title is assigned as a text node instead of being interpolated into
    # the markup, so characters such as "<" or "&" in a title cannot break or
    # inject HTML. This treats the title as plain text (not pre-escaped HTML).
    heading.string = str(part["title"])

    source = BeautifulSoup(content, "lxml")

    for br in source.find_all("br"):
        # Drop a <br> that has no next sibling, or whose next sibling has the
        # name "br" or no name at all.
        if not br.next_sibling or br.next_sibling.name in ["br", None]:
            br.decompose()

    for paragraph in source.find_all("p"):
        # Iterate over a snapshot: children may be decomposed while looping.
        for node in list(paragraph.children):
            if not node:
                continue

            if isinstance(node, bs4.element.Tag):
                if node.name == "br":
                    node.decompose()
                elif node.name == "img":
                    src = node.get("src")
                    if src is None:
                        # An image without a source cannot be rendered or
                        # downloaded later; skip it rather than crash.
                        continue
                    img_tag = clean.new_tag("img")
                    img_tag["src"] = src
                    section.append(img_tag)
                    section.append(clean.new_tag("br"))
                elif node.name in ("b", "i"):
                    # Keep only the text, re-wrapped as <p><b>..</b></p> or
                    # <p><i>..</i></p>; nested markup is flattened.
                    p_tag = clean.new_tag("p")
                    inline_tag = clean.new_tag(node.name)
                    inline_tag.append(clean.new_string(node.text))
                    p_tag.append(inline_tag)
                    section.append(p_tag)
            elif isinstance(node, bs4.element.NavigableString):
                p_tag = clean.new_tag("p")
                p_tag.append(clean.new_string(node.text))
                section.append(p_tag)

        if not list(paragraph.children):
            # Some <p> tags only contain <br>s; once those are removed the
            # paragraph is empty and is removed from the parsed input too.
            paragraph.decompose()

    return section
 def slugify(value, allow_unicode=False) -> str:
@@ -269,8 +329,8 @@ class PartNotFoundError(StoryNotFoundError): ...
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
 async def fetch_story_from_partId(
    part_id: int, cookies: Optional[dict] = None
-) -> Tuple[str, Story]:
+) -> Tuple[int, Story]:
-    """Return a Story ID from a Part ID."""
+    """Fetch Story ID from Part ID."""
    with start_action(action_type="api_fetch_storyFromPartId"):
        async with CachedSession(
            headers=headers, cache=None if cookies else cache
@@ -288,12 +348,12 @@ async def fetch_story_from_partId(
                response.raise_for_status()
-        return str(body["groupId"]), story_ta.validate_python(body["group"])
+        return int(body["groupId"]), story_ta.validate_python(body["group"])
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
 async def fetch_story(story_id: int, cookies: Optional[dict] = None) -> Story:
-    """Taking a story_id, return its information from the Wattpad API."""
+    """Fetch Story metadata using a Story ID."""
    with start_action(action_type="api_fetch_story", story_id=story_id):
        async with CachedSession(
            headers=headers, cookies=cookies, cache=None if cookies else cache
@@ -315,29 +375,25 @@ async def fetch_story(story_id: int, cookies: Optional[dict] = None) -> Story:
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
async def fetch_story_content_zip(
    story_id: int, cookies: Optional[dict] = None
) -> BytesIO:
    """Return a BytesIO stream of a .zip file containing each part's HTML content.

    Args:
        story_id: ID of the story whose parts are requested.
        cookies: Optional cookies sent with the request; when given, the
            response cache is bypassed.

    Returns:
        An in-memory stream holding the raw response body.

    Raises:
        PartNotFoundError: If the API answers 400 with error code 463.
        ClientResponseError: For any other non-success status (retried by
            the ``backoff`` decorator for up to 15 seconds).
    """
    with start_action(action_type="api_fetch_storyZip", story_id=story_id):
        async with CachedSession(
            headers=headers,
            cookies=cookies,
            cache=None if cookies else cache,
        ) as session:
            async with session.get(
                f"https://www.wattpad.com/apiv2/?m=storytext&group_id={story_id}&output=zip"
            ) as response:
                # Read the body once, as bytes: a successful response is a
                # binary archive, so decoding it as text is unnecessary and
                # can fail on non-text data.
                payload = await response.read()

                if response.status == 400:
                    try:
                        data = json.loads(payload)
                    except ValueError:
                        # Not JSON (or not decodable); fall through to the
                        # generic status check below.
                        data = None

                    # 463: "Could not find any parts for that story"
                    if isinstance(data, dict) and data.get("code") == 463:
                        logger.info(
                            f"{story_id=} for text not found on Wattpad, returning."
                        )
                        raise PartNotFoundError()

                response.raise_for_status()

        return BytesIO(payload)
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
@@ -397,7 +453,9 @@ class EPUBGenerator:
        cover_chapter.set_content('<img src="cover.jpg">')
        self.epub.add_item(cover_chapter)
-    async def add_chapters(self, contents: List[str], download_images: bool = False):
+    async def add_chapters(
        self, contents: List[bs4.Tag], download_images: bool = False
    ):
        """Add chapters to the Epub, downloading images if necessary. Sets the table of contents and spine."""
        chapters: List[epub.EpubHtml] = []
@@ -412,8 +470,9 @@ class EPUBGenerator:
                uid=str(part["id"]).encode(),
            )
            str_content = content.prettify()
            if download_images:
-                soup = BeautifulSoup(content, "lxml")
+                soup = content
                async with CachedSession(
                    headers=headers, cache=None
@@ -432,11 +491,11 @@ class EPUBGenerator:
                            self.epub.add_item(img)
                            # Fetch image and pack
-                            content = content.replace(
+                            str_content = str_content.replace(
                                str(image["src"]), f"static/{cidx}/{idx}.jpeg"
                            )
-            chapter.set_content(content)
+            chapter.set_content(str_content)
            self.epub.add_item(chapter)
            chapters.append(chapter)
@@ -535,7 +594,7 @@ class PDFGenerator:
        with open("./pdf/book.html") as reader:
            self.template = reader.read()
-    async def genernate_cover_and_copyright_html(
+    async def generate_cover_and_copyright_html(
        self,
    ) -> str:
        """Generate Cover and Copyright file, fetch copyright image (cached), use self.cover for cover."""
@@ -618,75 +677,6 @@ id="copyright-license-image">""".format(
        return about_author
    def generate_clean_part_html(self, part: Part, content: str) -> str:
        """Build a chapter ``<section>`` for ``part`` and append it to the book tree.

        A section skeleton (with an ``<h1>`` holding the part title) is parsed
        from a template, filled with one element per direct child of every
        ``<p>`` in ``content``, and appended to the ``<div id="book">`` element
        of ``self.tree``.

        Args:
            part: Part metadata; ``part["title"]`` and ``part["id"]`` are read.
            content: Raw HTML of the part.

        Returns:
            ``str(clean)``, the serialised template soup.
            NOTE(review): the section has already been appended to
            ``self.tree`` by then, so this string presumably no longer
            contains the chapter; confirm whether any caller uses it.

        Raises:
            Exception: If the section lookup result is falsy.
        """
        chapter_title = part["title"]
        chapter_id = part["id"]

        # NOTE(review): title and id are interpolated into the markup as-is;
        # a title containing "<" or "&" would be parsed as HTML.
        clean = BeautifulSoup(
            f"""
        <section id="section_{chapter_id}" class="chapitre">
            <h1 id="{chapter_id}" class="chapter-title">{chapter_title}</h1>
        </section>
        """,
            "html.parser",
        )  # html.parser doesn't create <html>/<body> tags automatically
        html = BeautifulSoup(content, "lxml")
        section = clean.find("section")
        if not section:
            raise Exception()
        for child in html.find_all("p"):
            # list(...) takes a snapshot, since children are decomposed below.
            for p_child in list(child.children):
                if not p_child:
                    continue
                if isinstance(p_child, bs4.element.Tag):
                    if p_child.name == "br":
                        p_child.decompose()
                    elif p_child.name == "img":
                        # Re-create the image with only its src, followed by a <br>.
                        src = p_child["src"]
                        img_tag = clean.new_tag("img")
                        img_tag["src"] = src
                        break_tag = clean.new_tag("br")
                        section.append(img_tag)
                        section.append(break_tag)
                    elif p_child.name == "b":
                        # Bold run: keep its text only, wrapped as <p><b>..</b></p>.
                        # (This rebinds the ``content`` parameter; the raw HTML
                        # has already been parsed into ``html`` above.)
                        content = p_child.text
                        p_tag = clean.new_tag("p")
                        bold_tag = clean.new_tag("b")
                        bold_content = clean.new_string(content)
                        bold_tag.append(bold_content)
                        p_tag.append(bold_tag)
                        section.append(p_tag)
                    elif p_child.name == "i":
                        # Italic run: keep its text only, wrapped as <p><i>..</i></p>.
                        content = p_child.text
                        p_tag = clean.new_tag("p")
                        italic_tag = clean.new_tag("i")
                        italic_content = clean.new_string(content)
                        italic_tag.append(italic_content)
                        p_tag.append(italic_tag)
                        section.append(p_tag)
                    # Any other tag is left out of the section.
                elif isinstance(p_child, bs4.element.NavigableString):
                    # Plain text node: becomes its own <p>.
                    content = p_child.text
                    p_tag = clean.new_tag("p")
                    p_content = clean.new_string(content)
                    p_tag.append(p_content)
                    section.append(p_tag)
            if not list(child.children):
                # Some <p> tags only contain <br>s; once those are removed the
                # paragraph is empty and is removed as well.
                child.decompose()

        # Attach the finished section to the <div id="book"> element of the tree.
        insert_point = cast(bs4.Tag, self.tree.find("div", {"id": "book"}))
        insert_point.append(section)

        return str(clean)
    def generate_toc(self):
        ids = [part["id"] for part in self.data["parts"]]
        clean = BeautifulSoup(
@@ -711,17 +701,21 @@ id="copyright-license-image">""".format(
        insert_point.append(clean)
        return str(clean)
-    async def add_chapters(self, contents: List[str], download_images: bool = False):
+    async def add_chapters(
        self, contents: List[bs4.Tag], download_images: bool = False
    ):
        """Add chapters to the PDF, downloading images if necessary. Also add Cover, Copyright, and About the Author pages."""
        # # Cover and Copyright Page
-        await self.genernate_cover_and_copyright_html()
+        await self.generate_cover_and_copyright_html()
        await self.generate_about_author_chapter()
        self.tree = BeautifulSoup(self.template)
        self.generate_toc()
        for part, content in zip(self.data["parts"], contents):
-            self.generate_clean_part_html(part, content)
+            insert_point = cast(bs4.Tag, self.tree.find("div", {"id": "book"}))
            insert_point.append(content)
            yield part["title"]
        # # About the Author page
@@ -786,10 +780,6 @@ id="copyright-license-image">""".format(
                    )
                )
        # Close files and delete them from tmp
        for chapter in chapters:
            chapter.file.close()
    def dump(self) -> BytesIO:
        self.file.seek(0)
        buffer = BytesIO(self.file.read())
@@ -4,6 +4,7 @@ from typing import Optional
 import asyncio
 from pathlib import Path
 from enum import Enum
 from zipfile import ZipFile
 from eliot import start_action
 from aiohttp import ClientResponseError
 from fastapi import FastAPI, Request
@@ -19,12 +20,12 @@ from create_book import (
    PDFGenerator,
    fetch_story,
    fetch_story_from_partId,
-    fetch_part_content,
+    fetch_story_content_zip,
    fetch_image,
    fetch_cookies,
    WattpadError,
    StoryNotFoundError,
-    clean_part_text,
+    generate_clean_part_html,
    slugify,
    logger,
 )
@@ -180,9 +181,13 @@ async def handle_download(
        logger.info(f"Retrieved story metadata and cover ({story_id=})")
        story_zip = await fetch_story_content_zip(story_id, cookies)
        archive = ZipFile(story_zip, "r")
        part_contents = [
-            f"<h1>{part['title']}</h1>"
+            generate_clean_part_html(
-            + (clean_part_text(await fetch_part_content(part["id"], cookies=cookies)))
+                part, archive.read(str(part["id"])).decode("utf-8")
            )
            for part in metadata["parts"]
        ]