fix(api): Clean code

2024-12-07 10:00:49 +00:00
parent c116300272
commit dd38369832
2 changed files with 45 additions and 39 deletions
@@ -6,7 +6,6 @@ import json
 import logging
 import tempfile
 import unicodedata
-from io import BytesIO, StringIO
 from os import environ
 from enum import Enum
 from base64 import b64encode
@@ -16,7 +15,6 @@ from eliot import to_file, start_action
 from eliot.stdlib import EliotHandler
 from dotenv import load_dotenv
 from ebooklib import epub
-from ebooklib.epub import EpubBook
 from exiftool import ExifTool
 from bs4 import BeautifulSoup
 from pydantic import TypeAdapter, model_validator, field_validator
@@ -28,14 +26,19 @@ from aiohttp_client_cache import FileBackend, RedisBackend
 load_dotenv(override=True)

 handler = EliotHandler()
+
 logging.getLogger("fastapi").setLevel(logging.INFO)
 logging.getLogger("fastapi").addHandler(handler)

+exiftool_logger = logging.getLogger("exiftool")
+exiftool_logger.addHandler(handler)
+
+logger = logging.Logger("wpd")
+logger.addHandler(handler)
+
 if environ.get("DEBUG"):
    to_file(open("eliot.log", "wb"))

-logger = logging.Logger("wpd")
-logger.addHandler(handler)

 # --- #

@@ -106,6 +109,18 @@ logger.info(f"Using {cache=}")
 # --- Utilities --- #


+def clean_part_text(text: str):
+    """Remove unnecessary newlines from Text"""
+    soup = BeautifulSoup(text)
+
+    for br in soup.find_all("br"):
+        # Check if no content after br
+        if not br.next_sibling or br.next_sibling.name in ["br", None]:
+            br.decompose()
+
+    return str(soup)
+
+
 def slugify(value, allow_unicode=False) -> str:
    """
    Taken from https://github.com/django/django/blob/master/django/utils/text.py
@@ -319,7 +334,7 @@ class EPUBGenerator:
        self.data = data
        self.cover = cover

-        # set metadata
+        # Set metadata; the terms are defined in https://www.dublincore.org/specifications/dublin-core/dcmi-terms/#section-2
        self.epub.add_author(data["user"]["username"])

        self.epub.add_metadata("DC", "title", data["title"])
@@ -358,7 +373,7 @@ class EPUBGenerator:
            # Thanks https://eu17.proxysite.com/process.php?d=5VyWYcoQl%2BVF0BYOuOavtvjOloFUZz2BJ%2Fepiusk6Nz7PV%2B9i8rs7cFviGftrBNll%2B0a3qO7UiDkTt4qwCa0fDES&b=1
            chapter = epub.EpubHtml(
                title=title,
-                file_name=f"{cidx}.xhtml",  # Used to be clean_title.xhtml, but that broke Arabic support as slugify turns arabic strings into '', leading to multiple files with the same name, breaking those chapters.
+                file_name=f"{cidx}.xhtml",  # See issue #30
                lang=self.data["language"]["name"],
            )

@@ -387,10 +402,9 @@ class EPUBGenerator:
                            )

            chapter.set_content(content)
-
            chapters.append(chapter)

-            yield title  # Yield the chapter's title upon insertion preceeded by retrieval.
+            yield title

        for chapter in chapters:
            self.epub.add_item(chapter)
@@ -475,13 +489,16 @@ wp_copyright = {


 class PDFGenerator:
+    """PDF Generation utilities"""
+
    def __init__(self, data: Story, cover: bytes):
        self.data = data
        self.file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=True)
        self.cover = cover
-        # self.canvas = Canvas(self.file)

    async def add_chapters(self, contents: List[str], download_images: bool = False):
+        """Add chapters to the PDF"""
+
        chapters: List[tempfile._TemporaryFileWrapper] = []

        for part, content in zip(self.data["parts"], contents):
@@ -489,6 +506,7 @@ class PDFGenerator:
            image_sources: List[str] = []

            for image_container in html.find_all("p", {"data-media-type": "image"}):
+                # Find all images: download them if download_images is set; otherwise clear them, because wkhtmltopdf _might_ fetch them itself.
                img = image_container.findChild("img")
                source = img.get("src")
                if not download_images and source:
@@ -508,7 +526,7 @@ class PDFGenerator:
                            writable_html = writable_html.replace(
                                image_url,
                                f"data:image/jpg;base64,{b64encode(image).decode()}",
-                            )
+                            )  # Base64-encoded images are preferred over referencing NamedTemporaryFiles because they need less access to the local filesystem; enable-local-file-access would be disabled entirely if it were not required for local fonts.

            tempie = tempfile.NamedTemporaryFile(suffix=".html", delete=True)
            tempie.write(writable_html.encode())
@@ -541,7 +559,9 @@ class PDFGenerator:
            cover_first=True,
        )

-        clean_description = self.data["description"].strip().replace("\n", "$/")
+        clean_description = (
+            self.data["description"].strip().replace("\n", "$/")
+        )  # exiftool doesn't parse "\n" correctly; it supports $/ for the same purpose instead. `&#xa;` is another option.
        metadata = {
            "Author": self.data["user"]["username"],
            "Title": self.data["title"],
@@ -556,6 +576,7 @@ class PDFGenerator:
        }  # As per https://exiftool.org/TagNames/PDF.html

        with ExifTool(config_file="../exiftool.config", logger=logger) as et:
+            # Custom configuration adds Completed and MatureContent tags.
            et.execute(
                *(
                    [f"-{key}={value}" for key, value in metadata.items()]
@@ -573,15 +594,3 @@ class PDFGenerator:
        self.file.seek(0)

        return self
-
-
-def clean_part_text(text: str):
-    """Remove unnecessary newlines from Text"""
-    soup = BeautifulSoup(text)
-
-    for br in soup.find_all("br"):
-        # Check if no content after br
-        if not br.next_sibling or br.next_sibling.name in ["br", None]:
-            br.decompose()
-
-    return str(soup)
@@ -87,7 +87,7 @@ def home():


@app.exception_handler(ClientResponseError)
-def download_error_handler(request: Request, exception: ClientResponseError):
+def download_error_handler(exception: ClientResponseError):
    match exception.status:
        case 400 | 404:
            return HTMLResponse(
@@ -109,7 +109,7 @@ def download_error_handler(request: Request, exception: ClientResponseError):


@app.exception_handler(WattpadError)
-def download_wp_error_handler(request: Request, exception: WattpadError):
+def download_wp_error_handler(exception: WattpadError):
    if isinstance(exception, StoryNotFoundError):
        return HTMLResponse(
            status_code=404,
@@ -162,36 +162,33 @@ async def handle_download(
            case DownloadMode.part:
                story_id, metadata = await fetch_story_from_partId(download_id, cookies)

+        cover_data = await fetch_cover(metadata["cover"].replace("-256-", "-512-"))
+
+        match format:
+            case DownloadFormat.epub:
+                book = EPUBGenerator(metadata, cover_data)
+                media_type = "application/epub+zip"
+            case DownloadFormat.pdf:
+                book = PDFGenerator(metadata, cover_data)
+                media_type = "application/pdf"
+
        logger.info(f"Retrieved story id ({story_id=})")

-        cover_data = await fetch_cover(metadata["cover"].replace("-256-", "-512-"))
        part_contents = [
            f"<h1>{part['title']}</h1>"
            + (clean_part_text(await fetch_part_content(part["id"], cookies=cookies)))
            for part in metadata["parts"]
        ]

-        match format:
-            case DownloadFormat.epub:
-                book = EPUBGenerator(metadata, cover_data)
-            case DownloadFormat.pdf:
-                book = PDFGenerator(metadata, cover_data)
-
        async for title in book.add_chapters(
            part_contents, download_images=download_images
        ):
-            print(title)
+            ...

        book_file = book.dump().file
        book_bytes = book_file.read()
        book_file.close()

-        match format:
-            case DownloadFormat.epub:
-                media_type = "application/epub+zip"
-            case DownloadFormat.pdf:
-                media_type = "application/pdf"
-
        return StreamingResponse(
            BytesIO(book_bytes),
            media_type=media_type,