fix(api): Clean code

This commit is contained in:
TheOnlyWayUp
2024-12-07 10:00:49 +00:00
parent c116300272
commit dd38369832
2 changed files with 45 additions and 39 deletions
+32 -23
View File
@@ -6,7 +6,6 @@ import json
import logging import logging
import tempfile import tempfile
import unicodedata import unicodedata
from io import BytesIO, StringIO
from os import environ from os import environ
from enum import Enum from enum import Enum
from base64 import b64encode from base64 import b64encode
@@ -16,7 +15,6 @@ from eliot import to_file, start_action
from eliot.stdlib import EliotHandler from eliot.stdlib import EliotHandler
from dotenv import load_dotenv from dotenv import load_dotenv
from ebooklib import epub from ebooklib import epub
from ebooklib.epub import EpubBook
from exiftool import ExifTool from exiftool import ExifTool
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from pydantic import TypeAdapter, model_validator, field_validator from pydantic import TypeAdapter, model_validator, field_validator
@@ -28,14 +26,19 @@ from aiohttp_client_cache import FileBackend, RedisBackend
load_dotenv(override=True) load_dotenv(override=True)
handler = EliotHandler() handler = EliotHandler()
logging.getLogger("fastapi").setLevel(logging.INFO) logging.getLogger("fastapi").setLevel(logging.INFO)
logging.getLogger("fastapi").addHandler(handler) logging.getLogger("fastapi").addHandler(handler)
exiftool_logger = logging.getLogger("exiftool")
exiftool_logger.addHandler(handler)
logger = logging.Logger("wpd")
logger.addHandler(handler)
if environ.get("DEBUG"): if environ.get("DEBUG"):
to_file(open("eliot.log", "wb")) to_file(open("eliot.log", "wb"))
logger = logging.Logger("wpd")
logger.addHandler(handler)
# --- # # --- #
@@ -106,6 +109,18 @@ logger.info(f"Using {cache=}")
# --- Utilities --- # # --- Utilities --- #
def clean_part_text(text: str):
"""Remove unnecessary newlines from Text"""
soup = BeautifulSoup(text)
for br in soup.find_all("br"):
# Check if no content after br
if not br.next_sibling or br.next_sibling.name in ["br", None]:
br.decompose()
return str(soup)
def slugify(value, allow_unicode=False) -> str: def slugify(value, allow_unicode=False) -> str:
""" """
Taken from https://github.com/django/django/blob/master/django/utils/text.py Taken from https://github.com/django/django/blob/master/django/utils/text.py
@@ -319,7 +334,7 @@ class EPUBGenerator:
self.data = data self.data = data
self.cover = cover self.cover = cover
# set metadata # set metadata, defined in https://www.dublincore.org/specifications/dublin-core/dcmi-terms/#section-2
self.epub.add_author(data["user"]["username"]) self.epub.add_author(data["user"]["username"])
self.epub.add_metadata("DC", "title", data["title"]) self.epub.add_metadata("DC", "title", data["title"])
@@ -358,7 +373,7 @@ class EPUBGenerator:
# Thanks https://eu17.proxysite.com/process.php?d=5VyWYcoQl%2BVF0BYOuOavtvjOloFUZz2BJ%2Fepiusk6Nz7PV%2B9i8rs7cFviGftrBNll%2B0a3qO7UiDkTt4qwCa0fDES&b=1 # Thanks https://eu17.proxysite.com/process.php?d=5VyWYcoQl%2BVF0BYOuOavtvjOloFUZz2BJ%2Fepiusk6Nz7PV%2B9i8rs7cFviGftrBNll%2B0a3qO7UiDkTt4qwCa0fDES&b=1
chapter = epub.EpubHtml( chapter = epub.EpubHtml(
title=title, title=title,
file_name=f"{cidx}.xhtml", # Used to be clean_title.xhtml, but that broke Arabic support as slugify turns arabic strings into '', leading to multiple files with the same name, breaking those chapters. file_name=f"{cidx}.xhtml", # See issue #30
lang=self.data["language"]["name"], lang=self.data["language"]["name"],
) )
@@ -387,10 +402,9 @@ class EPUBGenerator:
) )
chapter.set_content(content) chapter.set_content(content)
chapters.append(chapter) chapters.append(chapter)
yield title # Yield the chapter's title upon insertion preceeded by retrieval. yield title
for chapter in chapters: for chapter in chapters:
self.epub.add_item(chapter) self.epub.add_item(chapter)
@@ -475,13 +489,16 @@ wp_copyright = {
class PDFGenerator: class PDFGenerator:
"""PDF Generation utilities"""
def __init__(self, data: Story, cover: bytes): def __init__(self, data: Story, cover: bytes):
self.data = data self.data = data
self.file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) self.file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=True)
self.cover = cover self.cover = cover
# self.canvas = Canvas(self.file)
async def add_chapters(self, contents: List[str], download_images: bool = False): async def add_chapters(self, contents: List[str], download_images: bool = False):
"""Add chapters to the PDF"""
chapters: List[tempfile._TemporaryFileWrapper] = [] chapters: List[tempfile._TemporaryFileWrapper] = []
for part, content in zip(self.data["parts"], contents): for part, content in zip(self.data["parts"], contents):
@@ -489,6 +506,7 @@ class PDFGenerator:
image_sources: List[str] = [] image_sources: List[str] = []
for image_container in html.find_all("p", {"data-media-type": "image"}): for image_container in html.find_all("p", {"data-media-type": "image"}):
# Find all images, download them if download_images, else clear them (else wkhtmltopdf _might_ fetch them)
img = image_container.findChild("img") img = image_container.findChild("img")
source = img.get("src") source = img.get("src")
if not download_images and source: if not download_images and source:
@@ -508,7 +526,7 @@ class PDFGenerator:
writable_html = writable_html.replace( writable_html = writable_html.replace(
image_url, image_url,
f"data:image/jpg;base64,{b64encode(image).decode()}", f"data:image/jpg;base64,{b64encode(image).decode()}",
) ) # Base64-encoded images are better than referencing NamedTemporaryFiles as it's less access to the local filesystem, the enable-local-file-access would be disabled if not for local fonts.
tempie = tempfile.NamedTemporaryFile(suffix=".html", delete=True) tempie = tempfile.NamedTemporaryFile(suffix=".html", delete=True)
tempie.write(writable_html.encode()) tempie.write(writable_html.encode())
@@ -541,7 +559,9 @@ class PDFGenerator:
cover_first=True, cover_first=True,
) )
clean_description = self.data["description"].strip().replace("\n", "$/") clean_description = (
self.data["description"].strip().replace("\n", "$/")
) # exiftool doesn't parse \ns correctly, they support $/ for the same instead. `
` is another option.
metadata = { metadata = {
"Author": self.data["user"]["username"], "Author": self.data["user"]["username"],
"Title": self.data["title"], "Title": self.data["title"],
@@ -556,6 +576,7 @@ class PDFGenerator:
} # As per https://exiftool.org/TagNames/PDF.html } # As per https://exiftool.org/TagNames/PDF.html
with ExifTool(config_file="../exiftool.config", logger=logger) as et: with ExifTool(config_file="../exiftool.config", logger=logger) as et:
# Custom configuration adds Completed and MatureContent tags.
et.execute( et.execute(
*( *(
[f"-{key}={value}" for key, value in metadata.items()] [f"-{key}={value}" for key, value in metadata.items()]
@@ -573,15 +594,3 @@ class PDFGenerator:
self.file.seek(0) self.file.seek(0)
return self return self
def clean_part_text(text: str):
"""Remove unnecessary newlines from Text"""
soup = BeautifulSoup(text)
for br in soup.find_all("br"):
# Check if no content after br
if not br.next_sibling or br.next_sibling.name in ["br", None]:
br.decompose()
return str(soup)
+13 -16
View File
@@ -87,7 +87,7 @@ def home():
@app.exception_handler(ClientResponseError) @app.exception_handler(ClientResponseError)
def download_error_handler(request: Request, exception: ClientResponseError): def download_error_handler(exception: ClientResponseError):
match exception.status: match exception.status:
case 400 | 404: case 400 | 404:
return HTMLResponse( return HTMLResponse(
@@ -109,7 +109,7 @@ def download_error_handler(request: Request, exception: ClientResponseError):
@app.exception_handler(WattpadError) @app.exception_handler(WattpadError)
def download_wp_error_handler(request: Request, exception: WattpadError): def download_wp_error_handler(exception: WattpadError):
if isinstance(exception, StoryNotFoundError): if isinstance(exception, StoryNotFoundError):
return HTMLResponse( return HTMLResponse(
status_code=404, status_code=404,
@@ -162,36 +162,33 @@ async def handle_download(
case DownloadMode.part: case DownloadMode.part:
story_id, metadata = await fetch_story_from_partId(download_id, cookies) story_id, metadata = await fetch_story_from_partId(download_id, cookies)
cover_data = await fetch_cover(metadata["cover"].replace("-256-", "-512-"))
match format:
case DownloadFormat.epub:
book = EPUBGenerator(metadata, cover_data)
media_type = "application/epub+zip"
case DownloadFormat.pdf:
book = PDFGenerator(metadata, cover_data)
media_type = "application/pdf"
logger.info(f"Retrieved story id ({story_id=})") logger.info(f"Retrieved story id ({story_id=})")
cover_data = await fetch_cover(metadata["cover"].replace("-256-", "-512-"))
part_contents = [ part_contents = [
f"<h1>{part['title']}</h1>" f"<h1>{part['title']}</h1>"
+ (clean_part_text(await fetch_part_content(part["id"], cookies=cookies))) + (clean_part_text(await fetch_part_content(part["id"], cookies=cookies)))
for part in metadata["parts"] for part in metadata["parts"]
] ]
match format:
case DownloadFormat.epub:
book = EPUBGenerator(metadata, cover_data)
case DownloadFormat.pdf:
book = PDFGenerator(metadata, cover_data)
async for title in book.add_chapters( async for title in book.add_chapters(
part_contents, download_images=download_images part_contents, download_images=download_images
): ):
print(title) ...
book_file = book.dump().file book_file = book.dump().file
book_bytes = book_file.read() book_bytes = book_file.read()
book_file.close() book_file.close()
match format:
case DownloadFormat.epub:
media_type = "application/epub+zip"
case DownloadFormat.pdf:
media_type = "application/pdf"
return StreamingResponse( return StreamingResponse(
BytesIO(book_bytes), BytesIO(book_bytes),
media_type=media_type, media_type=media_type,