fix(api): Clean code

This commit is contained in:
TheOnlyWayUp
2024-12-07 10:00:49 +00:00
parent c116300272
commit dd38369832
2 changed files with 45 additions and 39 deletions
+32 -23
View File
@@ -6,7 +6,6 @@ import json
import logging
import tempfile
import unicodedata
from io import BytesIO, StringIO
from os import environ
from enum import Enum
from base64 import b64encode
@@ -16,7 +15,6 @@ from eliot import to_file, start_action
from eliot.stdlib import EliotHandler
from dotenv import load_dotenv
from ebooklib import epub
from ebooklib.epub import EpubBook
from exiftool import ExifTool
from bs4 import BeautifulSoup
from pydantic import TypeAdapter, model_validator, field_validator
@@ -28,14 +26,19 @@ from aiohttp_client_cache import FileBackend, RedisBackend
load_dotenv(override=True)
handler = EliotHandler()
logging.getLogger("fastapi").setLevel(logging.INFO)
logging.getLogger("fastapi").addHandler(handler)
exiftool_logger = logging.getLogger("exiftool")
exiftool_logger.addHandler(handler)
logger = logging.Logger("wpd")
logger.addHandler(handler)
if environ.get("DEBUG"):
to_file(open("eliot.log", "wb"))
logger = logging.Logger("wpd")
logger.addHandler(handler)
# --- #
@@ -106,6 +109,18 @@ logger.info(f"Using {cache=}")
# --- Utilities --- #
def clean_part_text(text: str):
    """Strip redundant ``<br>`` tags from a part's HTML and return the result.

    The markup is parsed with BeautifulSoup (using whichever parser the
    library selects by default). Every ``<br>`` is then inspected in document
    order and removed when its next sibling is missing or falsy, is another
    ``<br>``, or reports a ``name`` of ``None`` (presumably a text node --
    confirm against the BeautifulSoup version in use).

    Returns the modified document serialised back to a string.
    """
    document = BeautifulSoup(text)
    for line_break in document.find_all("br"):
        follower = line_break.next_sibling
        # Keep the break only when a real, non-<br> element follows it.
        if follower and follower.name not in ("br", None):
            continue
        line_break.decompose()
    return str(document)
def slugify(value, allow_unicode=False) -> str:
"""
Taken from https://github.com/django/django/blob/master/django/utils/text.py
@@ -319,7 +334,7 @@ class EPUBGenerator:
self.data = data
self.cover = cover
# set metadata
# set metadata, defined in https://www.dublincore.org/specifications/dublin-core/dcmi-terms/#section-2
self.epub.add_author(data["user"]["username"])
self.epub.add_metadata("DC", "title", data["title"])
@@ -358,7 +373,7 @@ class EPUBGenerator:
# Thanks https://eu17.proxysite.com/process.php?d=5VyWYcoQl%2BVF0BYOuOavtvjOloFUZz2BJ%2Fepiusk6Nz7PV%2B9i8rs7cFviGftrBNll%2B0a3qO7UiDkTt4qwCa0fDES&b=1
chapter = epub.EpubHtml(
title=title,
file_name=f"{cidx}.xhtml", # Used to be clean_title.xhtml, but that broke Arabic support as slugify turns arabic strings into '', leading to multiple files with the same name, breaking those chapters.
file_name=f"{cidx}.xhtml", # See issue #30
lang=self.data["language"]["name"],
)
@@ -387,10 +402,9 @@ class EPUBGenerator:
)
chapter.set_content(content)
chapters.append(chapter)
yield title # Yield the chapter's title upon insertion, preceded by retrieval.
yield title
for chapter in chapters:
self.epub.add_item(chapter)
@@ -475,13 +489,16 @@ wp_copyright = {
class PDFGenerator:
"""PDF Generation utilities"""
def __init__(self, data: Story, cover: bytes):
self.data = data
self.file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=True)
self.cover = cover
# self.canvas = Canvas(self.file)
async def add_chapters(self, contents: List[str], download_images: bool = False):
"""Add chapters to the PDF"""
chapters: List[tempfile._TemporaryFileWrapper] = []
for part, content in zip(self.data["parts"], contents):
@@ -489,6 +506,7 @@ class PDFGenerator:
image_sources: List[str] = []
for image_container in html.find_all("p", {"data-media-type": "image"}):
# Find all images, download them if download_images, else clear them (else wkhtmltopdf _might_ fetch them)
img = image_container.findChild("img")
source = img.get("src")
if not download_images and source:
@@ -508,7 +526,7 @@ class PDFGenerator:
writable_html = writable_html.replace(
image_url,
f"data:image/jpg;base64,{b64encode(image).decode()}",
)
) # Base64-encoded images are preferable to referencing NamedTemporaryFiles because they require less access to the local filesystem; enable-local-file-access would be disabled entirely if it weren't needed for local fonts.
tempie = tempfile.NamedTemporaryFile(suffix=".html", delete=True)
tempie.write(writable_html.encode())
@@ -541,7 +559,9 @@ class PDFGenerator:
cover_first=True,
)
clean_description = self.data["description"].strip().replace("\n", "$/")
clean_description = (
self.data["description"].strip().replace("\n", "$/")
) # exiftool doesn't parse \n correctly; it supports $/ for the same purpose instead. `&#xa;` is another option.
metadata = {
"Author": self.data["user"]["username"],
"Title": self.data["title"],
@@ -556,6 +576,7 @@ class PDFGenerator:
} # As per https://exiftool.org/TagNames/PDF.html
with ExifTool(config_file="../exiftool.config", logger=logger) as et:
# Custom configuration adds Completed and MatureContent tags.
et.execute(
*(
[f"-{key}={value}" for key, value in metadata.items()]
@@ -573,15 +594,3 @@ class PDFGenerator:
self.file.seek(0)
return self
def clean_part_text(text: str):
    """Strip redundant ``<br>`` tags from a part's HTML and return the result.

    The markup is parsed with BeautifulSoup (using whichever parser the
    library selects by default). Every ``<br>`` is then inspected in document
    order and removed when its next sibling is missing or falsy, is another
    ``<br>``, or reports a ``name`` of ``None`` (presumably a text node --
    confirm against the BeautifulSoup version in use).

    Returns the modified document serialised back to a string.
    """
    document = BeautifulSoup(text)
    for line_break in document.find_all("br"):
        follower = line_break.next_sibling
        # Keep the break only when a real, non-<br> element follows it.
        if follower and follower.name not in ("br", None):
            continue
        line_break.decompose()
    return str(document)
+13 -16
View File
@@ -87,7 +87,7 @@ def home():
@app.exception_handler(ClientResponseError)
def download_error_handler(request: Request, exception: ClientResponseError):
def download_error_handler(exception: ClientResponseError):
match exception.status:
case 400 | 404:
return HTMLResponse(
@@ -109,7 +109,7 @@ def download_error_handler(request: Request, exception: ClientResponseError):
@app.exception_handler(WattpadError)
def download_wp_error_handler(request: Request, exception: WattpadError):
def download_wp_error_handler(exception: WattpadError):
if isinstance(exception, StoryNotFoundError):
return HTMLResponse(
status_code=404,
@@ -162,36 +162,33 @@ async def handle_download(
case DownloadMode.part:
story_id, metadata = await fetch_story_from_partId(download_id, cookies)
cover_data = await fetch_cover(metadata["cover"].replace("-256-", "-512-"))
match format:
case DownloadFormat.epub:
book = EPUBGenerator(metadata, cover_data)
media_type = "application/epub+zip"
case DownloadFormat.pdf:
book = PDFGenerator(metadata, cover_data)
media_type = "application/pdf"
logger.info(f"Retrieved story id ({story_id=})")
cover_data = await fetch_cover(metadata["cover"].replace("-256-", "-512-"))
part_contents = [
f"<h1>{part['title']}</h1>"
+ (clean_part_text(await fetch_part_content(part["id"], cookies=cookies)))
for part in metadata["parts"]
]
match format:
case DownloadFormat.epub:
book = EPUBGenerator(metadata, cover_data)
case DownloadFormat.pdf:
book = PDFGenerator(metadata, cover_data)
async for title in book.add_chapters(
part_contents, download_images=download_images
):
print(title)
...
book_file = book.dump().file
book_bytes = book_file.read()
book_file.close()
match format:
case DownloadFormat.epub:
media_type = "application/epub+zip"
case DownloadFormat.pdf:
media_type = "application/pdf"
return StreamingResponse(
BytesIO(book_bytes),
media_type=media_type,