fix(api): Clean code
This commit is contained in:
+32
-23
@@ -6,7 +6,6 @@ import json
|
|||||||
import logging
|
import logging
|
||||||
import tempfile
|
import tempfile
|
||||||
import unicodedata
|
import unicodedata
|
||||||
from io import BytesIO, StringIO
|
|
||||||
from os import environ
|
from os import environ
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from base64 import b64encode
|
from base64 import b64encode
|
||||||
@@ -16,7 +15,6 @@ from eliot import to_file, start_action
|
|||||||
from eliot.stdlib import EliotHandler
|
from eliot.stdlib import EliotHandler
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from ebooklib import epub
|
from ebooklib import epub
|
||||||
from ebooklib.epub import EpubBook
|
|
||||||
from exiftool import ExifTool
|
from exiftool import ExifTool
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from pydantic import TypeAdapter, model_validator, field_validator
|
from pydantic import TypeAdapter, model_validator, field_validator
|
||||||
@@ -28,14 +26,19 @@ from aiohttp_client_cache import FileBackend, RedisBackend
|
|||||||
load_dotenv(override=True)
|
load_dotenv(override=True)
|
||||||
|
|
||||||
handler = EliotHandler()
|
handler = EliotHandler()
|
||||||
|
|
||||||
logging.getLogger("fastapi").setLevel(logging.INFO)
|
logging.getLogger("fastapi").setLevel(logging.INFO)
|
||||||
logging.getLogger("fastapi").addHandler(handler)
|
logging.getLogger("fastapi").addHandler(handler)
|
||||||
|
|
||||||
|
exiftool_logger = logging.getLogger("exiftool")
|
||||||
|
exiftool_logger.addHandler(handler)
|
||||||
|
|
||||||
|
logger = logging.Logger("wpd")
|
||||||
|
logger.addHandler(handler)
|
||||||
|
|
||||||
if environ.get("DEBUG"):
|
if environ.get("DEBUG"):
|
||||||
to_file(open("eliot.log", "wb"))
|
to_file(open("eliot.log", "wb"))
|
||||||
|
|
||||||
logger = logging.Logger("wpd")
|
|
||||||
logger.addHandler(handler)
|
|
||||||
|
|
||||||
# --- #
|
# --- #
|
||||||
|
|
||||||
@@ -106,6 +109,18 @@ logger.info(f"Using {cache=}")
|
|||||||
# --- Utilities --- #
|
# --- Utilities --- #
|
||||||
|
|
||||||
|
|
||||||
|
def clean_part_text(text: str):
|
||||||
|
"""Remove unnecessary newlines from Text"""
|
||||||
|
soup = BeautifulSoup(text)
|
||||||
|
|
||||||
|
for br in soup.find_all("br"):
|
||||||
|
# Check if no content after br
|
||||||
|
if not br.next_sibling or br.next_sibling.name in ["br", None]:
|
||||||
|
br.decompose()
|
||||||
|
|
||||||
|
return str(soup)
|
||||||
|
|
||||||
|
|
||||||
def slugify(value, allow_unicode=False) -> str:
|
def slugify(value, allow_unicode=False) -> str:
|
||||||
"""
|
"""
|
||||||
Taken from https://github.com/django/django/blob/master/django/utils/text.py
|
Taken from https://github.com/django/django/blob/master/django/utils/text.py
|
||||||
@@ -319,7 +334,7 @@ class EPUBGenerator:
|
|||||||
self.data = data
|
self.data = data
|
||||||
self.cover = cover
|
self.cover = cover
|
||||||
|
|
||||||
# set metadata
|
# set metadata, defined in https://www.dublincore.org/specifications/dublin-core/dcmi-terms/#section-2
|
||||||
self.epub.add_author(data["user"]["username"])
|
self.epub.add_author(data["user"]["username"])
|
||||||
|
|
||||||
self.epub.add_metadata("DC", "title", data["title"])
|
self.epub.add_metadata("DC", "title", data["title"])
|
||||||
@@ -358,7 +373,7 @@ class EPUBGenerator:
|
|||||||
# Thanks https://eu17.proxysite.com/process.php?d=5VyWYcoQl%2BVF0BYOuOavtvjOloFUZz2BJ%2Fepiusk6Nz7PV%2B9i8rs7cFviGftrBNll%2B0a3qO7UiDkTt4qwCa0fDES&b=1
|
# Thanks https://eu17.proxysite.com/process.php?d=5VyWYcoQl%2BVF0BYOuOavtvjOloFUZz2BJ%2Fepiusk6Nz7PV%2B9i8rs7cFviGftrBNll%2B0a3qO7UiDkTt4qwCa0fDES&b=1
|
||||||
chapter = epub.EpubHtml(
|
chapter = epub.EpubHtml(
|
||||||
title=title,
|
title=title,
|
||||||
file_name=f"{cidx}.xhtml", # Used to be clean_title.xhtml, but that broke Arabic support as slugify turns arabic strings into '', leading to multiple files with the same name, breaking those chapters.
|
file_name=f"{cidx}.xhtml", # See issue #30
|
||||||
lang=self.data["language"]["name"],
|
lang=self.data["language"]["name"],
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -387,10 +402,9 @@ class EPUBGenerator:
|
|||||||
)
|
)
|
||||||
|
|
||||||
chapter.set_content(content)
|
chapter.set_content(content)
|
||||||
|
|
||||||
chapters.append(chapter)
|
chapters.append(chapter)
|
||||||
|
|
||||||
yield title # Yield the chapter's title upon insertion preceeded by retrieval.
|
yield title
|
||||||
|
|
||||||
for chapter in chapters:
|
for chapter in chapters:
|
||||||
self.epub.add_item(chapter)
|
self.epub.add_item(chapter)
|
||||||
@@ -475,13 +489,16 @@ wp_copyright = {
|
|||||||
|
|
||||||
|
|
||||||
class PDFGenerator:
|
class PDFGenerator:
|
||||||
|
"""PDF Generation utilities"""
|
||||||
|
|
||||||
def __init__(self, data: Story, cover: bytes):
|
def __init__(self, data: Story, cover: bytes):
|
||||||
self.data = data
|
self.data = data
|
||||||
self.file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=True)
|
self.file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=True)
|
||||||
self.cover = cover
|
self.cover = cover
|
||||||
# self.canvas = Canvas(self.file)
|
|
||||||
|
|
||||||
async def add_chapters(self, contents: List[str], download_images: bool = False):
|
async def add_chapters(self, contents: List[str], download_images: bool = False):
|
||||||
|
"""Add chapters to the PDF"""
|
||||||
|
|
||||||
chapters: List[tempfile._TemporaryFileWrapper] = []
|
chapters: List[tempfile._TemporaryFileWrapper] = []
|
||||||
|
|
||||||
for part, content in zip(self.data["parts"], contents):
|
for part, content in zip(self.data["parts"], contents):
|
||||||
@@ -489,6 +506,7 @@ class PDFGenerator:
|
|||||||
image_sources: List[str] = []
|
image_sources: List[str] = []
|
||||||
|
|
||||||
for image_container in html.find_all("p", {"data-media-type": "image"}):
|
for image_container in html.find_all("p", {"data-media-type": "image"}):
|
||||||
|
# Find all images, download them if download_images, else clear them (else wkhtmltopdf _might_ fetch them)
|
||||||
img = image_container.findChild("img")
|
img = image_container.findChild("img")
|
||||||
source = img.get("src")
|
source = img.get("src")
|
||||||
if not download_images and source:
|
if not download_images and source:
|
||||||
@@ -508,7 +526,7 @@ class PDFGenerator:
|
|||||||
writable_html = writable_html.replace(
|
writable_html = writable_html.replace(
|
||||||
image_url,
|
image_url,
|
||||||
f"data:image/jpg;base64,{b64encode(image).decode()}",
|
f"data:image/jpg;base64,{b64encode(image).decode()}",
|
||||||
)
|
) # Base64-encoded images are better than referencing NamedTemporaryFiles as it's less access to the local filesystem, the enable-local-file-access would be disabled if not for local fonts.
|
||||||
|
|
||||||
tempie = tempfile.NamedTemporaryFile(suffix=".html", delete=True)
|
tempie = tempfile.NamedTemporaryFile(suffix=".html", delete=True)
|
||||||
tempie.write(writable_html.encode())
|
tempie.write(writable_html.encode())
|
||||||
@@ -541,7 +559,9 @@ class PDFGenerator:
|
|||||||
cover_first=True,
|
cover_first=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
clean_description = self.data["description"].strip().replace("\n", "$/")
|
clean_description = (
|
||||||
|
self.data["description"].strip().replace("\n", "$/")
|
||||||
|
) # exiftool doesn't parse \ns correctly, they support $/ for the same instead. `&#xa;` is another option.
|
||||||
metadata = {
|
metadata = {
|
||||||
"Author": self.data["user"]["username"],
|
"Author": self.data["user"]["username"],
|
||||||
"Title": self.data["title"],
|
"Title": self.data["title"],
|
||||||
@@ -556,6 +576,7 @@ class PDFGenerator:
|
|||||||
} # As per https://exiftool.org/TagNames/PDF.html
|
} # As per https://exiftool.org/TagNames/PDF.html
|
||||||
|
|
||||||
with ExifTool(config_file="../exiftool.config", logger=logger) as et:
|
with ExifTool(config_file="../exiftool.config", logger=logger) as et:
|
||||||
|
# Custom configuration adds Completed and MatureContent tags.
|
||||||
et.execute(
|
et.execute(
|
||||||
*(
|
*(
|
||||||
[f"-{key}={value}" for key, value in metadata.items()]
|
[f"-{key}={value}" for key, value in metadata.items()]
|
||||||
@@ -573,15 +594,3 @@ class PDFGenerator:
|
|||||||
self.file.seek(0)
|
self.file.seek(0)
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
def clean_part_text(text: str):
|
|
||||||
"""Remove unnecessary newlines from Text"""
|
|
||||||
soup = BeautifulSoup(text)
|
|
||||||
|
|
||||||
for br in soup.find_all("br"):
|
|
||||||
# Check if no content after br
|
|
||||||
if not br.next_sibling or br.next_sibling.name in ["br", None]:
|
|
||||||
br.decompose()
|
|
||||||
|
|
||||||
return str(soup)
|
|
||||||
|
|||||||
+13
-16
@@ -87,7 +87,7 @@ def home():
|
|||||||
|
|
||||||
|
|
||||||
@app.exception_handler(ClientResponseError)
|
@app.exception_handler(ClientResponseError)
|
||||||
def download_error_handler(request: Request, exception: ClientResponseError):
|
def download_error_handler(exception: ClientResponseError):
|
||||||
match exception.status:
|
match exception.status:
|
||||||
case 400 | 404:
|
case 400 | 404:
|
||||||
return HTMLResponse(
|
return HTMLResponse(
|
||||||
@@ -109,7 +109,7 @@ def download_error_handler(request: Request, exception: ClientResponseError):
|
|||||||
|
|
||||||
|
|
||||||
@app.exception_handler(WattpadError)
|
@app.exception_handler(WattpadError)
|
||||||
def download_wp_error_handler(request: Request, exception: WattpadError):
|
def download_wp_error_handler(exception: WattpadError):
|
||||||
if isinstance(exception, StoryNotFoundError):
|
if isinstance(exception, StoryNotFoundError):
|
||||||
return HTMLResponse(
|
return HTMLResponse(
|
||||||
status_code=404,
|
status_code=404,
|
||||||
@@ -162,36 +162,33 @@ async def handle_download(
|
|||||||
case DownloadMode.part:
|
case DownloadMode.part:
|
||||||
story_id, metadata = await fetch_story_from_partId(download_id, cookies)
|
story_id, metadata = await fetch_story_from_partId(download_id, cookies)
|
||||||
|
|
||||||
|
cover_data = await fetch_cover(metadata["cover"].replace("-256-", "-512-"))
|
||||||
|
|
||||||
|
match format:
|
||||||
|
case DownloadFormat.epub:
|
||||||
|
book = EPUBGenerator(metadata, cover_data)
|
||||||
|
media_type = "application/epub+zip"
|
||||||
|
case DownloadFormat.pdf:
|
||||||
|
book = PDFGenerator(metadata, cover_data)
|
||||||
|
media_type = "application/pdf"
|
||||||
|
|
||||||
logger.info(f"Retrieved story id ({story_id=})")
|
logger.info(f"Retrieved story id ({story_id=})")
|
||||||
|
|
||||||
cover_data = await fetch_cover(metadata["cover"].replace("-256-", "-512-"))
|
|
||||||
part_contents = [
|
part_contents = [
|
||||||
f"<h1>{part['title']}</h1>"
|
f"<h1>{part['title']}</h1>"
|
||||||
+ (clean_part_text(await fetch_part_content(part["id"], cookies=cookies)))
|
+ (clean_part_text(await fetch_part_content(part["id"], cookies=cookies)))
|
||||||
for part in metadata["parts"]
|
for part in metadata["parts"]
|
||||||
]
|
]
|
||||||
|
|
||||||
match format:
|
|
||||||
case DownloadFormat.epub:
|
|
||||||
book = EPUBGenerator(metadata, cover_data)
|
|
||||||
case DownloadFormat.pdf:
|
|
||||||
book = PDFGenerator(metadata, cover_data)
|
|
||||||
|
|
||||||
async for title in book.add_chapters(
|
async for title in book.add_chapters(
|
||||||
part_contents, download_images=download_images
|
part_contents, download_images=download_images
|
||||||
):
|
):
|
||||||
print(title)
|
...
|
||||||
|
|
||||||
book_file = book.dump().file
|
book_file = book.dump().file
|
||||||
book_bytes = book_file.read()
|
book_bytes = book_file.read()
|
||||||
book_file.close()
|
book_file.close()
|
||||||
|
|
||||||
match format:
|
|
||||||
case DownloadFormat.epub:
|
|
||||||
media_type = "application/epub+zip"
|
|
||||||
case DownloadFormat.pdf:
|
|
||||||
media_type = "application/pdf"
|
|
||||||
|
|
||||||
return StreamingResponse(
|
return StreamingResponse(
|
||||||
BytesIO(book_bytes),
|
BytesIO(book_bytes),
|
||||||
media_type=media_type,
|
media_type=media_type,
|
||||||
|
|||||||
Reference in New Issue
Block a user