diff --git a/src/api/src/create_book.py b/src/api/src/create_book.py index 3c827ae..003b61f 100644 --- a/src/api/src/create_book.py +++ b/src/api/src/create_book.py @@ -6,7 +6,6 @@ import json import logging import tempfile import unicodedata -from io import BytesIO, StringIO from os import environ from enum import Enum from base64 import b64encode @@ -16,7 +15,6 @@ from eliot import to_file, start_action from eliot.stdlib import EliotHandler from dotenv import load_dotenv from ebooklib import epub -from ebooklib.epub import EpubBook from exiftool import ExifTool from bs4 import BeautifulSoup from pydantic import TypeAdapter, model_validator, field_validator @@ -28,14 +26,19 @@ from aiohttp_client_cache import FileBackend, RedisBackend load_dotenv(override=True) handler = EliotHandler() + logging.getLogger("fastapi").setLevel(logging.INFO) logging.getLogger("fastapi").addHandler(handler) +exiftool_logger = logging.getLogger("exiftool") +exiftool_logger.addHandler(handler) + +logger = logging.Logger("wpd") +logger.addHandler(handler) + if environ.get("DEBUG"): to_file(open("eliot.log", "wb")) -logger = logging.Logger("wpd") -logger.addHandler(handler) # --- # @@ -106,6 +109,18 @@ logger.info(f"Using {cache=}") # --- Utilities --- # +def clean_part_text(text: str): + """Remove unnecessary newlines from Text""" + soup = BeautifulSoup(text) + + for br in soup.find_all("br"): + # Check if no content after br + if not br.next_sibling or br.next_sibling.name in ["br", None]: + br.decompose() + + return str(soup) + + def slugify(value, allow_unicode=False) -> str: """ Taken from https://github.com/django/django/blob/master/django/utils/text.py @@ -319,7 +334,7 @@ class EPUBGenerator: self.data = data self.cover = cover - # set metadata + # set metadata, defined in https://www.dublincore.org/specifications/dublin-core/dcmi-terms/#section-2 self.epub.add_author(data["user"]["username"]) self.epub.add_metadata("DC", "title", data["title"]) @@ -358,7 +373,7 @@ class EPUBGenerator: # Thanks https://eu17.proxysite.com/process.php?d=5VyWYcoQl%2BVF0BYOuOavtvjOloFUZz2BJ%2Fepiusk6Nz7PV%2B9i8rs7cFviGftrBNll%2B0a3qO7UiDkTt4qwCa0fDES&b=1 chapter = epub.EpubHtml( title=title, - file_name=f"{cidx}.xhtml", # Used to be clean_title.xhtml, but that broke Arabic support as slugify turns arabic strings into '', leading to multiple files with the same name, breaking those chapters. + file_name=f"{cidx}.xhtml", # See issue #30 lang=self.data["language"]["name"], ) @@ -387,10 +402,9 @@ class EPUBGenerator: ) chapter.set_content(content) - chapters.append(chapter) - yield title # Yield the chapter's title upon insertion preceeded by retrieval. + yield title for chapter in chapters: self.epub.add_item(chapter) @@ -475,13 +489,16 @@ wp_copyright = { class PDFGenerator: + """PDF Generation utilities""" + def __init__(self, data: Story, cover: bytes): self.data = data self.file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) self.cover = cover - # self.canvas = Canvas(self.file) async def add_chapters(self, contents: List[str], download_images: bool = False): + """Add chapters to the PDF""" + chapters: List[tempfile._TemporaryFileWrapper] = [] for part, content in zip(self.data["parts"], contents): @@ -489,6 +506,7 @@ class PDFGenerator: image_sources: List[str] = [] for image_container in html.find_all("p", {"data-media-type": "image"}): + # Find all images, download them if download_images, else clear them (else wkhtmltopdf _might_ fetch them) img = image_container.findChild("img") source = img.get("src") if not download_images and source: @@ -508,7 +526,7 @@ class PDFGenerator: writable_html = writable_html.replace( image_url, f"data:image/jpg;base64,{b64encode(image).decode()}", - ) + ) # Base64-encoded images are better than referencing NamedTemporaryFiles as it's less access to the local filesystem, the enable-local-file-access would be disabled if not for local fonts. tempie = tempfile.NamedTemporaryFile(suffix=".html", delete=True) tempie.write(writable_html.encode()) @@ -541,7 +559,9 @@ class PDFGenerator: cover_first=True, ) - clean_description = self.data["description"].strip().replace("\n", "$/") + clean_description = ( + self.data["description"].strip().replace("\n", "$/") + ) # exiftool doesn't parse \ns correctly, they support $/ for the same instead. ` ` is another option. metadata = { "Author": self.data["user"]["username"], "Title": self.data["title"], @@ -556,6 +576,7 @@ class PDFGenerator: } # As per https://exiftool.org/TagNames/PDF.html with ExifTool(config_file="../exiftool.config", logger=logger) as et: + # Custom configuration adds Completed and MatureContent tags. et.execute( *( [f"-{key}={value}" for key, value in metadata.items()] @@ -573,15 +594,3 @@ class PDFGenerator: self.file.seek(0) return self - - -def clean_part_text(text: str): - """Remove unnecessary newlines from Text""" - soup = BeautifulSoup(text) - - for br in soup.find_all("br"): - # Check if no content after br - if not br.next_sibling or br.next_sibling.name in ["br", None]: - br.decompose() - - return str(soup) diff --git a/src/api/src/main.py b/src/api/src/main.py index cc48579..3cedc8e 100644 --- a/src/api/src/main.py +++ b/src/api/src/main.py @@ -87,7 +87,7 @@ def home(): @app.exception_handler(ClientResponseError) -def download_error_handler(request: Request, exception: ClientResponseError): +def download_error_handler(exception: ClientResponseError): match exception.status: case 400 | 404: return HTMLResponse( @@ -109,7 +109,7 @@ def download_error_handler(request: Request, exception: ClientResponseError): @app.exception_handler(WattpadError) -def download_wp_error_handler(request: Request, exception: WattpadError): +def download_wp_error_handler(exception: WattpadError): if isinstance(exception, StoryNotFoundError): return HTMLResponse( status_code=404, @@ -162,36 +162,33 @@ async def handle_download( case DownloadMode.part: story_id, metadata = await fetch_story_from_partId(download_id, cookies) + cover_data = await fetch_cover(metadata["cover"].replace("-256-", "-512-")) + + match format: + case DownloadFormat.epub: + book = EPUBGenerator(metadata, cover_data) + media_type = "application/epub+zip" + case DownloadFormat.pdf: + book = PDFGenerator(metadata, cover_data) + media_type = "application/pdf" + logger.info(f"Retrieved story id ({story_id=})") - cover_data = await fetch_cover(metadata["cover"].replace("-256-", "-512-")) part_contents = [ f"

{part['title']}

" + (clean_part_text(await fetch_part_content(part["id"], cookies=cookies))) for part in metadata["parts"] ] - match format: - case DownloadFormat.epub: - book = EPUBGenerator(metadata, cover_data) - case DownloadFormat.pdf: - book = PDFGenerator(metadata, cover_data) - async for title in book.add_chapters( part_contents, download_images=download_images ): - print(title) + ... book_file = book.dump().file book_bytes = book_file.read() book_file.close() - match format: - case DownloadFormat.epub: - media_type = "application/epub+zip" - case DownloadFormat.pdf: - media_type = "application/pdf" - return StreamingResponse( BytesIO(book_bytes), media_type=media_type,