From 8dc7d1657896bdef9c2e7743ed5e62c510e646ee Mon Sep 17 00:00:00 2001 From: TheOnlyWayUp Date: Tue, 10 Dec 2024 18:37:22 +0000 Subject: [PATCH] feat(api): Generate PDFs with Weasyprint! --- src/api/src/create_book.py | 86 +++++++++----------------------------- 1 file changed, 20 insertions(+), 66 deletions(-) diff --git a/src/api/src/create_book.py b/src/api/src/create_book.py index 8e638de..7df2f2f 100644 --- a/src/api/src/create_book.py +++ b/src/api/src/create_book.py @@ -10,8 +10,10 @@ from os import environ from io import BytesIO from enum import Enum from base64 import b64encode +import bs4 import backoff -import pdfkit +from weasyprint import HTML, CSS, default_url_fetcher +from weasyprint.text.fonts import FontConfiguration from ebooklib import epub from exiftool import ExifTool from eliot import to_file, start_action @@ -712,86 +714,38 @@ id="copyright-license-image">""".format( async def add_chapters(self, contents: List[str], download_images: bool = False): """Add chapters to the PDF, downloading images if necessary. Also add Cover, Copyright, and About the Author pages.""" - chapters: List[tempfile._TemporaryFileWrapper] = [] + # # Cover and Copyright Page + await self.genernate_cover_and_copyright_html() + await self.generate_about_author_chapter() + self.tree = BeautifulSoup(self.template) + self.generate_toc() for part, content in zip(self.data["parts"], contents): - html = BeautifulSoup(content, features="lxml") - image_sources: List[str] = [] - - for image_container in html.find_all("p", {"data-media-type": "image"}): - # Find all images, download them if download_images, else clear them (else wkhtmltopdf _might_ fetch them) - img = image_container.findChild("img") - if not img: - image_container.decompose() # If empty, delete parent (ex:

       

) - continue - source = img.get("src") - if not download_images and source: - img["src"] = "" - image_container.replace_with(img) - image_sources.append(source) - - writable_html = str(html) - if download_images: - async with CachedSession(cache=None) as session: # Don't cache images - for image_url in image_sources: - async with session.get(image_url) as response: - response.raise_for_status() - - image = await response.read() - - writable_html = writable_html.replace( - image_url, - f"data:image/jpg;base64,{b64encode(image).decode()}", - ) # Base64-encoded images are better than referencing NamedTemporaryFiles as it's less access to the local filesystem, the enable-local-file-access would be disabled if not for local fonts. - - tempie = tempfile.NamedTemporaryFile( - suffix=".html", delete=True - ) # tempie 🫡 - tempie.write(writable_html.encode()) - tempie.file.seek(0) - - chapters.append(tempie) - + self.generate_clean_part_html(part, content) yield part["title"] - # Cover and Copyright Page - cover_and_copyright_file = await self.genernate_cover_and_copyright_file() + # # About the Author page + # about_author_html = await self.generate_about_author_chapter() - # About the Author page - about_author_file = await self.generate_about_author_file() - chapters.append(about_author_file) - - chapter_filenames = [chapter.file.name for chapter in chapters] + # chapters.insert(0, cover_and_copyright_html) + # chapters.append(about_author_html) with start_action( action_type="generate_pdf", - chapter_filenames=chapter_filenames, output_filename=self.file.name, - cover_filename=cover_and_copyright_file.file.name, title=self.data["title"], ): # PDF Generation with wkhtmltopdf, written to self.file # At this stage, we have a bunch of HTML Files representing all the chapters that need to be generated. PDFKit handles ToC generation, so that's not included. - pdfkit.from_file( - chapter_filenames, - self.file.name, - cover=cover_and_copyright_file.file.name, - toc={ - "toc-header-text": "Table of Contents", - "xsl-style-sheet": "./pdf/toc.xsl", - }, - options={ - "footer-html": "./pdf/footer.html", - "margin-top": "10mm", - "margin-bottom": "10mm", - "title": self.data["title"], - "encoding": "UTF-8", - "user-style-sheet": "./pdf/stylesheet.css", - "enable-local-file-access": "", - }, - cover_first=True, + font_config = FontConfiguration() + + stylesheet_obj = CSS(string=self.stylesheet, font_config=font_config) + + html_obj = HTML(string=str(self.tree)) + html_obj.write_pdf( + self.file.name, stylesheets=[stylesheet_obj], font_config=font_config ) with start_action(action_type="add_metadata") as action: