feat(api): Generate PDFs with Weasyprint!

This commit is contained in:
TheOnlyWayUp
2024-12-10 18:37:22 +00:00
parent f8ab318210
commit 8dc7d16578
+20 -66
View File
@@ -10,8 +10,10 @@ from os import environ
from io import BytesIO from io import BytesIO
from enum import Enum from enum import Enum
from base64 import b64encode from base64 import b64encode
import bs4
import backoff import backoff
import pdfkit from weasyprint import HTML, CSS, default_url_fetcher
from weasyprint.text.fonts import FontConfiguration
from ebooklib import epub from ebooklib import epub
from exiftool import ExifTool from exiftool import ExifTool
from eliot import to_file, start_action from eliot import to_file, start_action
@@ -712,86 +714,38 @@ id="copyright-license-image">""".format(
async def add_chapters(self, contents: List[str], download_images: bool = False): async def add_chapters(self, contents: List[str], download_images: bool = False):
"""Add chapters to the PDF, downloading images if necessary. Also add Cover, Copyright, and About the Author pages.""" """Add chapters to the PDF, downloading images if necessary. Also add Cover, Copyright, and About the Author pages."""
chapters: List[tempfile._TemporaryFileWrapper] = [] # # Cover and Copyright Page
await self.genernate_cover_and_copyright_html()
await self.generate_about_author_chapter()
self.tree = BeautifulSoup(self.template)
self.generate_toc()
for part, content in zip(self.data["parts"], contents): for part, content in zip(self.data["parts"], contents):
html = BeautifulSoup(content, features="lxml") self.generate_clean_part_html(part, content)
image_sources: List[str] = []
for image_container in html.find_all("p", {"data-media-type": "image"}):
# Find all images, download them if download_images, else clear them (else wkhtmltopdf _might_ fetch them)
img = image_container.findChild("img")
if not img:
image_container.decompose() # If empty, delete parent (ex: <p data-image-layout="one-horizontal" data-media-type="image" data-p-id="bb6e18f2bb7d13f317bb6ccded04899b">        </p>)
continue
source = img.get("src")
if not download_images and source:
img["src"] = ""
image_container.replace_with(img)
image_sources.append(source)
writable_html = str(html)
if download_images:
async with CachedSession(cache=None) as session: # Don't cache images
for image_url in image_sources:
async with session.get(image_url) as response:
response.raise_for_status()
image = await response.read()
writable_html = writable_html.replace(
image_url,
f"data:image/jpg;base64,{b64encode(image).decode()}",
) # Base64-encoded images are better than referencing NamedTemporaryFiles as it's less access to the local filesystem, the enable-local-file-access would be disabled if not for local fonts.
tempie = tempfile.NamedTemporaryFile(
suffix=".html", delete=True
) # tempie 🫡
tempie.write(writable_html.encode())
tempie.file.seek(0)
chapters.append(tempie)
yield part["title"] yield part["title"]
# Cover and Copyright Page # # About the Author page
cover_and_copyright_file = await self.genernate_cover_and_copyright_file() # about_author_html = await self.generate_about_author_chapter()
# About the Author page # chapters.insert(0, cover_and_copyright_html)
about_author_file = await self.generate_about_author_file() # chapters.append(about_author_html)
chapters.append(about_author_file)
chapter_filenames = [chapter.file.name for chapter in chapters]
with start_action( with start_action(
action_type="generate_pdf", action_type="generate_pdf",
chapter_filenames=chapter_filenames,
output_filename=self.file.name, output_filename=self.file.name,
cover_filename=cover_and_copyright_file.file.name,
title=self.data["title"], title=self.data["title"],
): ):
# PDF Generation with wkhtmltopdf, written to self.file # PDF Generation with wkhtmltopdf, written to self.file
# At this stage, we have a bunch of HTML Files representing all the chapters that need to be generated. PDFKit handles ToC generation, so that's not included. # At this stage, we have a bunch of HTML Files representing all the chapters that need to be generated. PDFKit handles ToC generation, so that's not included.
pdfkit.from_file( font_config = FontConfiguration()
chapter_filenames,
self.file.name, stylesheet_obj = CSS(string=self.stylesheet, font_config=font_config)
cover=cover_and_copyright_file.file.name,
toc={ html_obj = HTML(string=str(self.tree))
"toc-header-text": "Table of Contents", html_obj.write_pdf(
"xsl-style-sheet": "./pdf/toc.xsl", self.file.name, stylesheets=[stylesheet_obj], font_config=font_config
},
options={
"footer-html": "./pdf/footer.html",
"margin-top": "10mm",
"margin-bottom": "10mm",
"title": self.data["title"],
"encoding": "UTF-8",
"user-style-sheet": "./pdf/stylesheet.css",
"enable-local-file-access": "",
},
cover_first=True,
) )
with start_action(action_type="add_metadata") as action: with start_action(action_type="add_metadata") as action: