feat(api): Remove dependency on exiftool (#82)
This commit is contained in:
@@ -0,0 +1,10 @@
|
||||
__pycache__
|
||||
*ipynb
|
||||
build
|
||||
.idea
|
||||
.vscode
|
||||
.venv
|
||||
.env
|
||||
*log
|
||||
*.md
|
||||
uv.lock
|
||||
+9
-17
@@ -13,23 +13,17 @@ FROM python:3.13-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install apt-fast, git, exiftool
|
||||
|
||||
COPY --from=nobodyxu/apt-fast:latest-debian-buster-slim /usr/local/ /usr/local/
|
||||
|
||||
RUN apt update
|
||||
RUN apt install -y aria2
|
||||
RUN apt-fast install -y git build-essential libpango-1.0-0 libpangoft2-1.0-0 wget
|
||||
RUN apt-fast install -y git build-essential python3.13-dev libgobject-2.0 libpango-1.0 libpangoft2-1.0
|
||||
# aiohttp-client-cache depends on multipart, which requires python3.13-dev to build successfully on 3.13
|
||||
# weasyprint depends on libgobject, libpango, and libpangoft2
|
||||
|
||||
ENV EXIFTOOL_VERSION="13.06"
|
||||
RUN wget "https://exiftool.org/Image-ExifTool-${EXIFTOOL_VERSION}.tar.gz"
|
||||
RUN gzip -dc "Image-ExifTool-${EXIFTOOL_VERSION}.tar.gz" | tar -xf -
|
||||
WORKDIR /app/Image-ExifTool-${EXIFTOOL_VERSION}
|
||||
RUN perl Makefile.PL
|
||||
RUN make test
|
||||
RUN make install
|
||||
RUN rm -rf /var/lib/apt/lists/*
|
||||
# https://github.com/TheOnlyWayUp/WattpadDownloader/pull/82#discussion_r2470358950
|
||||
|
||||
RUN rm -rf /var/lib/apt/lists/* /app/Image-ExifTool-${EXIFTOOL_VERSION}
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -37,17 +31,15 @@ WORKDIR /app
|
||||
|
||||
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
|
||||
|
||||
COPY src/api/requirements.txt requirements.txt
|
||||
COPY src/api/src/create_book/generators/pdf/exiftool.config exiftool.config
|
||||
RUN uv pip install -r requirements.txt --system
|
||||
COPY src/api/pyproject.toml /app
|
||||
RUN uv sync
|
||||
COPY src/api/ /app
|
||||
COPY --from=0 /build/build /app/src/build
|
||||
COPY src/api/src src
|
||||
|
||||
# Is this still needed?
|
||||
RUN ln -s /app/src/pdf/fonts /tmp/fonts
|
||||
|
||||
WORKDIR /app/src
|
||||
|
||||
EXPOSE 80
|
||||
|
||||
CMD [ "python3", "main.py"]
|
||||
CMD [ "uv", "run", "main.py"]
|
||||
|
||||
@@ -17,7 +17,6 @@ dependencies = [
|
||||
"aiohttp-client-cache[all]",
|
||||
"bs4>=0.0.2",
|
||||
"uvicorn>=0.32.1",
|
||||
"pyexiftool>=0.5.6",
|
||||
"weasyprint>=63.0",
|
||||
"jinja2>=3.1.6",
|
||||
]
|
||||
@@ -31,5 +30,6 @@ aiohttp-client-cache = { git = "https://github.com/TheOnlyWayUp/aiohttp-client-c
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"ipykernel>=6.29.5",
|
||||
"ipynb>=0.5.1",
|
||||
"ruff>=0.11.12",
|
||||
]
|
||||
|
||||
@@ -1,76 +0,0 @@
|
||||
aioboto3==13.2.0
|
||||
aiobotocore==2.15.2
|
||||
aiofiles==24.1.0
|
||||
aiohappyeyeballs==2.4.4
|
||||
aiohttp==3.11.9
|
||||
aiohttp-client-cache @ git+https://github.com/TheOnlyWayUp/aiohttp-client-cache.git@1f94f1d751e7320c0ea981d532ff02924782dae6
|
||||
aioitertools==0.12.0
|
||||
aiosignal==1.3.1
|
||||
aiosqlite==0.20.0
|
||||
annotated-types==0.7.0
|
||||
anyio==4.6.2.post1
|
||||
async-timeout==4.0.3
|
||||
attrs==23.1.0
|
||||
backoff==2.2.1
|
||||
beautifulsoup4==4.12.3
|
||||
boltons==24.1.0
|
||||
boto3==1.35.36
|
||||
botocore==1.35.36
|
||||
brotli==1.1.0
|
||||
bs4==0.0.2
|
||||
cffi==1.17.1
|
||||
click==8.1.7
|
||||
cssselect2==0.7.0
|
||||
dnspython==2.7.0
|
||||
ebooklib==0.18
|
||||
eliot==1.16.0
|
||||
exceptiongroup==1.2.2
|
||||
fastapi==0.115.5
|
||||
fonttools==4.55.2
|
||||
frozenlist==1.4.1
|
||||
h11==0.14.0
|
||||
idna==3.6
|
||||
itsdangerous==2.2.0
|
||||
jinja2==3.1.6
|
||||
jmespath==1.0.1
|
||||
lxml==5.3.0
|
||||
markdown-it-py==3.0.0
|
||||
mdurl==0.1.2
|
||||
motor==3.6.0
|
||||
multidict==6.0.4
|
||||
orjson==3.10.12
|
||||
pillow==10.4.0
|
||||
propcache==0.2.1
|
||||
pycparser==2.22
|
||||
pydantic==2.10.2
|
||||
pydantic-core==2.27.1
|
||||
pydantic-settings==2.6.1
|
||||
pydyf==0.11.0
|
||||
pyexiftool==0.5.6
|
||||
pygments==2.18.0
|
||||
pymongo==4.9.2
|
||||
pyphen==0.15.0
|
||||
pyrsistent==0.20.0
|
||||
python-dateutil==2.9.0.post0
|
||||
python-dotenv==1.0.1
|
||||
redis==5.2.0
|
||||
rich==13.9.4
|
||||
s3transfer==0.10.4
|
||||
setuptools==75.6.0
|
||||
six==1.16.0
|
||||
sniffio==1.3.1
|
||||
soupsieve==2.6
|
||||
starlette==0.41.3
|
||||
tinycss2==1.4.0
|
||||
tinyhtml5==2.0.0
|
||||
type-extensions==0.1.2
|
||||
typing-extensions==4.12.2
|
||||
url-normalize==1.4.3
|
||||
urllib3==2.2.3
|
||||
uvicorn==0.32.1
|
||||
weasyprint==63.0
|
||||
webencodings==0.5.1
|
||||
wrapt==1.17.0
|
||||
yarl==1.18.3
|
||||
zope-interface==7.2
|
||||
zopfli==0.2.3.post1
|
||||
@@ -3,10 +3,10 @@ from io import BytesIO
|
||||
from pathlib import Path
|
||||
from tempfile import NamedTemporaryFile, _TemporaryFileWrapper
|
||||
|
||||
import pydyf
|
||||
from bs4 import BeautifulSoup
|
||||
from exiftool import ExifTool
|
||||
from jinja2 import Template
|
||||
from weasyprint import CSS, HTML
|
||||
from weasyprint import CSS, HTML, Document
|
||||
from weasyprint.text.fonts import FontConfiguration
|
||||
|
||||
from ..models import Story
|
||||
@@ -97,7 +97,7 @@ class PDFGenerator(AbstractGenerator):
|
||||
self.images = images
|
||||
self.author = author_image
|
||||
|
||||
self.book: _TemporaryFileWrapper = NamedTemporaryFile(suffix=".pdf")
|
||||
self.book: _TemporaryFileWrapper = NamedTemporaryFile(suffix=".pdf") # type: ignore
|
||||
self.content = TEMPLATE
|
||||
|
||||
def generate_chapters(self) -> dict[int, str]:
|
||||
@@ -134,6 +134,12 @@ class PDFGenerator(AbstractGenerator):
|
||||
"book_title": self.story["title"],
|
||||
"cover": f"data:image/jpg;base64,{b64encode(self.cover).decode()}",
|
||||
"username": self.story["user"]["username"],
|
||||
"author_bio": self.story["user"]["description"],
|
||||
"clean_tags": ", ".join(self.story["tags"]),
|
||||
"created": self.story["createDate"],
|
||||
"modified": self.story["modifyDate"],
|
||||
"is_completed": self.story["completed"],
|
||||
"is_mature": self.story["mature"],
|
||||
"description": self.story["description"],
|
||||
"avatar": b64encode(self.author).decode(),
|
||||
"copyright": {
|
||||
@@ -149,6 +155,11 @@ class PDFGenerator(AbstractGenerator):
|
||||
|
||||
self.content: str = Template(self.content).render(data)
|
||||
|
||||
def write_custom_metadata(self, document: Document, pdf: pydyf.PDF):
|
||||
"""Write non-standard metadata fields to the PDF."""
|
||||
pdf.info["completed"] = pydyf.String(str(self.story["completed"]))
|
||||
pdf.info["mature"] = pydyf.String(str(self.story["mature"]))
|
||||
|
||||
def generate_pdf(self):
|
||||
"""Generate and write the PDF to a temporary file (self.book)."""
|
||||
font_config = FontConfiguration()
|
||||
@@ -157,47 +168,17 @@ class PDFGenerator(AbstractGenerator):
|
||||
|
||||
html_obj = HTML(string=self.content)
|
||||
html_obj.write_pdf(
|
||||
self.book.name, stylesheets=[stylesheet_obj], font_config=font_config
|
||||
)
|
||||
|
||||
def add_metadata(self):
|
||||
"""Write metadata to generated PDF file at self.book, using ExifTool."""
|
||||
|
||||
clean_description = (
|
||||
self.story["description"].strip().replace("\n", "$/")
|
||||
) # exiftool doesn't parse \ns correctly; it supports $/ for the same purpose instead. `&#xa;` is another option.
|
||||
|
||||
metadata = {
|
||||
"Author": self.story["user"]["username"],
|
||||
"Title": self.story["title"],
|
||||
"Subject": clean_description,
|
||||
"CreationDate": self.story["createDate"],
|
||||
"ModDate": self.story["modifyDate"],
|
||||
"Keywords": ",".join(self.story["tags"]),
|
||||
"Language": self.story["language"]["name"],
|
||||
"Completed": self.story["completed"],
|
||||
"MatureContent": self.story["mature"],
|
||||
"Producer": "Dhanush Rambhatla (TheOnlyWayUp - https://rambhat.la) and WattpadDownloader",
|
||||
} # As per https://exiftool.org/TagNames/PDF.html
|
||||
|
||||
with ExifTool(config_file=DATA_PATH / "exiftool.config") as et:
|
||||
# Custom configuration adds Completed and MatureContent tags.
|
||||
# exiftool logger logs executed command
|
||||
et.execute(
|
||||
*(
|
||||
[f"-{key}={value}" for key, value in metadata.items()]
|
||||
+ [
|
||||
"-overwrite_original",
|
||||
self.book.file.name,
|
||||
]
|
||||
)
|
||||
self.book.name,
|
||||
stylesheets=[stylesheet_obj],
|
||||
font_config=font_config,
|
||||
finisher=self.write_custom_metadata,
|
||||
options={"custom_metadata": True},
|
||||
)
|
||||
|
||||
def compile(self):
|
||||
parts = self.generate_chapters()
|
||||
self.populate_template(parts)
|
||||
self.generate_pdf()
|
||||
self.add_metadata()
|
||||
return True
|
||||
|
||||
def dump(self) -> BytesIO:
|
||||
|
||||
@@ -1,16 +1,30 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="{{ langcode }}">
|
||||
|
||||
|
||||
<head>
|
||||
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<!-- https://doc.courtbouillon.org/weasyprint/stable/api_reference.html#weasyprint.document.DocumentMetadata -->
|
||||
<title>{{ book_title }}</title>
|
||||
<meta name=description content="{{description}}">
|
||||
<meta name=author content="{{author}}">
|
||||
<meta name=keywords content="{{clean_tags}}">
|
||||
<meta name=language content="{{langcode}}">
|
||||
<meta name=dcterms.created content="{{created}}">
|
||||
<meta name=dcterms.modified content="{{modified}}">
|
||||
<meta name=generator content="Dhanush Rambhatla (TheOnlyWayUp - https://rambhat.la) and WattpadDownloader">
|
||||
|
||||
<section class="fullpage">
|
||||
|
||||
</head>
|
||||
|
||||
<section class="fullpage">
|
||||
<img src="{{ cover }}" alt="Cover">
|
||||
</section>
|
||||
</section>
|
||||
|
||||
<div id="copyright-container">
|
||||
<div id="copyright-container">
|
||||
<h1 id="copyright-notice">Copyright Notice</h1>
|
||||
|
||||
<h2 id="copyright-title">{{ book_title }}</h2>
|
||||
@@ -23,11 +37,8 @@
|
||||
<div id="copyright-separator"></div>
|
||||
|
||||
{% if copyright.data %}
|
||||
<img src="data:image/jpg;base64,{{copyright.data}}"
|
||||
alt="{{copyright.name}}"
|
||||
width="88"
|
||||
height="31"
|
||||
id="copyright-license-image">
|
||||
<img src="data:image/jpg;base64,{{copyright.data}}" alt="{{copyright.name}}" width="88" height="31"
|
||||
id="copyright-license-image">
|
||||
{% endif %}
|
||||
|
||||
<p id="copyright-copyright">{{ statement }}</p>
|
||||
@@ -40,9 +51,9 @@ id="copyright-license-image">
|
||||
ID: {{ book_id }}.
|
||||
<a href="https://wattpad.com/story/{{ book_id }}" target="_blank" id="copyright-link">View this Book Online</a>
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="book">
|
||||
<div id="book">
|
||||
<section id="contents" class="toc">
|
||||
<h1>Table of Contents</h1>
|
||||
<ul>
|
||||
@@ -55,10 +66,10 @@ id="copyright-license-image">
|
||||
|
||||
{{parts[part_id] | safe}}
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h1>About the Author</h1>
|
||||
<div id="author-container">
|
||||
<h1>About the Author</h1>
|
||||
<div id="author-container">
|
||||
<div id="author-about">
|
||||
<img src="data:image/jpg;base64,{{avatar}}" alt="{{author}}'s profile picture" id="author-profile-picture">
|
||||
<h2 id="author-name">
|
||||
@@ -66,8 +77,9 @@ id="copyright-license-image">
|
||||
</h2>
|
||||
<hr id="author-divider">
|
||||
<p id="author-bio">
|
||||
{{ description }}
|
||||
{{ author_bio }}
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</html>
|
||||
@@ -1,26 +0,0 @@
|
||||
|
||||
%Image::ExifTool::UserDefined = (
|
||||
'Image::ExifTool::XMP::xmp' => {
|
||||
Completed => {
|
||||
Writable => 'boolean', # Can be a boolean (True/False)
|
||||
Groups => { 2 => 'Content' },
|
||||
},
|
||||
MatureContent => {
|
||||
Writable => 'boolean', # Can be a boolean (True/False)
|
||||
Groups => { 2 => 'Content' },
|
||||
},
|
||||
},
|
||||
|
||||
'Image::ExifTool::IPTC::ApplicationRecord' => {
|
||||
161 => {
|
||||
Name => 'Completed',
|
||||
Format => 'string[0,16]', # Store as a string (e.g., "Yes"/"No")
|
||||
},
|
||||
162 => {
|
||||
Name => 'MatureContent',
|
||||
Format => 'string[0,16]', # Store as a string (e.g., "Yes"/"No")
|
||||
},
|
||||
},
|
||||
);
|
||||
|
||||
1; # End
|
||||
@@ -0,0 +1,5 @@
|
||||
The fonts need to be symlinked to `/tmp/fonts`; this allows them to be loaded both during development and at build time.
|
||||
It's assumed fonts will be present at `/tmp/fonts`. During development they're at `/src/api/src/create_book/generators/pdf`, and during deployment they're at `/app/src/api/src/create_book/generators/pdf`. This seems like a clean solution.
|
||||
|
||||
`Fontconfig error: Cannot load default config file: No such file: (null)`
|
||||
If the fonts aren't found, this warning appears in the console. It won't cause downloads to fail, though.
|
||||
Generated
+1068
-549
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user