feat(api): Remove dependency on exiftool (#82)

This commit is contained in:
Dhanush R
2025-10-30 16:11:16 +05:30
committed by GitHub
9 changed files with 1184 additions and 767 deletions
+10
View File
@@ -0,0 +1,10 @@
__pycache__
*ipynb
build
.idea
.vscode
.venv
.env
*log
*.md
uv.lock
+9 -17
View File
@@ -13,23 +13,17 @@ FROM python:3.13-slim
WORKDIR /app WORKDIR /app
# Install apt-fast, git, exiftool
COPY --from=nobodyxu/apt-fast:latest-debian-buster-slim /usr/local/ /usr/local/ COPY --from=nobodyxu/apt-fast:latest-debian-buster-slim /usr/local/ /usr/local/
RUN apt update RUN apt update
RUN apt install -y aria2 RUN apt install -y aria2
RUN apt-fast install -y git build-essential libpango-1.0-0 libpangoft2-1.0-0 wget RUN apt-fast install -y git build-essential python3.13-dev libgobject-2.0 libpango-1.0 libpangoft2-1.0
# aiohttp-client-cache depends on multipart, which requires python3.13-dev to build successfully on 3.13
# weasyprint depends on libgobject, libpango, and libpangoft2
ENV EXIFTOOL_VERSION="13.06" RUN rm -rf /var/lib/apt/lists/*
RUN wget "https://exiftool.org/Image-ExifTool-${EXIFTOOL_VERSION}.tar.gz" # https://github.com/TheOnlyWayUp/WattpadDownloader/pull/82#discussion_r2470358950
RUN gzip -dc "Image-ExifTool-${EXIFTOOL_VERSION}.tar.gz" | tar -xf -
WORKDIR /app/Image-ExifTool-${EXIFTOOL_VERSION}
RUN perl Makefile.PL
RUN make test
RUN make install
RUN rm -rf /var/lib/apt/lists/* /app/Image-ExifTool-${EXIFTOOL_VERSION}
WORKDIR /app WORKDIR /app
@@ -37,17 +31,15 @@ WORKDIR /app
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
COPY src/api/requirements.txt requirements.txt COPY src/api/pyproject.toml /app
COPY src/api/src/create_book/generators/pdf/exiftool.config exiftool.config RUN uv sync
RUN uv pip install -r requirements.txt --system COPY src/api/ /app
COPY --from=0 /build/build /app/src/build COPY --from=0 /build/build /app/src/build
COPY src/api/src src
# Is this still needed?
RUN ln -s /app/src/pdf/fonts /tmp/fonts RUN ln -s /app/src/pdf/fonts /tmp/fonts
WORKDIR /app/src WORKDIR /app/src
EXPOSE 80 EXPOSE 80
CMD [ "python3", "main.py"] CMD [ "uv", "run", "main.py"]
+1 -1
View File
@@ -17,7 +17,6 @@ dependencies = [
"aiohttp-client-cache[all]", "aiohttp-client-cache[all]",
"bs4>=0.0.2", "bs4>=0.0.2",
"uvicorn>=0.32.1", "uvicorn>=0.32.1",
"pyexiftool>=0.5.6",
"weasyprint>=63.0", "weasyprint>=63.0",
"jinja2>=3.1.6", "jinja2>=3.1.6",
] ]
@@ -31,5 +30,6 @@ aiohttp-client-cache = { git = "https://github.com/TheOnlyWayUp/aiohttp-client-c
[dependency-groups] [dependency-groups]
dev = [ dev = [
"ipykernel>=6.29.5", "ipykernel>=6.29.5",
"ipynb>=0.5.1",
"ruff>=0.11.12", "ruff>=0.11.12",
] ]
-76
View File
@@ -1,76 +0,0 @@
aioboto3==13.2.0
aiobotocore==2.15.2
aiofiles==24.1.0
aiohappyeyeballs==2.4.4
aiohttp==3.11.9
aiohttp-client-cache @ git+https://github.com/TheOnlyWayUp/aiohttp-client-cache.git@1f94f1d751e7320c0ea981d532ff02924782dae6
aioitertools==0.12.0
aiosignal==1.3.1
aiosqlite==0.20.0
annotated-types==0.7.0
anyio==4.6.2.post1
async-timeout==4.0.3
attrs==23.1.0
backoff==2.2.1
beautifulsoup4==4.12.3
boltons==24.1.0
boto3==1.35.36
botocore==1.35.36
brotli==1.1.0
bs4==0.0.2
cffi==1.17.1
click==8.1.7
cssselect2==0.7.0
dnspython==2.7.0
ebooklib==0.18
eliot==1.16.0
exceptiongroup==1.2.2
fastapi==0.115.5
fonttools==4.55.2
frozenlist==1.4.1
h11==0.14.0
idna==3.6
itsdangerous==2.2.0
jinja2==3.1.6
jmespath==1.0.1
lxml==5.3.0
markdown-it-py==3.0.0
mdurl==0.1.2
motor==3.6.0
multidict==6.0.4
orjson==3.10.12
pillow==10.4.0
propcache==0.2.1
pycparser==2.22
pydantic==2.10.2
pydantic-core==2.27.1
pydantic-settings==2.6.1
pydyf==0.11.0
pyexiftool==0.5.6
pygments==2.18.0
pymongo==4.9.2
pyphen==0.15.0
pyrsistent==0.20.0
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
redis==5.2.0
rich==13.9.4
s3transfer==0.10.4
setuptools==75.6.0
six==1.16.0
sniffio==1.3.1
soupsieve==2.6
starlette==0.41.3
tinycss2==1.4.0
tinyhtml5==2.0.0
type-extensions==0.1.2
typing-extensions==4.12.2
url-normalize==1.4.3
urllib3==2.2.3
uvicorn==0.32.1
weasyprint==63.0
webencodings==0.5.1
wrapt==1.17.0
yarl==1.18.3
zope-interface==7.2
zopfli==0.2.3.post1
+19 -38
View File
@@ -3,10 +3,10 @@ from io import BytesIO
from pathlib import Path from pathlib import Path
from tempfile import NamedTemporaryFile, _TemporaryFileWrapper from tempfile import NamedTemporaryFile, _TemporaryFileWrapper
import pydyf
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from exiftool import ExifTool
from jinja2 import Template from jinja2 import Template
from weasyprint import CSS, HTML from weasyprint import CSS, HTML, Document
from weasyprint.text.fonts import FontConfiguration from weasyprint.text.fonts import FontConfiguration
from ..models import Story from ..models import Story
@@ -97,7 +97,7 @@ class PDFGenerator(AbstractGenerator):
self.images = images self.images = images
self.author = author_image self.author = author_image
self.book: _TemporaryFileWrapper = NamedTemporaryFile(suffix=".pdf") self.book: _TemporaryFileWrapper = NamedTemporaryFile(suffix=".pdf") # type: ignore
self.content = TEMPLATE self.content = TEMPLATE
def generate_chapters(self) -> dict[int, str]: def generate_chapters(self) -> dict[int, str]:
@@ -134,6 +134,12 @@ class PDFGenerator(AbstractGenerator):
"book_title": self.story["title"], "book_title": self.story["title"],
"cover": f"data:image/jpg;base64,{b64encode(self.cover).decode()}", "cover": f"data:image/jpg;base64,{b64encode(self.cover).decode()}",
"username": self.story["user"]["username"], "username": self.story["user"]["username"],
"author_bio": self.story["user"]["description"],
"clean_tags": ", ".join(self.story["tags"]),
"created": self.story["createDate"],
"modified": self.story["modifyDate"],
"is_completed": self.story["completed"],
"is_mature": self.story["mature"],
"description": self.story["description"], "description": self.story["description"],
"avatar": b64encode(self.author).decode(), "avatar": b64encode(self.author).decode(),
"copyright": { "copyright": {
@@ -149,6 +155,11 @@ class PDFGenerator(AbstractGenerator):
self.content: str = Template(self.content).render(data) self.content: str = Template(self.content).render(data)
def write_custom_metadata(self, document: Document, pdf: pydyf.PDF):
"""Write non-standard metadata fields to the PDF."""
pdf.info["completed"] = pydyf.String(str(self.story["completed"]))
pdf.info["mature"] = pydyf.String(str(self.story["mature"]))
def generate_pdf(self): def generate_pdf(self):
"""Generate and write the PDF to a temporary file (self.book).""" """Generate and write the PDF to a temporary file (self.book)."""
font_config = FontConfiguration() font_config = FontConfiguration()
@@ -157,47 +168,17 @@ class PDFGenerator(AbstractGenerator):
html_obj = HTML(string=self.content) html_obj = HTML(string=self.content)
html_obj.write_pdf( html_obj.write_pdf(
self.book.name, stylesheets=[stylesheet_obj], font_config=font_config self.book.name,
stylesheets=[stylesheet_obj],
font_config=font_config,
finisher=self.write_custom_metadata,
options={"custom_metadata": True},
) )
def add_metadata(self):
"""Write metadata to generated PDF file at self.book, using ExifTool."""
clean_description = (
self.story["description"].strip().replace("\n", "$/")
) # exiftool doesn't parse \ns correctly, they support $/ for the same instead. `&#xa;` is another option.
metadata = {
"Author": self.story["user"]["username"],
"Title": self.story["title"],
"Subject": clean_description,
"CreationDate": self.story["createDate"],
"ModDate": self.story["modifyDate"],
"Keywords": ",".join(self.story["tags"]),
"Language": self.story["language"]["name"],
"Completed": self.story["completed"],
"MatureContent": self.story["mature"],
"Producer": "Dhanush Rambhatla (TheOnlyWayUp - https://rambhat.la) and WattpadDownloader",
} # As per https://exiftool.org/TagNames/PDF.html
with ExifTool(config_file=DATA_PATH / "exiftool.config") as et:
# Custom configuration adds Completed and MatureContent tags.
# exiftool logger logs executed command
et.execute(
*(
[f"-{key}={value}" for key, value in metadata.items()]
+ [
"-overwrite_original",
self.book.file.name,
]
)
)
def compile(self): def compile(self):
parts = self.generate_chapters() parts = self.generate_chapters()
self.populate_template(parts) self.populate_template(parts)
self.generate_pdf() self.generate_pdf()
self.add_metadata()
return True return True
def dump(self) -> BytesIO: def dump(self) -> BytesIO:
@@ -1,73 +1,85 @@
<!DOCTYPE html> <!DOCTYPE html>
<html lang="{{ langcode }}"> <html lang="{{ langcode }}">
<head>
<meta charset="UTF-8"> <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0"> <meta name="viewport" content="width=device-width, initial-scale=1.0">
<!-- https://doc.courtbouillon.org/weasyprint/stable/api_reference.html#weasyprint.document.DocumentMetadata -->
<title>{{ book_title }}</title> <title>{{ book_title }}</title>
<meta name=description content="{{description}}">
<meta name=author content="{{author}}">
<meta name=keywords content="{{clean_tags}}">
<meta name=language content="{{langcode}}">
<meta name=dcterms.created content="{{created}}">
<meta name=dcterms.modified content="{{modified}}">
<meta name=generator content="Dhanush Rambhatla (TheOnlyWayUp - https://rambhat.la) and WattpadDownloader">
<section class="fullpage">
<img src="{{ cover }}" alt="Cover"> </head>
<section class="fullpage">
<img src="{{ cover }}" alt="Cover">
</section>
<div id="copyright-container">
<h1 id="copyright-notice">Copyright Notice</h1>
<h2 id="copyright-title">{{ book_title }}</h2>
<p id="copyright-author">By {{ author }}</p>
<div id="copyright-separator"></div>
<p id="copyright-ex-libris">Ex Libris Sapientiae</p>
<div id="copyright-separator"></div>
{% if copyright.data %}
<img src="data:image/jpg;base64,{{copyright.data}}" alt="{{copyright.name}}" width="88" height="31"
id="copyright-license-image">
{% endif %}
<p id="copyright-copyright">{{ statement }}</p>
<p id="copyright-rights">{{ freedoms }}</p>
<p id="copyright-printing">Printing: {{ printing }}</p>
<p id="book-link">
ID: {{ book_id }}.
<a href="https://wattpad.com/story/{{ book_id }}" target="_blank" id="copyright-link">View this Book Online</a>
</p>
</div>
<div id="book">
<section id="contents" class="toc">
<h1>Table of Contents</h1>
<ul>
{% for part_id in parts %}
<li><a href="#{{part_id}}"></a></li>
{% endfor %}
</ul>
</section> </section>
{% for part_id in parts %}
<div id="copyright-container"> {{parts[part_id] | safe}}
<h1 id="copyright-notice">Copyright Notice</h1> {% endfor %}
</div>
<h2 id="copyright-title">{{ book_title }}</h2> <h1>About the Author</h1>
<p id="copyright-author">By {{ author }}</p> <div id="author-container">
<div id="author-about">
<div id="copyright-separator"></div> <img src="data:image/jpg;base64,{{avatar}}" alt="{{author}}'s profile picture" id="author-profile-picture">
<h2 id="author-name">
<p id="copyright-ex-libris">Ex Libris Sapientiae</p> <a href="https://wattpad.com/user/{{ username }}" id="author-link">{{ username }}</a>
</h2>
<div id="copyright-separator"></div> <hr id="author-divider">
<p id="author-bio">
{% if copyright.data %} {{ author_bio }}
<img src="data:image/jpg;base64,{{copyright.data}}"
alt="{{copyright.name}}"
width="88"
height="31"
id="copyright-license-image">
{% endif %}
<p id="copyright-copyright">{{ statement }}</p>
<p id="copyright-rights">{{ freedoms }}</p>
<p id="copyright-printing">Printing: {{ printing }}</p>
<p id="book-link">
ID: {{ book_id }}.
<a href="https://wattpad.com/story/{{ book_id }}" target="_blank" id="copyright-link">View this Book Online</a>
</p> </p>
</div> </div>
</div>
<div id="book">
<section id="contents" class="toc">
<h1>Table of Contents</h1>
<ul>
{% for part_id in parts %}
<li><a href="#{{part_id}}"></a></li>
{% endfor %}
</ul>
</section>
{% for part_id in parts %}
{{parts[part_id] | safe}}
{% endfor %}
</div>
<h1>About the Author</h1>
<div id="author-container">
<div id="author-about">
<img src="data:image/jpg;base64,{{avatar}}" alt="{{author}}'s profile picture" id="author-profile-picture">
<h2 id="author-name">
<a href="https://wattpad.com/user/{{ username }}" id="author-link">{{ username }}</a>
</h2>
<hr id="author-divider">
<p id="author-bio">
{{ description }}
</p>
</div>
</div>
</html> </html>
@@ -1,26 +0,0 @@
%Image::ExifTool::UserDefined = (
'Image::ExifTool::XMP::xmp' => {
Completed => {
Writable => 'boolean', # Can be a boolean (True/False)
Groups => { 2 => 'Content' },
},
MatureContent => {
Writable => 'boolean', # Can be a boolean (True/False)
Groups => { 2 => 'Content' },
},
},
'Image::ExifTool::IPTC::ApplicationRecord' => {
161 => {
Name => 'Completed',
Format => 'string[0,16]', # Store as a string (e.g., "Yes"/"No")
},
162 => {
Name => 'MatureContent',
Format => 'string[0,16]', # Store as a string (e.g., "Yes"/"No")
},
},
);
1; # End
@@ -0,0 +1,5 @@
The fonts need to be symlinked to /tmp/fonts; this allows the fonts to be loaded both during development and at build time.
It's assumed fonts will be present at `/tmp/fonts`, during development they're at `/src/api/src/create_book/generators/pdf`, and during deployment they're at `/app/src/api/src/create_book/generators/pdf`. This seems like a clean solution.
`Fontconfig error: Cannot load default config file: No such file: (null)`
If the fonts aren't found, this warning appears in the console. It won't cause downloads to fail, though.
+1068 -549
View File
File diff suppressed because it is too large Load Diff