From da08de17a55761e42e837dc987e0c8a94e375af1 Mon Sep 17 00:00:00 2001 From: TheOnlyWayUp Date: Mon, 9 Jun 2025 14:44:30 +0000 Subject: [PATCH] api: Remove old parser --- src/api/src/create_book/__init__.py | 2 +- src/api/src/create_book/exceptions.py | 3 +- src/api/src/create_book/parser.py | 1 + src/api/src/create_book/utils.py | 79 --------------------------- 4 files changed, 3 insertions(+), 82 deletions(-) diff --git a/src/api/src/create_book/__init__.py b/src/api/src/create_book/__init__.py index e6241ea..880237d 100644 --- a/src/api/src/create_book/__init__.py +++ b/src/api/src/create_book/__init__.py @@ -10,4 +10,4 @@ from .exceptions import PartNotFoundError, StoryNotFoundError, WattpadError from .generators import EPUBGenerator, PDFGenerator from .logs import logger from .parser import fetch_image -from .utils import generate_clean_part_html, slugify +from .utils import slugify diff --git a/src/api/src/create_book/exceptions.py b/src/api/src/create_book/exceptions.py index 53d9277..50225d7 100644 --- a/src/api/src/create_book/exceptions.py +++ b/src/api/src/create_book/exceptions.py @@ -8,5 +8,4 @@ class StoryNotFoundError(WattpadError): ... -class PartNotFoundError(StoryNotFoundError): - ... +class PartNotFoundError(StoryNotFoundError): ... diff --git a/src/api/src/create_book/parser.py b/src/api/src/create_book/parser.py index af9f016..d009550 100644 --- a/src/api/src/create_book/parser.py +++ b/src/api/src/create_book/parser.py @@ -75,6 +75,7 @@ async def fetch_image(url: str) -> bytes | None: async def download_tree_images(tree: BeautifulSoup) -> Generator[bytes]: + """Return a Generator of bytes containing image data for all images referenced in the tree.""" image_urls = [img["src"] for img in tree.find_all("img")] downloaded_images: Generator[bytes] = chain( await asyncio.gather(*[fetch_image(url) for url in chunk]) diff --git a/src/api/src/create_book/utils.py b/src/api/src/create_book/utils.py index ebebb04..83ccdfc 100644 --- a/src/api/src/create_book/utils.py +++ b/src/api/src/create_book/utils.py @@ -8,85 +8,6 @@ from bs4 import BeautifulSoup from .models import Part -def smart_trim(text: str, max_length: int = 400) -> str: - """Truncate a string intelligently at newlines. Coherence and max-length adherence.""" - chunks = [t for t in text.split("\n") if t] - - to_return = "" - for chunk in chunks: - if len(to_return) + len(chunk) < max_length: - to_return = chunk + "
" - else: - to_return = to_return.rstrip("
") - break - - return to_return - - -def generate_clean_part_html(part: Part, content: str) -> bs4.Tag: - """Rebuild HTML Structure for a Part.""" - chapter_title = part["title"] - chapter_id = part["id"] - - clean = BeautifulSoup( - f""" -
-

{chapter_title}

-
- """, - "html.parser", - ) # html.parser doesn't create / tags automatically - - html = BeautifulSoup(content, "lxml") - for br in html.find_all("br"): - # Check if no content after br - if not br.next_sibling or br.next_sibling.name in ["br", None]: - br.decompose() - - section = cast(bs4.Tag, clean.find("section")) - if not section: - raise Exception() - - for child in html.find_all("p"): - current_paragraph = clean.new_tag("p") - - # Attempt to carry over paragraph styling - current_paragraph["style"] = child.get("style", "text-align: left;") - - for p_child in list(child.children): - if not p_child: - continue - if isinstance(p_child, bs4.element.Tag): - if p_child.name == "br": - p_child.decompose() - elif p_child.name == "img": - src = p_child["src"] - img_tag = clean.new_tag("img") - img_tag["src"] = src - section.append(img_tag) - section.append(clean.new_tag("br")) - elif p_child.name in ["b", "i"]: - styled_tag = clean.new_tag(p_child.name) - styled_content = clean.new_string(p_child.text) - styled_tag.append(styled_content) - current_paragraph.append(styled_tag) - else: - # Append any other tags as-is - current_paragraph.append(p_child) - elif isinstance(p_child, bs4.element.NavigableString): - content = clean.new_string(p_child) - current_paragraph.append(content) - - if current_paragraph.contents: - section.append(current_paragraph) - - if not list(child.children): - # Some p tags only contain brs, once brs are removed, they are empty and can be removed as well. - child.decompose() - - return section - - def slugify(value, allow_unicode=False) -> str: """ Taken from https://github.com/django/django/blob/master/django/utils/text.py