From da08de17a55761e42e837dc987e0c8a94e375af1 Mon Sep 17 00:00:00 2001
From: TheOnlyWayUp <hi@towu.dev>
Date: Mon, 9 Jun 2025 14:44:30 +0000
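Subject: [PATCH] api: Remove old parser

Delete the smart_trim and generate_clean_part_html helpers from
create_book/utils.py and drop the generate_clean_part_html import from
create_book/__init__.py. Also add a docstring to download_tree_images
and collapse the PartNotFoundError body onto a single line.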
---
 src/api/src/create_book/__init__.py   |  2 +-
 src/api/src/create_book/exceptions.py |  3 +-
 src/api/src/create_book/parser.py     |  1 +
 src/api/src/create_book/utils.py      | 79 ---------------------------
 4 files changed, 3 insertions(+), 82 deletions(-)
diff --git a/src/api/src/create_book/__init__.py b/src/api/src/create_book/__init__.py
index e6241ea..880237d 100644
--- a/src/api/src/create_book/__init__.py
+++ b/src/api/src/create_book/__init__.py
@@ -10,4 +10,4 @@ from .exceptions import PartNotFoundError, StoryNotFoundError, WattpadError
 from .generators import EPUBGenerator, PDFGenerator
 from .logs import logger
 from .parser import fetch_image
-from .utils import generate_clean_part_html, slugify
+from .utils import slugify
diff --git a/src/api/src/create_book/exceptions.py b/src/api/src/create_book/exceptions.py
index 53d9277..50225d7 100644
--- a/src/api/src/create_book/exceptions.py
+++ b/src/api/src/create_book/exceptions.py
@@ -8,5 +8,4 @@ class StoryNotFoundError(WattpadError):
     ...
 
 
-class PartNotFoundError(StoryNotFoundError):
-    ...
+class PartNotFoundError(StoryNotFoundError): ...
diff --git a/src/api/src/create_book/parser.py b/src/api/src/create_book/parser.py
index af9f016..d009550 100644
--- a/src/api/src/create_book/parser.py
+++ b/src/api/src/create_book/parser.py
@@ -75,6 +75,7 @@ async def fetch_image(url: str) -> bytes | None:
 
 
 async def download_tree_images(tree: BeautifulSoup) -> Generator[bytes]:
+    """Return a Generator of bytes containing image data for all images referenced in the tree."""
     image_urls = [img["src"] for img in tree.find_all("img")]
     downloaded_images: Generator[bytes] = chain(
         await asyncio.gather(*[fetch_image(url) for url in chunk])
diff --git a/src/api/src/create_book/utils.py b/src/api/src/create_book/utils.py
index ebebb04..83ccdfc 100644
--- a/src/api/src/create_book/utils.py
+++ b/src/api/src/create_book/utils.py
@@ -8,85 +8,6 @@ from bs4 import BeautifulSoup
 from .models import Part
 
 
-def smart_trim(text: str, max_length: int = 400) -> str:
-    """Truncate a string intelligently at newlines. Coherence and max-length adherence."""
-    chunks = [t for t in text.split("\n") if t]
-
-    to_return = ""
-    for chunk in chunks:
-        if len(to_return) + len(chunk) < max_length:
-            to_return = chunk + "<br />"
-        else:
-            to_return = to_return.rstrip("<br />")
-            break
-
-    return to_return
-
-
-def generate_clean_part_html(part: Part, content: str) -> bs4.Tag:
-    """Rebuild HTML Structure for a Part."""
-    chapter_title = part["title"]
-    chapter_id = part["id"]
-
-    clean = BeautifulSoup(
-        f"""
-    <section id="section_{chapter_id}" class="chapitre">
-        <h1 id="{chapter_id}" class="chapter-title">{chapter_title}</h1>
-    </section>
-    """,
-        "html.parser",
-    )  # html.parser doesn't create <html>/<body> tags automatically
-
-    html = BeautifulSoup(content, "lxml")
-    for br in html.find_all("br"):
-        # Check if no content after br
-        if not br.next_sibling or br.next_sibling.name in ["br", None]:
-            br.decompose()
-
-    section = cast(bs4.Tag, clean.find("section"))
-    if not section:
-        raise Exception()
-
-    for child in html.find_all("p"):
-        current_paragraph = clean.new_tag("p")
-
-        # Attempt to carry over paragraph styling
-        current_paragraph["style"] = child.get("style", "text-align: left;")
-
-        for p_child in list(child.children):
-            if not p_child:
-                continue
-            if isinstance(p_child, bs4.element.Tag):
-                if p_child.name == "br":
-                    p_child.decompose()
-                elif p_child.name == "img":
-                    src = p_child["src"]
-                    img_tag = clean.new_tag("img")
-                    img_tag["src"] = src
-                    section.append(img_tag)
-                    section.append(clean.new_tag("br"))
-                elif p_child.name in ["b", "i"]:
-                    styled_tag = clean.new_tag(p_child.name)
-                    styled_content = clean.new_string(p_child.text)
-                    styled_tag.append(styled_content)
-                    current_paragraph.append(styled_tag)
-                else:
-                    # Append any other tags as-is
-                    current_paragraph.append(p_child)
-            elif isinstance(p_child, bs4.element.NavigableString):
-                content = clean.new_string(p_child)
-                current_paragraph.append(content)
-
-        if current_paragraph.contents:
-            section.append(current_paragraph)
-
-        if not list(child.children):
-            # Some p tags only contain brs, once brs are removed, they are empty and can be removed as well.
-            child.decompose()
-
-    return section
-
-
 def slugify(value, allow_unicode=False) -> str:
     """
     Taken from https://github.com/django/django/blob/master/django/utils/text.py