api: Remove old parser

2025-06-09 14:44:30 +00:00
parent 3853e0d586
commit da08de17a5
4 changed files with 3 additions and 82 deletions
@@ -10,4 +10,4 @@ from .exceptions import PartNotFoundError, StoryNotFoundError, WattpadError
 from .generators import EPUBGenerator, PDFGenerator
 from .logs import logger
 from .parser import fetch_image
-from .utils import generate_clean_part_html, slugify
+from .utils import slugify
@@ -8,5 +8,4 @@ class StoryNotFoundError(WattpadError):
    ...


-class PartNotFoundError(StoryNotFoundError):
-    ...
+class PartNotFoundError(StoryNotFoundError): ...
@@ -75,6 +75,7 @@ async def fetch_image(url: str) -> bytes | None:


 async def download_tree_images(tree: BeautifulSoup) -> Generator[bytes]:
+    """Return a Generator of bytes containing image data for all images referenced in the tree."""
    image_urls = [img["src"] for img in tree.find_all("img")]
    downloaded_images: Generator[bytes] = chain(
        await asyncio.gather(*[fetch_image(url) for url in chunk])
@@ -8,85 +8,6 @@ from bs4 import BeautifulSoup
 from .models import Part


-def smart_trim(text: str, max_length: int = 400) -> str:
-    """Truncate a string intelligently at newlines. Coherence and max-length adherence."""
-    chunks = [t for t in text.split("\n") if t]
-
-    to_return = ""
-    for chunk in chunks:
-        if len(to_return) + len(chunk) < max_length:
-            to_return = chunk + "<br />"
-        else:
-            to_return = to_return.rstrip("<br />")
-            break
-
-    return to_return
-
-
-def generate_clean_part_html(part: Part, content: str) -> bs4.Tag:
-    """Rebuild HTML Structure for a Part."""
-    chapter_title = part["title"]
-    chapter_id = part["id"]
-
-    clean = BeautifulSoup(
-        f"""
-    <section id="section_{chapter_id}" class="chapitre">
-        <h1 id="{chapter_id}" class="chapter-title">{chapter_title}</h1>
-    </section>
-    """,
-        "html.parser",
-    )  # html.parser doesn't create <html>/<body> tags automatically
-
-    html = BeautifulSoup(content, "lxml")
-    for br in html.find_all("br"):
-        # Check if no content after br
-        if not br.next_sibling or br.next_sibling.name in ["br", None]:
-            br.decompose()
-
-    section = cast(bs4.Tag, clean.find("section"))
-    if not section:
-        raise Exception()
-
-    for child in html.find_all("p"):
-        current_paragraph = clean.new_tag("p")
-
-        # Attempt to carry over paragraph styling
-        current_paragraph["style"] = child.get("style", "text-align: left;")
-
-        for p_child in list(child.children):
-            if not p_child:
-                continue
-            if isinstance(p_child, bs4.element.Tag):
-                if p_child.name == "br":
-                    p_child.decompose()
-                elif p_child.name == "img":
-                    src = p_child["src"]
-                    img_tag = clean.new_tag("img")
-                    img_tag["src"] = src
-                    section.append(img_tag)
-                    section.append(clean.new_tag("br"))
-                elif p_child.name in ["b", "i"]:
-                    styled_tag = clean.new_tag(p_child.name)
-                    styled_content = clean.new_string(p_child.text)
-                    styled_tag.append(styled_content)
-                    current_paragraph.append(styled_tag)
-                else:
-                    # Append any other tags as-is
-                    current_paragraph.append(p_child)
-            elif isinstance(p_child, bs4.element.NavigableString):
-                content = clean.new_string(p_child)
-                current_paragraph.append(content)
-
-        if current_paragraph.contents:
-            section.append(current_paragraph)
-
-        if not list(child.children):
-            # Some p tags only contain brs, once brs are removed, they are empty and can be removed as well.
-            child.decompose()
-
-    return section
-
-
 def slugify(value, allow_unicode=False) -> str:
    """
    Taken from https://github.com/django/django/blob/master/django/utils/text.py