api: Remove old parser

2025-06-09 14:44:30 +00:00
parent 3853e0d586
commit da08de17a5
4 changed files with 3 additions and 82 deletions
@@ -10,4 +10,4 @@ from .exceptions import PartNotFoundError, StoryNotFoundError, WattpadError
 from .generators import EPUBGenerator, PDFGenerator
 from .logs import logger
 from .parser import fetch_image
-from .utils import generate_clean_part_html, slugify
+from .utils import slugify
@@ -8,5 +8,4 @@ class StoryNotFoundError(WattpadError):
    ...
-class PartNotFoundError(StoryNotFoundError):
+class PartNotFoundError(StoryNotFoundError): ...
    ...
@@ -75,6 +75,7 @@ async def fetch_image(url: str) -> bytes | None:
 async def download_tree_images(tree: BeautifulSoup) -> Generator[bytes]:
    """Return a Generator of bytes containing image data for all images referenced in the tree."""
    image_urls = [img["src"] for img in tree.find_all("img")]
    downloaded_images: Generator[bytes] = chain(
        await asyncio.gather(*[fetch_image(url) for url in chunk])
@@ -8,85 +8,6 @@ from bs4 import BeautifulSoup
 from .models import Part
 def smart_trim(text: str, max_length: int = 400) -> str:
    """Truncate a string intelligently at newlines. Coherence and max-length adherence."""
    chunks = [t for t in text.split("\n") if t]
    to_return = ""
    for chunk in chunks:
        if len(to_return) + len(chunk) < max_length:
            to_return = chunk + "<br />"
        else:
            to_return = to_return.rstrip("<br />")
            break
    return to_return
 def generate_clean_part_html(part: Part, content: str) -> bs4.Tag:
    """Rebuild HTML Structure for a Part."""
    chapter_title = part["title"]
    chapter_id = part["id"]
    clean = BeautifulSoup(
        f"""
    <section id="section_{chapter_id}" class="chapitre">
        <h1 id="{chapter_id}" class="chapter-title">{chapter_title}</h1>
    </section>
    """,
        "html.parser",
    )  # html.parser doesn't create <html>/<body> tags automatically
    html = BeautifulSoup(content, "lxml")
    for br in html.find_all("br"):
        # Check if no content after br
        if not br.next_sibling or br.next_sibling.name in ["br", None]:
            br.decompose()
    section = cast(bs4.Tag, clean.find("section"))
    if not section:
        raise Exception()
    for child in html.find_all("p"):
        current_paragraph = clean.new_tag("p")
        # Attempt to carry over paragraph styling
        current_paragraph["style"] = child.get("style", "text-align: left;")
        for p_child in list(child.children):
            if not p_child:
                continue
            if isinstance(p_child, bs4.element.Tag):
                if p_child.name == "br":
                    p_child.decompose()
                elif p_child.name == "img":
                    src = p_child["src"]
                    img_tag = clean.new_tag("img")
                    img_tag["src"] = src
                    section.append(img_tag)
                    section.append(clean.new_tag("br"))
                elif p_child.name in ["b", "i"]:
                    styled_tag = clean.new_tag(p_child.name)
                    styled_content = clean.new_string(p_child.text)
                    styled_tag.append(styled_content)
                    current_paragraph.append(styled_tag)
                else:
                    # Append any other tags as-is
                    current_paragraph.append(p_child)
            elif isinstance(p_child, bs4.element.NavigableString):
                content = clean.new_string(p_child)
                current_paragraph.append(content)
        if current_paragraph.contents:
            section.append(current_paragraph)
        if not list(child.children):
            # Some p tags only contain brs, once brs are removed, they are empty and can be removed as well.
            child.decompose()
    return section
 def slugify(value, allow_unicode=False) -> str:
    """
    Taken from https://github.com/django/django/blob/master/django/utils/text.py