api: Remove old parser

This commit is contained in:
TheOnlyWayUp
2025-06-09 14:44:30 +00:00
parent 3853e0d586
commit da08de17a5
4 changed files with 3 additions and 82 deletions
+1 -1
View File
@@ -10,4 +10,4 @@ from .exceptions import PartNotFoundError, StoryNotFoundError, WattpadError
from .generators import EPUBGenerator, PDFGenerator
from .logs import logger
from .parser import fetch_image
from .utils import generate_clean_part_html, slugify
from .utils import slugify
+1 -2
View File
@@ -8,5 +8,4 @@ class StoryNotFoundError(WattpadError):
...
class PartNotFoundError(StoryNotFoundError):
...
class PartNotFoundError(StoryNotFoundError): ...
+1
View File
@@ -75,6 +75,7 @@ async def fetch_image(url: str) -> bytes | None:
async def download_tree_images(tree: BeautifulSoup) -> Generator[bytes]:
"""Return a Generator of bytes containing image data for all images referenced in the tree."""
image_urls = [img["src"] for img in tree.find_all("img")]
downloaded_images: Generator[bytes] = chain(
await asyncio.gather(*[fetch_image(url) for url in chunk])
-79
View File
@@ -8,85 +8,6 @@ from bs4 import BeautifulSoup
from .models import Part
def smart_trim(text: str, max_length: int = 400) -> str:
    """Truncate text at line boundaries, joining the kept lines with ``<br />``.

    Empty lines are dropped. Lines are kept in order until adding the next
    one would reach ``max_length``; truncation never happens mid-line.

    Args:
        text: Newline-separated input text.
        max_length: Exclusive upper bound on the length of the result. While
            accumulating, every kept line is counted together with one
            ``<br />`` separator, so the returned string is always shorter
            than ``max_length``.

    Returns:
        The kept lines joined by ``<br />`` with no trailing separator. The
        result is an empty string when ``text`` has no non-empty lines or
        when the first non-empty line alone does not fit.
    """
    separator = "<br />"
    kept: list[str] = []
    # Length of everything kept so far, each line counted with its separator.
    used = 0

    for chunk in text.split("\n"):
        if not chunk:
            continue
        if used + len(chunk) >= max_length:
            # Stop at the first line that does not fit so the output stays a
            # coherent prefix of the input.
            break
        kept.append(chunk)
        used += len(chunk) + len(separator)

    # Joining (instead of appending a separator and stripping it afterwards)
    # avoids str.rstrip's character-set semantics, which would also remove
    # trailing 'b', 'r', '/', '<', '>' and spaces belonging to the text.
    return separator.join(kept)
def generate_clean_part_html(part: Part, content: str) -> bs4.Tag:
"""Rebuild HTML Structure for a Part.

Builds a fresh ``<section>`` containing an ``<h1>`` with the part's title,
then copies the paragraphs found in ``content`` into it in simplified form.

Args:
part: Mapping-like object; only ``part["title"]`` and ``part["id"]`` are read.
content: Raw HTML of the part. It is parsed with the ``lxml`` parser.

Returns:
The ``<section>`` tag of the newly built tree.

Raises:
Exception: If the ``<section>`` tag cannot be found in the skeleton.

NOTE(review): indentation is not visible in this view of the code, so the
nesting described in the comments below is inferred from the statement
order - confirm against the real file.
"""
# Only these two fields of the part are used.
chapter_title = part["title"]
chapter_id = part["id"]
# Skeleton document: a section holding just the chapter heading.
# NOTE(review): title and id are interpolated into the markup unescaped, so
# markup characters in a title would be parsed as HTML - confirm whether the
# inputs are trusted.
clean = BeautifulSoup(
f"""
<section id="section_{chapter_id}" class="chapitre">
<h1 id="{chapter_id}" class="chapter-title">{chapter_title}</h1>
</section>
""",
"html.parser",
) # html.parser doesn't create <html>/<body> tags automatically
# The source content is parsed into a separate tree with a different parser.
html = BeautifulSoup(content, "lxml")
# First pass: drop <br> tags that have no next sibling, or whose next
# sibling's name is "br" or None.
# NOTE(review): presumably a name of None is meant to match text nodes -
# verify how bs4 reports .name for strings in the version in use.
for br in html.find_all("br"):
# Check if no content after br
if not br.next_sibling or br.next_sibling.name in ["br", None]:
br.decompose()
# cast() only informs the type checker; the truthiness check below is the
# actual runtime guard.
section = cast(bs4.Tag, clean.find("section"))
if not section:
raise Exception()
# Second pass: rebuild every <p> of the source inside the clean tree.
for child in html.find_all("p"):
current_paragraph = clean.new_tag("p")
# Attempt to carry over paragraph styling
current_paragraph["style"] = child.get("style", "text-align: left;")
# Iterate over a list copy because children are removed or moved below.
for p_child in list(child.children):
# Skip falsy children.
if not p_child:
continue
if isinstance(p_child, bs4.element.Tag):
if p_child.name == "br":
# Line breaks inside paragraphs are discarded.
p_child.decompose()
elif p_child.name == "img":
# Images are re-created with only their src and appended to the
# section itself (not to the paragraph), followed by a <br>.
src = p_child["src"]
img_tag = clean.new_tag("img")
img_tag["src"] = src
section.append(img_tag)
section.append(clean.new_tag("br"))
elif p_child.name in ["b", "i"]:
# Bold/italic tags are re-created from their text only, so any
# nested markup or attributes are not carried over.
styled_tag = clean.new_tag(p_child.name)
styled_content = clean.new_string(p_child.text)
styled_tag.append(styled_content)
current_paragraph.append(styled_tag)
else:
# Append any other tags as-is
current_paragraph.append(p_child)
elif isinstance(p_child, bs4.element.NavigableString):
# Text nodes are copied into the clean tree.
# NOTE(review): this rebinds the `content` parameter; the raw HTML was
# already parsed into `html` above, so the later reads are unaffected.
content = clean.new_string(p_child)
current_paragraph.append(content)
# Only paragraphs that ended up with content are added to the section.
if current_paragraph.contents:
section.append(current_paragraph)
if not list(child.children):
# Some p tags only contain brs, once brs are removed, they are empty and can be removed as well.
child.decompose()
return section
def slugify(value, allow_unicode=False) -> str:
"""
Taken from https://github.com/django/django/blob/master/django/utils/text.py