api: Remove old parser
This commit is contained in:
@@ -10,4 +10,4 @@ from .exceptions import PartNotFoundError, StoryNotFoundError, WattpadError
|
||||
from .generators import EPUBGenerator, PDFGenerator
|
||||
from .logs import logger
|
||||
from .parser import fetch_image
|
||||
from .utils import generate_clean_part_html, slugify
|
||||
from .utils import slugify
|
||||
|
||||
@@ -8,5 +8,4 @@ class StoryNotFoundError(WattpadError):
|
||||
...
|
||||
|
||||
|
||||
class PartNotFoundError(StoryNotFoundError):
|
||||
...
|
||||
# Marker exception: subclasses StoryNotFoundError and adds no behaviour of its own.
# NOTE(review): presumably raised when a requested part cannot be found — confirm at raise sites.
class PartNotFoundError(StoryNotFoundError): ...
|
||||
|
||||
@@ -75,6 +75,7 @@ async def fetch_image(url: str) -> bytes | None:
|
||||
|
||||
|
||||
async def download_tree_images(tree: BeautifulSoup) -> Generator[bytes]:
|
||||
"""Return a Generator of bytes containing image data for all images referenced in the tree."""
|
||||
image_urls = [img["src"] for img in tree.find_all("img")]
|
||||
downloaded_images: Generator[bytes] = chain(
|
||||
await asyncio.gather(*[fetch_image(url) for url in chunk])
|
||||
|
||||
@@ -8,85 +8,6 @@ from bs4 import BeautifulSoup
|
||||
from .models import Part
|
||||
|
||||
|
||||
def smart_trim(text: str, max_length: int = 400) -> str:
    """Truncate ``text`` at line boundaries so the result stays under ``max_length``.

    The text is split on newlines, empty lines are dropped, and whole lines
    are kept (in order) for as long as the accumulated output stays below
    ``max_length``. Kept lines are joined with ``<br />``; the separators
    count toward the length budget.

    Args:
        text: Newline-separated text to shorten.
        max_length: Exclusive upper bound on the length accumulated so far
            plus the next line.

    Returns:
        The kept lines joined by ``<br />`` with no trailing separator, or an
        empty string when ``text`` has no non-empty line or the very first
        line already reaches ``max_length``.
    """
    separator = "<br />"
    kept: list[str] = []
    # Length the output would have if every kept line were followed by a
    # separator; this mirrors how the budget was measured before the fix.
    used = 0

    for chunk in text.split("\n"):
        if not chunk:
            continue
        if used + len(chunk) >= max_length:
            break
        kept.append(chunk)
        used += len(chunk) + len(separator)

    # Joining (instead of appending then stripping) avoids str.rstrip, which
    # removes a *set of characters* and would eat real trailing text.
    return separator.join(kept)
|
||||
|
||||
|
||||
def generate_clean_part_html(part: Part, content: str) -> bs4.Tag:
    """Rebuild the HTML structure for a Part.

    A fresh ``<section>`` containing an ``<h1>`` with the part title is
    created, then every ``<p>`` found in ``content`` is copied into it as a
    new paragraph. Images are lifted out of their paragraph and appended
    directly to the section (each followed by a ``<br>``), ``<b>``/``<i>``
    children are rebuilt from their text, ``<br>`` children are dropped and
    any other tag is moved over unchanged.

    Args:
        part: Mapping providing the part's ``"title"`` and ``"id"``.
        content: HTML of the part; parsed with the ``lxml`` parser.

    Returns:
        The rebuilt ``<section>`` tag.

    Raises:
        RuntimeError: If the generated skeleton contains no ``<section>``.
        KeyError: If an ``<img>`` inside a paragraph has no ``src`` attribute.
    """
    # Imported locally because the name ``html`` is used for the parsed tree below.
    from html import escape

    # The title and id are external data interpolated into markup that is
    # parsed right after; escape them so "<" or "&" cannot alter the structure.
    chapter_title = escape(str(part["title"]))
    chapter_id = escape(str(part["id"]))

    clean = BeautifulSoup(
        f"""
        <section id="section_{chapter_id}" class="chapitre">
            <h1 id="{chapter_id}" class="chapter-title">{chapter_title}</h1>
        </section>
        """,
        "html.parser",
    )  # html.parser doesn't create <html>/<body> tags automatically

    html = BeautifulSoup(content, "lxml")
    for br in html.find_all("br"):
        # Check if no content after br
        if not br.next_sibling or br.next_sibling.name in ["br", None]:
            br.decompose()

    section = cast(bs4.Tag, clean.find("section"))
    if not section:
        raise RuntimeError("Generated part skeleton does not contain a <section> element")

    for child in html.find_all("p"):
        current_paragraph = clean.new_tag("p")

        # Attempt to carry over paragraph styling
        current_paragraph["style"] = child.get("style", "text-align: left;")

        # Iterate over a snapshot: children are decomposed or moved below.
        for p_child in list(child.children):
            if not p_child:
                continue
            if isinstance(p_child, bs4.element.Tag):
                if p_child.name == "br":
                    p_child.decompose()
                elif p_child.name == "img":
                    src = p_child["src"]
                    img_tag = clean.new_tag("img")
                    img_tag["src"] = src
                    # Images go straight into the section, not the paragraph.
                    section.append(img_tag)
                    section.append(clean.new_tag("br"))
                elif p_child.name in ["b", "i"]:
                    styled_tag = clean.new_tag(p_child.name)
                    styled_content = clean.new_string(p_child.text)
                    styled_tag.append(styled_content)
                    current_paragraph.append(styled_tag)
                else:
                    # Append any other tags as-is
                    current_paragraph.append(p_child)
            elif isinstance(p_child, bs4.element.NavigableString):
                # Named ``text_node`` so the ``content`` parameter is not shadowed.
                text_node = clean.new_string(p_child)
                current_paragraph.append(text_node)

        if current_paragraph.contents:
            section.append(current_paragraph)

        if not list(child.children):
            # Some p tags only contain brs, once brs are removed, they are empty and can be removed as well.
            child.decompose()

    return section
|
||||
|
||||
|
||||
def slugify(value, allow_unicode=False) -> str:
|
||||
"""
|
||||
Taken from https://github.com/django/django/blob/master/django/utils/text.py
|
||||
|
||||
Reference in New Issue
Block a user