api: Remove old parser
This commit is contained in:
@@ -10,4 +10,4 @@ from .exceptions import PartNotFoundError, StoryNotFoundError, WattpadError
|
|||||||
from .generators import EPUBGenerator, PDFGenerator
|
from .generators import EPUBGenerator, PDFGenerator
|
||||||
from .logs import logger
|
from .logs import logger
|
||||||
from .parser import fetch_image
|
from .parser import fetch_image
|
||||||
from .utils import generate_clean_part_html, slugify
|
from .utils import slugify
|
||||||
|
|||||||
@@ -8,5 +8,4 @@ class StoryNotFoundError(WattpadError):
|
|||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
class PartNotFoundError(StoryNotFoundError):
|
class PartNotFoundError(StoryNotFoundError): ...
|
||||||
...
|
|
||||||
|
|||||||
@@ -75,6 +75,7 @@ async def fetch_image(url: str) -> bytes | None:
|
|||||||
|
|
||||||
|
|
||||||
async def download_tree_images(tree: BeautifulSoup) -> Generator[bytes]:
|
async def download_tree_images(tree: BeautifulSoup) -> Generator[bytes]:
|
||||||
|
"""Return a Generator of bytes containing image data for all images referenced in the tree."""
|
||||||
image_urls = [img["src"] for img in tree.find_all("img")]
|
image_urls = [img["src"] for img in tree.find_all("img")]
|
||||||
downloaded_images: Generator[bytes] = chain(
|
downloaded_images: Generator[bytes] = chain(
|
||||||
await asyncio.gather(*[fetch_image(url) for url in chunk])
|
await asyncio.gather(*[fetch_image(url) for url in chunk])
|
||||||
|
|||||||
@@ -8,85 +8,6 @@ from bs4 import BeautifulSoup
|
|||||||
from .models import Part
|
from .models import Part
|
||||||
|
|
||||||
|
|
||||||
def smart_trim(text: str, max_length: int = 400) -> str:
    """Truncate text at line boundaries and join the kept lines with ``<br />``.

    The text is split on newlines and blank lines are dropped. Lines are kept,
    in order, for as long as the accumulated output (including the ``<br />``
    separators already emitted) plus the next line stays strictly below
    ``max_length``. Truncation only ever happens between lines, never inside
    one.

    Args:
        text: Newline-separated text to trim.
        max_length: Exclusive upper bound on the length of the result.

    Returns:
        The kept lines joined by ``<br />`` with no trailing separator. An
        empty string is returned when ``text`` has no non-blank lines or when
        the very first line already reaches ``max_length``.
    """
    separator = "<br />"
    kept: list[str] = []
    used = 0  # length of the output so far, counting one separator per kept line

    for chunk in text.split("\n"):
        if not chunk:
            continue  # blank lines carry no content
        if used + len(chunk) >= max_length:
            break  # this line would reach the limit; stop at the previous boundary
        kept.append(chunk)
        used += len(chunk) + len(separator)

    # Joining (instead of appending a separator and stripping it afterwards)
    # avoids str.rstrip's character-set semantics, which would also remove
    # trailing 'b', 'r', '/', '<', '>' and spaces from the text itself.
    return separator.join(kept)
|
|
||||||
|
|
||||||
|
|
||||||
def generate_clean_part_html(part: Part, content: str) -> bs4.Tag:
    """Rebuild the HTML structure for a Part.

    A fresh ``<section>`` headed by an ``<h1>`` with the part's title is
    created, and every ``<p>`` found in ``content`` is re-created inside it.

    Args:
        part: Only ``part["title"]`` and ``part["id"]`` are read.
        content: HTML markup of the part, parsed with the ``lxml`` parser.

    Returns:
        The populated ``<section>`` tag.

    Raises:
        RuntimeError: If the generated skeleton contains no ``<section>``.
    """
    from html import escape  # function-scope import; a local named `html` is no longer used

    # Escape interpolated values so a title containing '<', '&' or quotes
    # cannot change the structure of the skeleton below.
    # NOTE(review): assumes part["title"] is plain text, not pre-escaped HTML - confirm.
    chapter_title = escape(str(part["title"]))
    chapter_id = escape(str(part["id"]))

    clean = BeautifulSoup(
        f"""
        <section id="section_{chapter_id}" class="chapitre">
            <h1 id="{chapter_id}" class="chapter-title">{chapter_title}</h1>
        </section>
        """,
        "html.parser",
    )  # html.parser doesn't create <html>/<body> tags automatically

    section = cast(bs4.Tag, clean.find("section"))
    if not section:
        raise RuntimeError("Generated part skeleton has no <section> element")

    source = BeautifulSoup(content, "lxml")
    for br in source.find_all("br"):
        # Drop a <br> that has no following sibling, or whose following
        # sibling's name is "br" or None.
        if not br.next_sibling or br.next_sibling.name in ["br", None]:
            br.decompose()

    for paragraph in source.find_all("p"):
        rebuilt = clean.new_tag("p")

        # Attempt to carry over paragraph styling
        rebuilt["style"] = paragraph.get("style", "text-align: left;")

        # Iterate over a snapshot: children may be removed or moved below.
        for node in list(paragraph.children):
            if not node:
                continue
            if isinstance(node, bs4.element.Tag):
                if node.name == "br":
                    node.decompose()
                elif node.name == "img":
                    src = node.get("src")
                    if src is None:
                        continue  # an <img> without a source has nothing to reference
                    # Images go directly into the section, followed by a <br>,
                    # rather than into the paragraph being rebuilt.
                    img_tag = clean.new_tag("img")
                    img_tag["src"] = src
                    section.append(img_tag)
                    section.append(clean.new_tag("br"))
                elif node.name in ["b", "i"]:
                    # Re-create the tag from its text only.
                    styled_tag = clean.new_tag(node.name)
                    styled_tag.append(clean.new_string(node.text))
                    rebuilt.append(styled_tag)
                else:
                    # Append any other tags as-is
                    rebuilt.append(node)
            elif isinstance(node, bs4.element.NavigableString):
                rebuilt.append(clean.new_string(node))

        if rebuilt.contents:
            section.append(rebuilt)

        if not list(paragraph.children):
            # Some p tags only contain brs, once brs are removed, they are
            # empty and can be removed as well.
            paragraph.decompose()

    return section
|
|
||||||
|
|
||||||
|
|
||||||
def slugify(value, allow_unicode=False) -> str:
|
def slugify(value, allow_unicode=False) -> str:
|
||||||
"""
|
"""
|
||||||
Taken from https://github.com/django/django/blob/master/django/utils/text.py
|
Taken from https://github.com/django/django/blob/master/django/utils/text.py
|
||||||
|
|||||||
Reference in New Issue
Block a user