Merge pull request #23 from TheOnlyWayUp/fix/#22-redis-cache

Concurrent requests fail

Co-authored-by: AaronBenDaniel <144371000+AaronBenDaniel@users.noreply.github.com>
This commit is contained in:
Dhanush R
2024-12-01 03:48:07 +05:30
committed by GitHub
9 changed files with 1504 additions and 194 deletions
+3
View File
@@ -5,3 +5,6 @@ data
*ipynb
build
.vscode
.venv
.env
*log
+3
View File
@@ -0,0 +1,3 @@
USE_CACHE=true
CACHE_TYPE=file
REDIS_CONNECTION_URL=
+1
View File
@@ -0,0 +1 @@
3.10
+20
View File
@@ -0,0 +1,20 @@
[project]
name = "api"
version = "0.1.0"
description = "Wattpad Downloader API"
readme = "../../README.md"
requires-python = ">=3.10"
dependencies = [
"aiohttp>=3.9.1",
"aiohttp-client-cache[all]>=0.10.0",
"rich>=13.9.4",
"fastapi>=0.115.5",
"ebooklib>=0.18",
"python-dotenv>=1.0.1",
"pydantic-settings>=2.6.1",
"eliot>=1.16.0",
"type-extensions>=0.1.2",
]
[tool.ruff.lint]
ignore = ['E402']
+37 -15
View File
@@ -1,24 +1,32 @@
aioboto3==12.4.0
aiobotocore==2.12.3
aiofiles==23.2.1
aiohttp==3.9.1
aiohttp-client-cache==0.10.0
aioitertools==0.12.0
aiosignal==1.3.1
aiosqlite==0.19.0
annotated-types==0.6.0
anyio==4.2.0
annotated-types==0.7.0
anyio==4.6.2.post1
asttokens==2.4.1
async-timeout==4.0.3
attrs==23.1.0
backoff==2.2.1
beautifulsoup4==4.12.3
boltons==24.1.0
boto3==1.34.69
botocore==1.34.69
bs4==0.0.2
click==8.1.7
comm==0.2.0
debugpy==1.8.0
decorator==5.1.1
EbookLib==0.18
exceptiongroup==1.2.0
dnspython==2.7.0
ebooklib==0.18
eliot==1.16.0
exceptiongroup==1.2.2
executing==2.0.1
fastapi==0.108.0
fastapi==0.115.5
frozenlist==1.4.1
h11==0.14.0
idna==3.6
@@ -26,14 +34,17 @@ ipykernel==6.28.0
ipython==8.19.0
itsdangerous==2.1.2
jedi==0.19.1
jupyter_client==8.6.0
jupyter_core==5.5.1
lxml==4.9.4
jmespath==1.0.1
jupyter-client==8.6.0
jupyter-core==5.5.1
lxml==5.3.0
markdown-it-py==3.0.0
matplotlib-inline==0.1.6
mdurl==0.1.2
motor==3.6.0
multidict==6.0.4
nest-asyncio==1.5.8
orjson==3.10.12
packaging==23.2
parso==0.8.3
pexpect==4.9.0
@@ -42,21 +53,32 @@ prompt-toolkit==3.0.43
psutil==5.9.7
ptyprocess==0.7.0
pure-eval==0.2.2
pydantic==2.5.3
pydantic_core==2.14.6
Pygments==2.17.2
pydantic==2.10.2
pydantic-core==2.27.1
pydantic-settings==2.6.1
pygments==2.18.0
pymongo==4.9.2
pyrsistent==0.20.0
python-dateutil==2.8.2
python-dotenv==1.0.1
pyzmq==25.1.2
rich==13.7.0
redis==5.2.0
rich==13.9.4
s3transfer==0.10.4
setuptools==75.6.0
six==1.16.0
sniffio==1.3.0
sniffio==1.3.1
soupsieve==2.5
stack-data==0.6.3
starlette==0.32.0.post1
starlette==0.41.3
tornado==6.4
traitlets==5.14.0
typing_extensions==4.9.0
type-extensions==0.1.2
typing-extensions==4.12.2
url-normalize==1.4.3
urllib3==2.2.3
uvicorn==0.25.0
wcwidth==0.2.12
wrapt==1.17.0
yarl==1.9.4
zope-interface==7.2
+196 -81
View File
@@ -1,61 +1,102 @@
import asyncio
from typing import Optional
from ebooklib import epub
import unicodedata
from typing import List, Optional, Tuple
from typing_extensions import TypedDict
import re
import unicodedata
import logging
from os import environ
from enum import Enum
import backoff
from aiohttp import ClientResponseError, ClientSession
from aiohttp_client_cache.session import CachedSession
from aiohttp_client_cache import FileBackend
from eliot import to_file, start_action
from eliot.stdlib import EliotHandler
from dotenv import load_dotenv
from ebooklib import epub
from ebooklib.epub import EpubBook
from bs4 import BeautifulSoup
from pydantic import TypeAdapter, model_validator, field_validator
from pydantic_settings import BaseSettings
from aiohttp import ClientResponseError
from aiohttp_client_cache.session import CachedSession
from aiohttp_client_cache import FileBackend, RedisBackend
# Load settings from a .env file before anything reads the environment.
# NOTE(review): override=True presumably lets .env values win over variables
# already set in the process environment — confirm against python-dotenv docs.
load_dotenv(override=True)
# One EliotHandler (from eliot.stdlib) is shared by the "fastapi" logger and
# the application logger below.
handler = EliotHandler()
logging.getLogger("fastapi").setLevel(logging.INFO)
logging.getLogger("fastapi").addHandler(handler)
# eliot.log is only registered as an Eliot destination when the DEBUG
# environment variable is set to a non-empty value.
if environ.get("DEBUG"):
to_file(open("eliot.log", "wb"))
# NOTE(review): logging.Logger(...) is instantiated directly instead of going
# through logging.getLogger("wpd"); verify this is intentional before changing
# it, since the two differ in how the logger is registered and configured.
logger = logging.Logger("wpd")
logger.addHandler(handler)
# --- #
class CacheTypes(Enum):
    """Cache backends selectable through the CACHE_TYPE setting."""

    # Member values are the literal strings accepted for CACHE_TYPE.
    file = "file"
    redis = "redis"
class Config(BaseSettings):
USE_CACHE: bool = True
CACHE_TYPE: CacheTypes = CacheTypes.file
REDIS_CONNECTION_URL: str = ""
@field_validator("USE_CACHE", mode="before")
def validate_use_cache(cls, value):
# Return default if value is an empty string
if value == "":
return True # Default value for USE_CACHE
return value
@field_validator("CACHE_TYPE", mode="before")
def validate_cache_type(cls, value):
# Thanks https://stackoverflow.com/a/78157474
if value == "":
return "file"
return value
@model_validator(mode="after")
def prevent_mismatched_redis_url(self):
match self.CACHE_TYPE:
case CacheTypes.file:
if self.REDIS_CONNECTION_URL:
raise ValueError(
"REDIS_CONNECTION_URL provided when File cache selected. To use Redis as a cache, set CACHE_TYPE=redis."
)
case CacheTypes.redis:
if not self.REDIS_CONNECTION_URL:
raise ValueError(
"REDIS_CONNECTION_URL not provided when Redis cache selected. To use File cache, set CACHE_TYPE=file."
)
return self
config = Config()
# --- #
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
if config.USE_CACHE:
match config.CACHE_TYPE:
case CacheTypes.file:
cache = FileBackend(use_temp=True, expire_after=43200) # 12 hours
case CacheTypes.redis:
cache = RedisBackend(
cache_name="wpd-aiohttp-cache", address=config.REDIS_CONNECTION_URL
)
else:
cache = None
logger.info(f"Using {cache=}")
# --- Utilities --- #
async def wp_get_cookies(username: str, password: str) -> dict:
    # source: https://github.com/TheOnlyWayUp/WP-DM-Export/blob/dd4c7c51cb43f2108e0f63fc10a66cd24a740e4e/src/API/src/main.py#L25-L58
    """Log in to Wattpad and return the authorization cookies it sets.

    Args:
        username (str): Account username; lowercased before it is sent.
        password (str): Account password.

    Raises:
        ValueError: The login response status was not 204.
        ValueError: The login response carried no cookies.

    Returns:
        dict: Cookie names mapped to their values.
    """
    async with ClientSession(headers=headers) as session:
        form = {
            "username": username.lower(),  # the username.lower() is for caching
            "password": password,
        }
        login = session.post(
            "https://www.wattpad.com/auth/login?nextUrl=%2F&_data=routes%2Fauth.login",
            data=form,
        )
        async with login as response:
            if response.status != 204:
                raise ValueError("Not a 204.")

            # Thanks https://stackoverflow.com/a/32281245
            cookies = dict(
                (name, morsel.value) for name, morsel in response.cookies.items()
            )
            if not cookies:
                raise ValueError("No cookies.")

            return cookies
def slugify(value, allow_unicode=False) -> str:
"""
Taken from https://github.com/django/django/blob/master/django/utils/text.py
@@ -79,35 +120,108 @@ def slugify(value, allow_unicode=False) -> str:
return re.sub(r"[-\s]+", "-", value).strip("-_")
async def wp_get_cookies(username: str, password: str) -> dict:
    # source: https://github.com/TheOnlyWayUp/WP-DM-Export/blob/dd4c7c51cb43f2108e0f63fc10a66cd24a740e4e/src/API/src/main.py#L25-L58
    """Log in to Wattpad and return the authorization cookies it sets.

    The request runs inside an "api_fetch_cookies" Eliot action and the
    session is created with cache=None.

    Args:
        username (str): Account username; lowercased before it is sent.
        password (str): Account password.

    Raises:
        ValueError: The login response status was not 204.
        ValueError: The login response carried no cookies.

    Returns:
        dict: Cookie names mapped to their values.
    """
    with start_action(action_type="api_fetch_cookies"):
        async with CachedSession(headers=headers, cache=None) as session:
            credentials = {
                "username": username.lower(),  # the username.lower() is for caching
                "password": password,
            }
            async with session.post(
                "https://www.wattpad.com/auth/login?nextUrl=%2F&_data=routes%2Fauth.login",
                data=credentials,
            ) as response:
                if response.status != 204:
                    raise ValueError("Not a 204.")

                # Thanks https://stackoverflow.com/a/32281245
                cookies = {}
                for name, morsel in response.cookies.items():
                    cookies[name] = morsel.value

                if cookies:
                    return cookies
                raise ValueError("No cookies.")
# --- Models --- #
class Language(TypedDict):
    """The `language(name)` sub-object of a story."""

    name: str
class User(TypedDict):
    """The `user(username)` sub-object of a story."""

    username: str
class Part(TypedDict):
    """One entry of a story's `parts(id,title)` list."""

    id: int
    title: str
class Story(TypedDict):
    """Story object with the fields this module requests from the Wattpad API.

    The keys mirror the `fields=` query used when fetching a story.
    """

    # Identity and descriptive text.
    id: str
    title: str
    description: str
    tags: List[str]
    url: str
    cover: str  # passed to fetch_cover() as the image URL

    # Dates are kept as the strings the API returns.
    createDate: str
    modifyDate: str

    # Nested objects.
    language: Language
    user: User
    parts: List[Part]

    # Flags.
    completed: bool
    mature: bool
    isPaywalled: bool
story_ta = TypeAdapter(Story)
# --- API Calls --- #
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
async def fetch_story_from_partId(
    part_id: int, cookies: Optional[dict] = None
) -> Tuple[str, Story]:
    """Resolve a Part ID to its parent story.

    A single request returns both the story ID (`groupId`) and the story
    metadata (`group`), so callers need no second lookup.

    Args:
        part_id (int): ID of any part belonging to the story.
        cookies (Optional[dict]): Authorization cookies. When given they are
            sent with the request and the response is not cached.

    Raises:
        ClientResponseError: The API answered with an error status; retried
            with exponential backoff for up to 15 seconds before propagating.

    Returns:
        Tuple[str, Story]: The story ID as a string, and the validated story.
    """
    with start_action(action_type="api_fetch_storyFromPartId", part_id=part_id):
        # Cookies must be attached to the session, as the other fetchers do;
        # otherwise an authenticated lookup is sent anonymously.
        async with CachedSession(
            headers=headers, cookies=cookies, cache=None if cookies else cache
        ) as session:  # Don't cache requests with Cookies.
            async with session.get(
                f"https://www.wattpad.com/api/v3/story_parts/{part_id}?fields=groupId,group(tags,id,title,createDate,modifyDate,language(name),description,completed,mature,url,isPaywalled,user(username),parts(id,title),cover)"
            ) as response:
                response.raise_for_status()

                body = await response.json()

                return str(body["groupId"]), story_ta.validate_python(body["group"])
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
async def retrieve_story(story_id: int, cookies: Optional[dict] = None) -> dict:
async def retrieve_story(story_id: int, cookies: Optional[dict] = None) -> Story:
"""Taking a story_id, return its information from the Wattpad API."""
async with (
CachedSession(headers=headers, cache=cache)
if not cookies
else ClientSession(headers=headers, cookies=cookies)
) as session: # Don't cache requests with Cookies.
with start_action(action_type="api_fetch_story", story_id=story_id):
async with CachedSession(
headers=headers, cookies=cookies, cache=None if cookies else cache
) as session:
async with session.get(
f"https://www.wattpad.com/api/v3/stories/{story_id}?fields=tags,id,title,createDate,modifyDate,language(name),description,completed,mature,url,isPaywalled,user(username),parts(id,title),cover"
) as response:
@@ -115,17 +229,16 @@ async def retrieve_story(story_id: int, cookies: Optional[dict] = None) -> dict:
body = await response.json()
return body
return story_ta.validate_python(body)
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
async def fetch_part_content(part_id: int, cookies: Optional[dict] = None) -> str:
"""Return the HTML Content of a Part."""
async with (
CachedSession(headers=headers, cache=cache)
if not cookies
else ClientSession(headers=headers, cookies=cookies)
) as session: # Don't cache requests with Cookies.
with start_action(action_type="api_fetch_partContent", part_id=part_id):
async with CachedSession(
headers=headers, cookies=cookies, cache=None if cookies else cache
) as session:
async with session.get(
f"https://www.wattpad.com/apiv2/?m=storytext&id={part_id}"
) as response:
@@ -137,13 +250,12 @@ async def fetch_part_content(part_id: int, cookies: Optional[dict] = None) -> st
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
async def fetch_cover(url: str, cookies: Optional[dict] = None) -> bytes:
"""Fetch image bytes."""
async with (
CachedSession(headers=headers, cache=cache)
if not cookies
else ClientSession(headers=headers, cookies=cookies)
) as session: # Don't cache requests with Cookies.
async def fetch_cover(url: str) -> bytes:
"""Fetch cover image bytes."""
with start_action(action_type="api_fetch_cover", url=url):
async with CachedSession(
headers=headers, cache=None
) as session: # Don't cache images.
async with session.get(url) as response:
response.raise_for_status()
@@ -155,7 +267,8 @@ async def fetch_cover(url: str, cookies: Optional[dict] = None) -> bytes:
# --- EPUB Generation --- #
def set_metadata(book, data):
def set_metadata(book: EpubBook, data: Story) -> None:
"""Set book metadata."""
book.add_author(data["user"]["username"])
book.add_metadata("DC", "title", data["title"])
@@ -175,16 +288,20 @@ def set_metadata(book, data):
)
async def set_cover(book, data, cookies: Optional[dict] = None):
book.set_cover("cover.jpg", await fetch_cover(data["cover"], cookies=cookies))
async def set_cover(book: EpubBook, data: Story) -> None:
"""Set book cover."""
book.set_cover("cover.jpg", await fetch_cover(data["cover"]))
chapter = epub.EpubHtml(
file_name=f"titlepage.xhtml", # Standard for cover page
file_name="titlepage.xhtml", # Standard for cover page
)
chapter.set_content('<img src="cover.jpg">')
async def add_chapters(
book, data, download_images: bool = False, cookies: Optional[dict] = None
book: EpubBook,
data: Story,
download_images: bool = False,
cookies: Optional[dict] = None,
):
chapters = []
@@ -202,11 +319,9 @@ async def add_chapters(
if download_images:
soup = BeautifulSoup(content, "lxml")
async with (
CachedSession(headers=headers, cache=cache)
if not cookies
else ClientSession(headers=headers, cookies=cookies)
) as session: # Don't cache requests with Cookies.
async with CachedSession(
headers=headers, cache=None
) as session: # Don't cache images.
for idx, image in enumerate(soup.find_all("img")):
if not image["src"]:
continue
@@ -234,7 +349,7 @@ async def add_chapters(
for chapter in chapters:
book.add_item(chapter)
book.toc = tuple(chapters)
book.toc = chapters
# Thanks https://github.com/aerkalov/ebooklib/blob/master/samples/09_create_image/create.py
book.add_item(epub.EpubNcx())
+59 -9
View File
@@ -1,10 +1,12 @@
"""WattpadDownloader API Server."""
from typing import Optional
import asyncio
import tempfile
from pathlib import Path
from io import BytesIO
from enum import Enum
from eliot import start_action
from aiohttp import ClientResponseError
from fastapi import FastAPI, Request
from fastapi.responses import FileResponse, HTMLResponse, StreamingResponse
@@ -17,9 +19,11 @@ from create_book import (
add_chapters,
slugify,
wp_get_cookies,
fetch_story_id,
fetch_story_from_partId,
logger,
)
app = FastAPI()
BUILD_PATH = Path(__file__).parent / "build"
@@ -28,10 +32,46 @@ headers = {
}
class RequestCancelledMiddleware:
    """ASGI middleware that cancels the request handler on client disconnect.

    For HTTP scopes, incoming ASGI messages are read by a background poller
    and relayed to the wrapped app through a queue. When the poller sees an
    `http.disconnect` message it cancels the handler task instead of
    forwarding the message. Non-HTTP scopes are passed straight through.
    """

    # Thanks https://github.com/fastapi/fastapi/discussions/11360#discussion-6427734
    def __init__(self, app):
        self.app = app

    async def __call__(self, scope, receive, send):
        if scope["type"] != "http":
            await self.app(scope, receive, send)
            return

        # Let's make a shared queue for the request messages
        queue = asyncio.Queue()

        async def message_poller(handler_task):
            while True:
                message = await receive()
                if message["type"] == "http.disconnect":
                    handler_task.cancel()
                    return  # Break the loop

                # Puts the message in the queue
                await queue.put(message)

        handler_task = asyncio.create_task(self.app(scope, queue.get, send))
        # Keep a reference: the event loop only holds weak references to
        # tasks, and the poller must be stopped once the handler is done.
        poller_task = asyncio.create_task(message_poller(handler_task))

        try:
            return await handler_task
        except asyncio.CancelledError:
            logger.info("Cancelling task as connection closed")
        finally:
            # Without this the poller stays blocked on receive() after a
            # normal response, leaking one task per request.
            poller_task.cancel()
app.add_middleware(RequestCancelledMiddleware)
class DownloadMode(Enum):
    """What kind of ID a download request carries."""

    # Member values are the literal strings accepted for the `mode` parameter.
    story = "story"
    part = "part"
    collection = "collection"
@app.get("/")
@@ -68,8 +108,17 @@ async def handle_download(
mode: DownloadMode = DownloadMode.story,
username: Optional[str] = None,
password: Optional[str] = None,
):
with start_action(
action_type="download",
download_id=download_id,
download_images=download_images,
mode=mode,
):
if username and not password or password and not username:
logger.error(
"Username with no Password or Password with no Username provided."
)
return HTMLResponse(
status_code=422,
content='Include both the username <u>and</u> password, or neither. Support is available on the <a href="https://discord.gg/P9RHC4KCwd" target="_blank">Discord</a>',
@@ -80,6 +129,7 @@ async def handle_download(
try:
cookies = await wp_get_cookies(username=username, password=password)
except ValueError:
logger.error("Invalid username or password.")
return HTMLResponse(
status_code=403,
content='Incorrect Username and/or Password. Support is available on the <a href="https://discord.gg/P9RHC4KCwd" target="_blank">Discord</a>',
@@ -90,15 +140,15 @@ async def handle_download(
match mode:
case DownloadMode.story:
story_id = download_id
metadata = await retrieve_story(story_id, cookies)
case DownloadMode.part:
story_id = await fetch_story_id(download_id, cookies)
story_id, metadata = await fetch_story_from_partId(download_id, cookies)
logger.info(f"Retrieved story id ({story_id=})")
book = epub.EpubBook()
metadata = await retrieve_story(story_id, cookies)
set_metadata(book, metadata)
await set_cover(book, metadata, cookies=cookies)
await set_cover(book, metadata)
async for title in add_chapters(
book, metadata, download_images=download_images, cookies=cookies
@@ -120,7 +170,7 @@ async def handle_download(
BytesIO(book_data),
media_type="application/epub+zip",
headers={
"Content-Disposition": f'attachment; filename="{slugify(metadata["title"])}_{story_id}_{"images" if download_images else ""}.epub"' # Thanks https://stackoverflow.com/a/72729058
"Content-Disposition": f'attachment; filename="{slugify(metadata["title"])}_{story_id}{"_images" if download_images else ""}.epub"' # Thanks https://stackoverflow.com/a/72729058
},
)
@@ -131,4 +181,4 @@ app.mount("/", StaticFiles(directory=BUILD_PATH), "static")
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=80)
uvicorn.run("main:app", host="0.0.0.0", port=80, workers=16)
+1083
View File
File diff suppressed because it is too large Load Diff
+18 -5
View File
@@ -31,12 +31,13 @@
input_url = input_url.toLowerCase();
invalid_url = false;
if (!input_url.includes("wattpad.com/")) {
invalid_url = true;
}
// Originally, I was going to call the Wattpad API (wattpad.com/api/v3/stories/${story_id}), but Wattpad kept blocking those requests. I suspect it has something to do with the Origin header, I wasn't able to remove it.
// In the future, if this is considered, it would be cool if we could derive the Story ID from a pasted Part URL. Refer to @AaronBenDaniel's https://github.com/AaronBenDaniel/WattpadDownloader/blob/49b29b245188149f2d24c0b1c59e4c7f90f289a9/src/api/src/create_book.py#L156 (https://www.wattpad.com/api/v3/story_parts/{part_id}?fields=url).
if (/^\d+$/.test(input_url)) {
// All numbers
download_id = input_url;
mode = "story";
} else if (input_url.includes("wattpad.com/")) {
// Is a string that contains wattpad.com/
if (input_url.includes("/story/")) {
// https://wattpad.com/story/237369078-wattpad-books-presents
@@ -61,6 +62,18 @@
download_id = "";
}
}
} else {
invalid_url = true;
}
input_url = input_url.match(/\d+/g)?.join("") || "";
download_id = input_url;
// Originally, I was going to call the Wattpad API (wattpad.com/api/v3/stories/${story_id}), but Wattpad kept blocking those requests. I suspect it has something to do with the Origin header, I wasn't able to remove it.
// In the future, if this is considered, it would be cool if we could derive the Story ID from a pasted Part URL. Refer to @AaronBenDaniel's https://github.com/AaronBenDaniel/WattpadDownloader/blob/49b29b245188149f2d24c0b1c59e4c7f90f289a9/src/api/src/create_book.py#L156 (https://www.wattpad.com/api/v3/story_parts/{part_id}?fields=url).
} else {
invalid_url = false;
download_id = "";
}
}
</script>