Merge pull request #23 from TheOnlyWayUp/fix/#22-redis-cache

Concurrent requests fail

Co-authored-by: AaronBenDaniel <144371000+AaronBenDaniel@users.noreply.github.com>
2024-12-01 03:48:07 +05:30
parent 96877d9c9b ec700ce284
commit 5f0676a19d
9 changed files with 1504 additions and 194 deletions
@@ -5,3 +5,6 @@ data
 *ipynb
 build
 .vscode
 .venv
 .env
 *log
@@ -0,0 +1,3 @@
 USE_CACHE=true
 CACHE_TYPE=file
 REDIS_CONNECTION_URL=
@@ -0,0 +1 @@
 3.10
@@ -0,0 +1,20 @@
 # Package metadata and runtime dependencies for the Wattpad Downloader API.
 [project]
 name = "api"
 version = "0.1.0"
 description = "Wattpad Downloader API"
 readme = "../../README.md"
 # The API code uses `match` statements, which require Python 3.10+.
 requires-python = ">=3.10"
 # NOTE(review): the API code also imports backoff, bs4 (with the "lxml"
 # parser) and uvicorn, none of which are listed below - confirm they are
 # installed some other way (e.g. transitively or via requirements.txt).
 dependencies = [
    "aiohttp>=3.9.1",
    "aiohttp-client-cache[all]>=0.10.0",
    "rich>=13.9.4",
    "fastapi>=0.115.5",
    "ebooklib>=0.18",
    "python-dotenv>=1.0.1",
    "pydantic-settings>=2.6.1",
    "eliot>=1.16.0",
    # NOTE(review): create_book.py imports `typing_extensions`, which is
    # distributed as "typing-extensions"; confirm that "type-extensions" is
    # really the intended package and not a typo.
    "type-extensions>=0.1.2",
 ]
 [tool.ruff.lint]
 ignore = ['E402']
@@ -1,24 +1,32 @@
 aioboto3==12.4.0
 aiobotocore==2.12.3
 aiofiles==23.2.1
 aiohttp==3.9.1
 aiohttp-client-cache==0.10.0
 aioitertools==0.12.0
 aiosignal==1.3.1
 aiosqlite==0.19.0
-annotated-types==0.6.0
+annotated-types==0.7.0
-anyio==4.2.0
+anyio==4.6.2.post1
 asttokens==2.4.1
 async-timeout==4.0.3
 attrs==23.1.0
 backoff==2.2.1
 beautifulsoup4==4.12.3
 boltons==24.1.0
 boto3==1.34.69
 botocore==1.34.69
 bs4==0.0.2
 click==8.1.7
 comm==0.2.0
 debugpy==1.8.0
 decorator==5.1.1
-EbookLib==0.18
+dnspython==2.7.0
-exceptiongroup==1.2.0
+ebooklib==0.18
 eliot==1.16.0
 exceptiongroup==1.2.2
 executing==2.0.1
-fastapi==0.108.0
+fastapi==0.115.5
 frozenlist==1.4.1
 h11==0.14.0
 idna==3.6
@@ -26,14 +34,17 @@ ipykernel==6.28.0
 ipython==8.19.0
 itsdangerous==2.1.2
 jedi==0.19.1
-jupyter_client==8.6.0
+jmespath==1.0.1
-jupyter_core==5.5.1
+jupyter-client==8.6.0
-lxml==4.9.4
+jupyter-core==5.5.1
 lxml==5.3.0
 markdown-it-py==3.0.0
 matplotlib-inline==0.1.6
 mdurl==0.1.2
 motor==3.6.0
 multidict==6.0.4
 nest-asyncio==1.5.8
 orjson==3.10.12
 packaging==23.2
 parso==0.8.3
 pexpect==4.9.0
@@ -42,21 +53,32 @@ prompt-toolkit==3.0.43
 psutil==5.9.7
 ptyprocess==0.7.0
 pure-eval==0.2.2
-pydantic==2.5.3
+pydantic==2.10.2
-pydantic_core==2.14.6
+pydantic-core==2.27.1
-Pygments==2.17.2
+pydantic-settings==2.6.1
 pygments==2.18.0
 pymongo==4.9.2
 pyrsistent==0.20.0
 python-dateutil==2.8.2
 python-dotenv==1.0.1
 pyzmq==25.1.2
-rich==13.7.0
+redis==5.2.0
 rich==13.9.4
 s3transfer==0.10.4
 setuptools==75.6.0
 six==1.16.0
-sniffio==1.3.0
+sniffio==1.3.1
 soupsieve==2.5
 stack-data==0.6.3
-starlette==0.32.0.post1
+starlette==0.41.3
 tornado==6.4
 traitlets==5.14.0
-typing_extensions==4.9.0
+type-extensions==0.1.2
 typing-extensions==4.12.2
 url-normalize==1.4.3
 urllib3==2.2.3
 uvicorn==0.25.0
 wcwidth==0.2.12
 wrapt==1.17.0
 yarl==1.9.4
 zope-interface==7.2
@@ -1,61 +1,102 @@
-import asyncio
+from typing import List, Optional, Tuple
-from typing import Optional
+from typing_extensions import TypedDict
 from ebooklib import epub
 import unicodedata
 import re
 import unicodedata
 import logging
 from os import environ
 from enum import Enum
 import backoff
-from aiohttp import ClientResponseError, ClientSession
+from eliot import to_file, start_action
-from aiohttp_client_cache.session import CachedSession
+from eliot.stdlib import EliotHandler
-from aiohttp_client_cache import FileBackend
+from dotenv import load_dotenv
 from ebooklib import epub
 from ebooklib.epub import EpubBook
 from bs4 import BeautifulSoup
 from pydantic import TypeAdapter, model_validator, field_validator
 from pydantic_settings import BaseSettings
 from aiohttp import ClientResponseError
 from aiohttp_client_cache.session import CachedSession
 from aiohttp_client_cache import FileBackend, RedisBackend
 # Load settings from a .env file; override=True lets values from .env replace
 # variables that are already set in the process environment.
 load_dotenv(override=True)
 # Single handler that forwards stdlib logging records into Eliot.
 handler = EliotHandler()
 logging.getLogger("fastapi").setLevel(logging.INFO)
 logging.getLogger("fastapi").addHandler(handler)
 # Any non-empty DEBUG value (including "0" or "false") enables the Eliot log
 # file, because only the truthiness of the string is checked.
 if environ.get("DEBUG"):
    # The file object is handed to Eliot and never explicitly closed here.
    to_file(open("eliot.log", "wb"))
 # NOTE(review): the logger is constructed directly instead of through
 # logging.getLogger("wpd"), so it is not registered in the logging hierarchy
 # and only the handler added below receives its records - confirm intended.
 logger = logging.Logger("wpd")
 logger.addHandler(handler)
 # --- #
 class CacheTypes(Enum):
    """Cache backends that can be selected with the CACHE_TYPE setting."""
    # Selects aiohttp_client_cache's FileBackend.
    file = "file"
    # Selects aiohttp_client_cache's RedisBackend; needs REDIS_CONNECTION_URL.
    redis = "redis"
 class Config(BaseSettings):
    """Cache-related settings for the API, loaded through pydantic-settings.
    Empty-string values for USE_CACHE and CACHE_TYPE are mapped back to their
    defaults, and the combination of CACHE_TYPE and REDIS_CONNECTION_URL is
    checked for consistency after the fields have been validated.
    """
    # Whether HTTP responses should be cached at all.
    USE_CACHE: bool = True
    # Which backend to use when caching is enabled.
    CACHE_TYPE: CacheTypes = CacheTypes.file
    # Redis address; must be set if and only if CACHE_TYPE is redis.
    REDIS_CONNECTION_URL: str = ""
    @field_validator("USE_CACHE", mode="before")
    def validate_use_cache(cls, value):
        """Map an empty USE_CACHE value to the default before bool parsing."""
        # Return default if value is an empty string
        if value == "":
            return True  # Default value for USE_CACHE
        return value
    @field_validator("CACHE_TYPE", mode="before")
    def validate_cache_type(cls, value):
        """Map an empty CACHE_TYPE value to "file" before enum parsing."""
        # Thanks https://stackoverflow.com/a/78157474
        if value == "":
            return "file"
        return value
    @model_validator(mode="after")
    def prevent_mismatched_redis_url(self):
        """Reject settings where CACHE_TYPE and REDIS_CONNECTION_URL disagree.
        Raises:
            ValueError: A Redis URL is given with the file cache, or the Redis
                cache is selected without a URL.
        """
        # This check does not look at USE_CACHE, so it also applies when
        # caching is disabled.
        match self.CACHE_TYPE:
            case CacheTypes.file:
                if self.REDIS_CONNECTION_URL:
                    raise ValueError(
                        "REDIS_CONNECTION_URL provided when File cache selected. To use Redis as a cache, set CACHE_TYPE=redis."
                    )
            case CacheTypes.redis:
                if not self.REDIS_CONNECTION_URL:
                    raise ValueError(
                        "REDIS_CONNECTION_URL not provided when Redis cache selected. To use File cache, set CACHE_TYPE=file."
                    )
        return self
 # Settings are read once, at import time.
 config = Config()
 # --- #
 # Browser-like User-Agent passed to every session created in this module.
 headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
 }
 # Build the shared cache backend, or None when caching is disabled.
 if config.USE_CACHE:
    match config.CACHE_TYPE:
        case CacheTypes.file:
            cache = FileBackend(use_temp=True, expire_after=43200)  # 12 hours
        case CacheTypes.redis:
            # NOTE(review): no expire_after is passed here, unlike the file
            # backend above - confirm that the library default expiry is the
            # intended behaviour for Redis.
            cache = RedisBackend(
                cache_name="wpd-aiohttp-cache", address=config.REDIS_CONNECTION_URL
            )
 else:
    cache = None
 logger.info(f"Using {cache=}")
 # --- Utilities --- #
 async def wp_get_cookies(username: str, password: str) -> dict:
    # source: https://github.com/TheOnlyWayUp/WP-DM-Export/blob/dd4c7c51cb43f2108e0f63fc10a66cd24a740e4e/src/API/src/main.py#L25-L58
    """Retrieves authorization cookies from Wattpad by logging in with user creds.
    Args:
        username (str): Username.
        password (str): Password.
    Raises:
        ValueError: Bad status code.
        ValueError: No cookies returned.
    Returns:
        dict: Authorization cookies.
    """
    async with ClientSession(headers=headers) as session:
        async with session.post(
            "https://www.wattpad.com/auth/login?nextUrl=%2F&_data=routes%2Fauth.login",
            data={
                "username": username.lower(),
                "password": password,
            },  # the username.lower() is for caching
        ) as response:
            if response.status != 204:
                raise ValueError("Not a 204.")
            cookies = {
                k: v.value
                for k, v in response.cookies.items()  # Thanks https://stackoverflow.com/a/32281245
            }
            if not cookies:
                raise ValueError("No cookies.")
            return cookies
 def slugify(value, allow_unicode=False) -> str:
    """
    Taken from https://github.com/django/django/blob/master/django/utils/text.py
@@ -79,35 +120,108 @@ def slugify(value, allow_unicode=False) -> str:
    return re.sub(r"[-\s]+", "-", value).strip("-_")
 async def wp_get_cookies(username: str, password: str) -> dict:
    # source: https://github.com/TheOnlyWayUp/WP-DM-Export/blob/dd4c7c51cb43f2108e0f63fc10a66cd24a740e4e/src/API/src/main.py#L25-L58
    """Log in to Wattpad and return the cookies set by the login response.
    The request is wrapped in an Eliot "api_fetch_cookies" action and is sent
    through a CachedSession created with cache=None.
    Args:
        username (str): Wattpad username; lower-cased before it is sent.
        password (str): Wattpad password, sent unchanged.
    Raises:
        ValueError: The login response status is not 204.
        ValueError: The login response set no cookies.
    Returns:
        dict: Cookie names mapped to their values.
    """
    with start_action(action_type="api_fetch_cookies"):
        # cache=None: the login request is made without a cache backend.
        async with CachedSession(headers=headers, cache=None) as session:
            async with session.post(
                "https://www.wattpad.com/auth/login?nextUrl=%2F&_data=routes%2Fauth.login",
                data={
                    "username": username.lower(),
                    "password": password,
                },  # the username.lower() is for caching
                # NOTE(review): this session is created with cache=None, so the
                # "for caching" remark above looks stale - confirm.
            ) as response:
                # Only a 204 response is treated as a successful login.
                if response.status != 204:
                    raise ValueError("Not a 204.")
                # Flatten the response cookies into a plain name -> value dict.
                cookies = {
                    k: v.value
                    for k, v in response.cookies.items()  # Thanks https://stackoverflow.com/a/32281245
                }
                if not cookies:
                    raise ValueError("No cookies.")
                return cookies
 # --- Models --- #
 class Language(TypedDict):
    """Language object of a story, as requested with `language(name)`."""
    name: str
 class User(TypedDict):
    """Author object of a story, as requested with `user(username)`."""
    # Used as the EPUB author by set_metadata.
    username: str
 class Part(TypedDict):
    """One part of a story, as requested with `parts(id,title)`."""
    # Typed as int here, whereas Story.id is typed as str.
    id: int
    title: str
 class Story(TypedDict):
    """Story object returned by the Wattpad API.
    The keys mirror the `fields=` list that retrieve_story and
    fetch_story_from_partId send with their requests.
    """
    id: str
    # Used for the EPUB title and for the download file name.
    title: str
    createDate: str
    modifyDate: str
    language: Language
    user: User
    description: str
    # Passed to fetch_cover as the URL of the cover image.
    cover: str
    completed: bool
    tags: List[str]
    mature: bool
    url: str
    parts: List[Part]
    isPaywalled: bool
 # Adapter used to validate API response bodies against Story.
 story_ta = TypeAdapter(Story)
 # --- API Calls --- #
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
-async def fetch_story_id(part_id: int, cookies: Optional[dict] = None) -> int:
+async def fetch_story_from_partId(
    part_id: int, cookies: Optional[dict] = None
 ) -> Tuple[str, Story]:
    """Return a Story ID from a Part ID."""
-    async with (
+    with start_action(action_type="api_fetch_storyFromPartId"):
-        CachedSession(headers=headers, cache=cache)
+        async with CachedSession(
-        if not cookies
+            headers=headers, cache=None if cookies else cache
        else ClientSession(headers=headers, cookies=cookies)
        ) as session:  # Don't cache requests with Cookies.
            async with session.get(
-            f"https://www.wattpad.com/api/v3/story_parts/{part_id}?fields=groupId"
+                f"https://www.wattpad.com/api/v3/story_parts/{part_id}?fields=groupId,group(tags,id,title,createDate,modifyDate,language(name),description,completed,mature,url,isPaywalled,user(username),parts(id,title),cover)"
            ) as response:
                response.raise_for_status()
                body = await response.json()
-    return body["groupId"]
+        return str(body["groupId"]), story_ta.validate_python(body["group"])
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
-async def retrieve_story(story_id: int, cookies: Optional[dict] = None) -> dict:
+async def retrieve_story(story_id: int, cookies: Optional[dict] = None) -> Story:
    """Taking a story_id, return its information from the Wattpad API."""
-    async with (
+    with start_action(action_type="api_fetch_story", story_id=story_id):
-        CachedSession(headers=headers, cache=cache)
+        async with CachedSession(
-        if not cookies
+            headers=headers, cookies=cookies, cache=None if cookies else cache
-        else ClientSession(headers=headers, cookies=cookies)
+        ) as session:
    ) as session:  # Don't cache requests with Cookies.
            async with session.get(
                f"https://www.wattpad.com/api/v3/stories/{story_id}?fields=tags,id,title,createDate,modifyDate,language(name),description,completed,mature,url,isPaywalled,user(username),parts(id,title),cover"
            ) as response:
@@ -115,17 +229,16 @@ async def retrieve_story(story_id: int, cookies: Optional[dict] = None) -> dict:
                body = await response.json()
-    return body
+        return story_ta.validate_python(body)
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
 async def fetch_part_content(part_id: int, cookies: Optional[dict] = None) -> str:
    """Return the HTML Content of a Part."""
-    async with (
+    with start_action(action_type="api_fetch_partContent", part_id=part_id):
-        CachedSession(headers=headers, cache=cache)
+        async with CachedSession(
-        if not cookies
+            headers=headers, cookies=cookies, cache=None if cookies else cache
-        else ClientSession(headers=headers, cookies=cookies)
+        ) as session:
    ) as session:  # Don't cache requests with Cookies.
            async with session.get(
                f"https://www.wattpad.com/apiv2/?m=storytext&id={part_id}"
            ) as response:
@@ -137,13 +250,12 @@ async def fetch_part_content(part_id: int, cookies: Optional[dict] = None) -> st
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
-async def fetch_cover(url: str, cookies: Optional[dict] = None) -> bytes:
+async def fetch_cover(url: str) -> bytes:
-    """Fetch image bytes."""
+    """Fetch cover image bytes."""
-    async with (
+    with start_action(action_type="api_fetch_cover", url=url):
-        CachedSession(headers=headers, cache=cache)
+        async with CachedSession(
-        if not cookies
+            headers=headers, cache=None
-        else ClientSession(headers=headers, cookies=cookies)
+        ) as session:  # Don't cache images.
    ) as session:  # Don't cache requests with Cookies.
            async with session.get(url) as response:
                response.raise_for_status()
@@ -155,7 +267,8 @@ async def fetch_cover(url: str, cookies: Optional[dict] = None) -> bytes:
 # --- EPUB Generation --- #
-def set_metadata(book, data):
+def set_metadata(book: EpubBook, data: Story) -> None:
    """Set book metadata."""
    book.add_author(data["user"]["username"])
    book.add_metadata("DC", "title", data["title"])
@@ -175,16 +288,20 @@ def set_metadata(book, data):
    )
-async def set_cover(book, data, cookies: Optional[dict] = None):
+async def set_cover(book: EpubBook, data: Story) -> None:
-    book.set_cover("cover.jpg", await fetch_cover(data["cover"], cookies=cookies))
+    """Set book cover."""
    book.set_cover("cover.jpg", await fetch_cover(data["cover"]))
    chapter = epub.EpubHtml(
-        file_name=f"titlepage.xhtml",  # Standard for cover page
+        file_name="titlepage.xhtml",  # Standard for cover page
    )
    chapter.set_content('<img src="cover.jpg">')
 async def add_chapters(
-    book, data, download_images: bool = False, cookies: Optional[dict] = None
+    book: EpubBook,
    data: Story,
    download_images: bool = False,
    cookies: Optional[dict] = None,
 ):
    chapters = []
@@ -202,11 +319,9 @@ async def add_chapters(
        if download_images:
            soup = BeautifulSoup(content, "lxml")
-            async with (
+            async with CachedSession(
-                CachedSession(headers=headers, cache=cache)
+                headers=headers, cache=None
-                if not cookies
+            ) as session:  # Don't cache images.
                else ClientSession(headers=headers, cookies=cookies)
            ) as session:  # Don't cache requests with Cookies.
                for idx, image in enumerate(soup.find_all("img")):
                    if not image["src"]:
                        continue
@@ -234,7 +349,7 @@ async def add_chapters(
    for chapter in chapters:
        book.add_item(chapter)
-    book.toc = tuple(chapters)
+    book.toc = chapters
    # Thanks https://github.com/aerkalov/ebooklib/blob/master/samples/09_create_image/create.py
    book.add_item(epub.EpubNcx())
@@ -1,10 +1,12 @@
 """WattpadDownloader API Server."""
 from typing import Optional
 import asyncio
 import tempfile
 from pathlib import Path
 from io import BytesIO
 from enum import Enum
 from eliot import start_action
 from aiohttp import ClientResponseError
 from fastapi import FastAPI, Request
 from fastapi.responses import FileResponse, HTMLResponse, StreamingResponse
@@ -17,9 +19,11 @@ from create_book import (
    add_chapters,
    slugify,
    wp_get_cookies,
-    fetch_story_id,
+    fetch_story_from_partId,
    logger,
 )
 app = FastAPI()
 BUILD_PATH = Path(__file__).parent / "build"
@@ -28,10 +32,46 @@ headers = {
 }
 class RequestCancelledMiddleware:
    """ASGI middleware that cancels the request handler on client disconnect.
    For "http" scopes the wrapped app runs in its own task and reads request
    messages from an internal queue. A second task forwards messages from the
    real `receive` into that queue and cancels the handler task as soon as an
    "http.disconnect" message arrives. Other scope types are passed straight
    through to the wrapped app.
    """
    # Thanks https://github.com/fastapi/fastapi/discussions/11360#discussion-6427734
    def __init__(self, app):
        # The wrapped ASGI application.
        self.app = app
    async def __call__(self, scope, receive, send):
        # Only HTTP requests are monitored; everything else is delegated as-is.
        if scope["type"] != "http":
            await self.app(scope, receive, send)
            return
        # Let's make a shared queue for the request messages
        queue = asyncio.Queue()
        async def message_poller(sentinel, handler_task):
            # queue is only read here, never rebound.
            nonlocal queue
            while True:
                message = await receive()
                # The disconnect message is not forwarded to the app; the
                # handler task is cancelled instead.
                if message["type"] == "http.disconnect":
                    handler_task.cancel()
                    return sentinel  # Break the loop
                # Puts the message in the queue
                await queue.put(message)
        sentinel = object()
        # The app receives queue.get in place of the real receive callable.
        handler_task = asyncio.create_task(self.app(scope, queue.get, send))
        # NOTE(review): the poller task is neither stored nor awaited, and
        # nothing cancels it once the handler has finished. The asyncio docs
        # recommend keeping a reference to tasks made with create_task -
        # confirm this fire-and-forget use is acceptable.
        asyncio.create_task(message_poller(sentinel, handler_task))
        try:
            return await handler_task
        except asyncio.CancelledError:
            # A CancelledError raised while awaiting the handler (e.g. after
            # the poller cancelled it) is logged and not re-raised.
            logger.info("Cancelling task as connection closed")
 app.add_middleware(RequestCancelledMiddleware)
 class DownloadMode(Enum):
    """Values accepted by the `mode` parameter of handle_download."""
    # download_id is used directly as the story ID.
    story = "story"
    # download_id is a part ID; the story is looked up from it.
    part = "part"
    # NOTE(review): the visible part of handle_download only matches on
    # story and part - confirm where collection is handled.
    collection = "collection"
@app.get("/")
@@ -68,8 +108,17 @@ async def handle_download(
    mode: DownloadMode = DownloadMode.story,
    username: Optional[str] = None,
    password: Optional[str] = None,
 ):
    with start_action(
        action_type="download",
        download_id=download_id,
        download_images=download_images,
        mode=mode,
    ):
        if username and not password or password and not username:
            logger.error(
                "Username with no Password or Password with no Username provided."
            )
            return HTMLResponse(
                status_code=422,
                content='Include both the username <u>and</u> password, or neither. Support is available on the <a href="https://discord.gg/P9RHC4KCwd" target="_blank">Discord</a>',
@@ -80,6 +129,7 @@ async def handle_download(
            try:
                cookies = await wp_get_cookies(username=username, password=password)
            except ValueError:
                logger.error("Invalid username or password.")
                return HTMLResponse(
                    status_code=403,
                    content='Incorrect Username and/or Password. Support is available on the <a href="https://discord.gg/P9RHC4KCwd" target="_blank">Discord</a>',
@@ -90,15 +140,15 @@ async def handle_download(
        match mode:
            case DownloadMode.story:
                story_id = download_id
                metadata = await retrieve_story(story_id, cookies)
            case DownloadMode.part:
-            story_id = await fetch_story_id(download_id, cookies)
+                story_id, metadata = await fetch_story_from_partId(download_id, cookies)
        logger.info(f"Retrieved story id ({story_id=})")
        book = epub.EpubBook()
    metadata = await retrieve_story(story_id, cookies)
        set_metadata(book, metadata)
-
+        await set_cover(book, metadata)
    await set_cover(book, metadata, cookies=cookies)
        async for title in add_chapters(
            book, metadata, download_images=download_images, cookies=cookies
@@ -120,7 +170,7 @@ async def handle_download(
            BytesIO(book_data),
            media_type="application/epub+zip",
            headers={
-            "Content-Disposition": f'attachment; filename="{slugify(metadata["title"])}_{story_id}_{"images" if download_images else ""}.epub"'  # Thanks https://stackoverflow.com/a/72729058
+                "Content-Disposition": f'attachment; filename="{slugify(metadata["title"])}_{story_id}{"_images" if download_images else ""}.epub"'  # Thanks https://stackoverflow.com/a/72729058
            },
        )
@@ -131,4 +181,4 @@ app.mount("/", StaticFiles(directory=BUILD_PATH), "static")
 if __name__ == "__main__":
    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=80)
+    uvicorn.run("main:app", host="0.0.0.0", port=80, workers=16)
@@ -31,12 +31,13 @@
      input_url = input_url.toLowerCase();
      invalid_url = false;
      if (!input_url.includes("wattpad.com/")) {
        invalid_url = true;
      }
-      // Originally, I was going to call the Wattpad API (wattpad.com/api/v3/stories/${story_id}), but Wattpad kept blocking those requests. I suspect it has something to do with the Origin header, I wasn't able to remove it.
+      if (/^\d+$/.test(input_url)) {
-      // In the future, if this is considered, it would be cool if we could derive the Story ID from a pasted Part URL. Refer to @AaronBenDaniel's https://github.com/AaronBenDaniel/WattpadDownloader/blob/49b29b245188149f2d24c0b1c59e4c7f90f289a9/src/api/src/create_book.py#L156 (https://www.wattpad.com/api/v3/story_parts/{part_id}?fields=url).
+        // All numbers
        download_id = input_url;
        mode = "story";
      } else if (input_url.includes("wattpad.com/")) {
        // Is a string and contains wattpad.com/
        if (input_url.includes("/story/")) {
          // https://wattpad.com/story/237369078-wattpad-books-presents
@@ -61,6 +62,18 @@
            download_id = "";
          }
        }
      } else {
        invalid_url = true;
      }
      input_url = input_url.match(/\d+/g)?.join("") || "";
      download_id = input_url;
      // Originally, I was going to call the Wattpad API (wattpad.com/api/v3/stories/${story_id}), but Wattpad kept blocking those requests. I suspect it has something to do with the Origin header, I wasn't able to remove it.
      // In the future, if this is considered, it would be cool if we could derive the Story ID from a pasted Part URL. Refer to @AaronBenDaniel's https://github.com/AaronBenDaniel/WattpadDownloader/blob/49b29b245188149f2d24c0b1c59e4c7f90f289a9/src/api/src/create_book.py#L156 (https://www.wattpad.com/api/v3/story_parts/{part_id}?fields=url).
    } else {
      invalid_url = false;
      download_id = "";
    }
  }
 </script>