Merge pull request #23 from TheOnlyWayUp/fix/#22-redis-cache

Concurrent requests fail Co-authored-by: AaronBenDaniel <144371000+AaronBenDaniel@users.noreply.github.com>
2024-12-01 03:48:07 +05:30
parent 96877d9c9b ec700ce284
commit 5f0676a19d
9 changed files with 1504 additions and 194 deletions
@@ -5,3 +5,6 @@ data
 *ipynb
 build
 .vscode
+.venv
+.env
+*log
@@ -0,0 +1,3 @@
+USE_CACHE=true
+CACHE_TYPE=file
+REDIS_CONNECTION_URL=
@@ -0,0 +1 @@
+3.10
@@ -0,0 +1,20 @@
+[project]
+name = "api"
+version = "0.1.0"
+description = "Wattpad Downloader API"
+readme = "../../README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "aiohttp>=3.9.1",
+    "aiohttp-client-cache[all]>=0.10.0",
+    "rich>=13.9.4",
+    "fastapi>=0.115.5",
+    "ebooklib>=0.18",
+    "python-dotenv>=1.0.1",
+    "pydantic-settings>=2.6.1",
+    "eliot>=1.16.0",
+    "type-extensions>=0.1.2",
+]
+
+[tool.ruff.lint]
+ignore = ['E402']
@@ -1,24 +1,32 @@
+aioboto3==12.4.0
+aiobotocore==2.12.3
 aiofiles==23.2.1
 aiohttp==3.9.1
 aiohttp-client-cache==0.10.0
+aioitertools==0.12.0
 aiosignal==1.3.1
 aiosqlite==0.19.0
-annotated-types==0.6.0
-anyio==4.2.0
+annotated-types==0.7.0
+anyio==4.6.2.post1
 asttokens==2.4.1
 async-timeout==4.0.3
 attrs==23.1.0
 backoff==2.2.1
 beautifulsoup4==4.12.3
+boltons==24.1.0
+boto3==1.34.69
+botocore==1.34.69
 bs4==0.0.2
 click==8.1.7
 comm==0.2.0
 debugpy==1.8.0
 decorator==5.1.1
-EbookLib==0.18
-exceptiongroup==1.2.0
+dnspython==2.7.0
+ebooklib==0.18
+eliot==1.16.0
+exceptiongroup==1.2.2
 executing==2.0.1
-fastapi==0.108.0
+fastapi==0.115.5
 frozenlist==1.4.1
 h11==0.14.0
 idna==3.6
@@ -26,14 +34,17 @@ ipykernel==6.28.0
 ipython==8.19.0
 itsdangerous==2.1.2
 jedi==0.19.1
-jupyter_client==8.6.0
-jupyter_core==5.5.1
-lxml==4.9.4
+jmespath==1.0.1
+jupyter-client==8.6.0
+jupyter-core==5.5.1
+lxml==5.3.0
 markdown-it-py==3.0.0
 matplotlib-inline==0.1.6
 mdurl==0.1.2
+motor==3.6.0
 multidict==6.0.4
 nest-asyncio==1.5.8
+orjson==3.10.12
 packaging==23.2
 parso==0.8.3
 pexpect==4.9.0
@@ -42,21 +53,32 @@ prompt-toolkit==3.0.43
 psutil==5.9.7
 ptyprocess==0.7.0
 pure-eval==0.2.2
-pydantic==2.5.3
-pydantic_core==2.14.6
-Pygments==2.17.2
+pydantic==2.10.2
+pydantic-core==2.27.1
+pydantic-settings==2.6.1
+pygments==2.18.0
+pymongo==4.9.2
+pyrsistent==0.20.0
 python-dateutil==2.8.2
+python-dotenv==1.0.1
 pyzmq==25.1.2
-rich==13.7.0
+redis==5.2.0
+rich==13.9.4
+s3transfer==0.10.4
+setuptools==75.6.0
 six==1.16.0
-sniffio==1.3.0
+sniffio==1.3.1
 soupsieve==2.5
 stack-data==0.6.3
-starlette==0.32.0.post1
+starlette==0.41.3
 tornado==6.4
 traitlets==5.14.0
-typing_extensions==4.9.0
+type-extensions==0.1.2
+typing-extensions==4.12.2
 url-normalize==1.4.3
+urllib3==2.2.3
 uvicorn==0.25.0
 wcwidth==0.2.12
+wrapt==1.17.0
 yarl==1.9.4
+zope-interface==7.2
@@ -1,61 +1,102 @@
-import asyncio
-from typing import Optional
-from ebooklib import epub
-import unicodedata
+from typing import List, Optional, Tuple
+from typing_extensions import TypedDict
 import re
+import unicodedata
+import logging
+from os import environ
+from enum import Enum
 import backoff
-from aiohttp import ClientResponseError, ClientSession
-from aiohttp_client_cache.session import CachedSession
-from aiohttp_client_cache import FileBackend
+from eliot import to_file, start_action
+from eliot.stdlib import EliotHandler
+from dotenv import load_dotenv
+from ebooklib import epub
+from ebooklib.epub import EpubBook
 from bs4 import BeautifulSoup
+from pydantic import TypeAdapter, model_validator, field_validator
+from pydantic_settings import BaseSettings
+from aiohttp import ClientResponseError
+from aiohttp_client_cache.session import CachedSession
+from aiohttp_client_cache import FileBackend, RedisBackend

+# Load variables from a .env file into the process environment before Config() is built below;
+# override=True lets .env values replace variables that are already set.
+load_dotenv(override=True)
+
+# Forward stdlib logging records to Eliot.
+handler = EliotHandler()
+logging.getLogger("fastapi").setLevel(logging.INFO)
+logging.getLogger("fastapi").addHandler(handler)
+
+# Eliot output is only written to eliot.log when DEBUG is set to a non-empty value.
+if environ.get("DEBUG"):
+    to_file(open("eliot.log", "wb"))
+
+# NOTE(review): logging.Logger(...) builds a standalone logger instead of the one returned by
+# logging.getLogger("wpd") -- confirm this is intended.
+logger = logging.Logger("wpd")
+logger.addHandler(handler)
+
+# --- #
+
+
+class CacheTypes(Enum):
+    """Cache backends that can be selected through Config.CACHE_TYPE."""
+
+    file = "file"
+    redis = "redis"
+
+
+class Config(BaseSettings):
+    """Cache settings, populated by pydantic-settings (environment variables).
+
+    Empty-string values for USE_CACHE and CACHE_TYPE fall back to their defaults, and the
+    CACHE_TYPE / REDIS_CONNECTION_URL combination is checked once all fields are validated.
+    """
+
+    USE_CACHE: bool = True
+    CACHE_TYPE: CacheTypes = CacheTypes.file
+    REDIS_CONNECTION_URL: str = ""
+
+    @field_validator("USE_CACHE", mode="before")
+    @classmethod
+    def validate_use_cache(cls, value):
+        """Treat an empty string as unset and return the default (True)."""
+        # Return default if value is an empty string
+        if value == "":
+            return True  # Default value for USE_CACHE
+        return value
+
+    @field_validator("CACHE_TYPE", mode="before")
+    @classmethod
+    def validate_cache_type(cls, value):
+        """Treat an empty string as unset and return the default ("file")."""
+        # Thanks https://stackoverflow.com/a/78157474
+        if value == "":
+            return "file"
+        return value
+
+    @model_validator(mode="after")
+    def prevent_mismatched_redis_url(self):
+        """Reject a Redis URL with the file cache, and the Redis cache without a URL.
+
+        Raises:
+            ValueError: CACHE_TYPE and REDIS_CONNECTION_URL do not agree.
+        """
+        match self.CACHE_TYPE:
+            case CacheTypes.file:
+                if self.REDIS_CONNECTION_URL:
+                    raise ValueError(
+                        "REDIS_CONNECTION_URL provided when File cache selected. To use Redis as a cache, set CACHE_TYPE=redis."
+                    )
+            case CacheTypes.redis:
+                if not self.REDIS_CONNECTION_URL:
+                    raise ValueError(
+                        "REDIS_CONNECTION_URL not provided when Redis cache selected. To use File cache, set CACHE_TYPE=file."
+                    )
+        return self
+
+
+config = Config()
+
+# --- #

 headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
 }

+# Pick the response cache backend from Config; cache stays None when caching is disabled.
+if config.USE_CACHE:
+    match config.CACHE_TYPE:
+        case CacheTypes.file:
            cache = FileBackend(use_temp=True, expire_after=43200)  # 12 hours
+        case CacheTypes.redis:
+            # Same 12 hour expiry as the file cache, so Redis entries do not live forever.
+            cache = RedisBackend(
+                cache_name="wpd-aiohttp-cache",
+                address=config.REDIS_CONNECTION_URL,
+                expire_after=43200,  # 12 hours
+            )
+else:
+    cache = None
+
+logger.info(f"Using {cache=}")

 # --- Utilities --- #


-async def wp_get_cookies(username: str, password: str) -> dict:
-    # source: https://github.com/TheOnlyWayUp/WP-DM-Export/blob/dd4c7c51cb43f2108e0f63fc10a66cd24a740e4e/src/API/src/main.py#L25-L58
-    """Retrieves authorization cookies from Wattpad by logging in with user creds.
-
-    Args:
-        username (str): Username.
-        password (str): Password.
-
-    Raises:
-        ValueError: Bad status code.
-        ValueError: No cookies returned.
-
-    Returns:
-        dict: Authorization cookies.
-    """
-    async with ClientSession(headers=headers) as session:
-        async with session.post(
-            "https://www.wattpad.com/auth/login?nextUrl=%2F&_data=routes%2Fauth.login",
-            data={
-                "username": username.lower(),
-                "password": password,
-            },  # the username.lower() is for caching
-        ) as response:
-            if response.status != 204:
-                raise ValueError("Not a 204.")
-
-            cookies = {
-                k: v.value
-                for k, v in response.cookies.items()  # Thanks https://stackoverflow.com/a/32281245
-            }
-
-            if not cookies:
-                raise ValueError("No cookies.")
-
-            return cookies
-
-
 def slugify(value, allow_unicode=False) -> str:
    """
    Taken from https://github.com/django/django/blob/master/django/utils/text.py
@@ -79,35 +120,108 @@ def slugify(value, allow_unicode=False) -> str:
    return re.sub(r"[-\s]+", "-", value).strip("-_")


+async def wp_get_cookies(username: str, password: str) -> dict:
+    # source: https://github.com/TheOnlyWayUp/WP-DM-Export/blob/dd4c7c51cb43f2108e0f63fc10a66cd24a740e4e/src/API/src/main.py#L25-L58
+    """Retrieves authorization cookies from Wattpad by logging in with user creds.
+
+    The login POST is sent through a session created with cache=None, inside an
+    "api_fetch_cookies" Eliot action.
+
+    Args:
+        username (str): Username. Lower-cased before it is sent.
+        password (str): Password.
+
+    Raises:
+        ValueError: The response status is not 204.
+        ValueError: The response carried no cookies.
+
+    Returns:
+        dict: Authorization cookies, mapping cookie name to cookie value.
+    """
+    with start_action(action_type="api_fetch_cookies"):
+        async with CachedSession(headers=headers, cache=None) as session:
+            async with session.post(
+                "https://www.wattpad.com/auth/login?nextUrl=%2F&_data=routes%2Fauth.login",
+                data={
+                    "username": username.lower(),
+                    "password": password,
+                },  # the username.lower() was for caching. NOTE(review): this session passes cache=None -- confirm the lowering is still wanted
+            ) as response:
+                if response.status != 204:
+                    raise ValueError("Not a 204.")
+
+                # Flatten the cookie objects into a plain name -> value dict.
+                cookies = {
+                    k: v.value
+                    for k, v in response.cookies.items()  # Thanks https://stackoverflow.com/a/32281245
+                }
+
+                if not cookies:
+                    raise ValueError("No cookies.")
+
+                return cookies
+
+
+# --- Models --- #
+
+
+class Language(TypedDict):
+    """Language object of a story; only ``name`` is requested (``language(name)``)."""
+
+    name: str
+
+
+class User(TypedDict):
+    """Author object of a story; only ``username`` is requested (``user(username)``)."""
+
+    username: str
+
+
+class Part(TypedDict):
+    """A story part as requested via ``parts(id,title)``."""
+
+    id: int
+    title: str
+
+
+class Story(TypedDict):
+    """Story metadata; the keys mirror the ``fields=`` list sent to the Wattpad API."""
+
+    id: str
+    title: str
+    createDate: str
+    modifyDate: str
+    language: Language
+    user: User
+    description: str
+    cover: str  # passed to fetch_cover() as the cover image URL
+    completed: bool
+    tags: List[str]
+    mature: bool
+    url: str
+    parts: List[Part]
+    isPaywalled: bool
+
+
+story_ta = TypeAdapter(Story)
+
 # --- API Calls --- #


@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
-async def fetch_story_id(part_id: int, cookies: Optional[dict] = None) -> int:
+async def fetch_story_from_partId(
+    part_id: int, cookies: Optional[dict] = None
+) -> Tuple[str, Story]:
-    """Return a Story ID from a Part ID."""
+    """Return the Story ID and the Story metadata for a Part ID.
+
+    Args:
+        part_id (int): ID of a part that belongs to the story.
+        cookies (Optional[dict]): Authorization cookies. When given, they are sent with the
+            request and the response cache is bypassed.
+
+    Raises:
+        ClientResponseError: Bad status code (retried with exponential backoff, max_time=15).
+
+    Returns:
+        Tuple[str, Story]: ``groupId`` as a string, and the ``group`` object validated as a Story.
+    """
-    async with (
-        CachedSession(headers=headers, cache=cache)
-        if not cookies
-        else ClientSession(headers=headers, cookies=cookies)
+    with start_action(action_type="api_fetch_storyFromPartId"):
+        # Pass the cookies to the session, as retrieve_story does; without them an
+        # authenticated lookup would be sent unauthenticated.
+        async with CachedSession(
+            headers=headers, cookies=cookies, cache=None if cookies else cache
        ) as session:  # Don't cache requests with Cookies.
            async with session.get(
-            f"https://www.wattpad.com/api/v3/story_parts/{part_id}?fields=groupId"
+                f"https://www.wattpad.com/api/v3/story_parts/{part_id}?fields=groupId,group(tags,id,title,createDate,modifyDate,language(name),description,completed,mature,url,isPaywalled,user(username),parts(id,title),cover)"
            ) as response:
                response.raise_for_status()

                body = await response.json()

-    return body["groupId"]
+        return str(body["groupId"]), story_ta.validate_python(body["group"])


@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
-async def retrieve_story(story_id: int, cookies: Optional[dict] = None) -> dict:
+async def retrieve_story(story_id: int, cookies: Optional[dict] = None) -> Story:
    """Taking a story_id, return its information from the Wattpad API."""
-    async with (
-        CachedSession(headers=headers, cache=cache)
-        if not cookies
-        else ClientSession(headers=headers, cookies=cookies)
-    ) as session:  # Don't cache requests with Cookies.
+    with start_action(action_type="api_fetch_story", story_id=story_id):
+        async with CachedSession(
+            headers=headers, cookies=cookies, cache=None if cookies else cache
+        ) as session:
            async with session.get(
                f"https://www.wattpad.com/api/v3/stories/{story_id}?fields=tags,id,title,createDate,modifyDate,language(name),description,completed,mature,url,isPaywalled,user(username),parts(id,title),cover"
            ) as response:
@@ -115,17 +229,16 @@ async def retrieve_story(story_id: int, cookies: Optional[dict] = None) -> dict:

                body = await response.json()

-    return body
+        return story_ta.validate_python(body)


@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
 async def fetch_part_content(part_id: int, cookies: Optional[dict] = None) -> str:
    """Return the HTML Content of a Part."""
-    async with (
-        CachedSession(headers=headers, cache=cache)
-        if not cookies
-        else ClientSession(headers=headers, cookies=cookies)
-    ) as session:  # Don't cache requests with Cookies.
+    with start_action(action_type="api_fetch_partContent", part_id=part_id):
+        async with CachedSession(
+            headers=headers, cookies=cookies, cache=None if cookies else cache
+        ) as session:
            async with session.get(
                f"https://www.wattpad.com/apiv2/?m=storytext&id={part_id}"
            ) as response:
@@ -137,13 +250,12 @@ async def fetch_part_content(part_id: int, cookies: Optional[dict] = None) -> st


@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
-async def fetch_cover(url: str, cookies: Optional[dict] = None) -> bytes:
-    """Fetch image bytes."""
-    async with (
-        CachedSession(headers=headers, cache=cache)
-        if not cookies
-        else ClientSession(headers=headers, cookies=cookies)
-    ) as session:  # Don't cache requests with Cookies.
+async def fetch_cover(url: str) -> bytes:
+    """Fetch cover image bytes."""
+    with start_action(action_type="api_fetch_cover", url=url):
+        async with CachedSession(
+            headers=headers, cache=None
+        ) as session:  # Don't cache images.
            async with session.get(url) as response:
                response.raise_for_status()

@@ -155,7 +267,8 @@ async def fetch_cover(url: str, cookies: Optional[dict] = None) -> bytes:
 # --- EPUB Generation --- #


-def set_metadata(book, data):
+def set_metadata(book: EpubBook, data: Story) -> None:
+    """Set book metadata."""
    book.add_author(data["user"]["username"])

    book.add_metadata("DC", "title", data["title"])
@@ -175,16 +288,20 @@ def set_metadata(book, data):
    )


-async def set_cover(book, data, cookies: Optional[dict] = None):
-    book.set_cover("cover.jpg", await fetch_cover(data["cover"], cookies=cookies))
+async def set_cover(book: EpubBook, data: Story) -> None:
+    """Download the story's cover image and set it as the book cover ("cover.jpg")."""
+    book.set_cover("cover.jpg", await fetch_cover(data["cover"]))
+    # NOTE(review): this title-page chapter is built but not added to the book in the visible
+    # code -- confirm whether it is still needed.
    chapter = epub.EpubHtml(
-        file_name=f"titlepage.xhtml",  # Standard for cover page
+        file_name="titlepage.xhtml",  # Standard for cover page
    )
    chapter.set_content('<img src="cover.jpg">')


 async def add_chapters(
-    book, data, download_images: bool = False, cookies: Optional[dict] = None
+    book: EpubBook,
+    data: Story,
+    download_images: bool = False,
+    cookies: Optional[dict] = None,
 ):
    chapters = []

@@ -202,11 +319,9 @@ async def add_chapters(
        if download_images:
            soup = BeautifulSoup(content, "lxml")

-            async with (
-                CachedSession(headers=headers, cache=cache)
-                if not cookies
-                else ClientSession(headers=headers, cookies=cookies)
-            ) as session:  # Don't cache requests with Cookies.
+            async with CachedSession(
+                headers=headers, cache=None
+            ) as session:  # Don't cache images.
                for idx, image in enumerate(soup.find_all("img")):
                    if not image["src"]:
                        continue
@@ -234,7 +349,7 @@ async def add_chapters(
    for chapter in chapters:
        book.add_item(chapter)

-    book.toc = tuple(chapters)
+    book.toc = chapters

    # Thanks https://github.com/aerkalov/ebooklib/blob/master/samples/09_create_image/create.py
    book.add_item(epub.EpubNcx())
@@ -1,10 +1,12 @@
 """WattpadDownloader API Server."""

 from typing import Optional
+import asyncio
 import tempfile
 from pathlib import Path
 from io import BytesIO
 from enum import Enum
+from eliot import start_action
 from aiohttp import ClientResponseError
 from fastapi import FastAPI, Request
 from fastapi.responses import FileResponse, HTMLResponse, StreamingResponse
@@ -17,9 +19,11 @@ from create_book import (
    add_chapters,
    slugify,
    wp_get_cookies,
-    fetch_story_id,
+    fetch_story_from_partId,
+    logger,
 )

+
 app = FastAPI()
 BUILD_PATH = Path(__file__).parent / "build"

@@ -28,10 +32,46 @@ headers = {
 }


+class RequestCancelledMiddleware:
+    """ASGI middleware that cancels the request handler when the client disconnects."""
+
+    # Thanks https://github.com/fastapi/fastapi/discussions/11360#discussion-6427734
+    def __init__(self, app):
+        self.app = app
+
+    async def __call__(self, scope, receive, send):
+        """Run the wrapped app, cancelling it if an ``http.disconnect`` message arrives.
+
+        Non-HTTP scopes are passed straight through. For HTTP scopes the app reads its
+        messages from a queue that a background poller fills from ``receive``.
+        """
+        if scope["type"] != "http":
+            await self.app(scope, receive, send)
+            return
+
+        # Let's make a shared queue for the request messages
+        queue = asyncio.Queue()
+
+        async def message_poller(sentinel, handler_task):
+            while True:
+                message = await receive()
+                if message["type"] == "http.disconnect":
+                    handler_task.cancel()
+                    return sentinel  # Break the loop
+
+                # Puts the message in the queue
+                await queue.put(message)
+
+        sentinel = object()
+        handler_task = asyncio.create_task(self.app(scope, queue.get, send))
+        # Keep a reference: the event loop only holds weak references to tasks.
+        poller_task = asyncio.create_task(message_poller(sentinel, handler_task))
+
+        try:
+            return await handler_task
+        except asyncio.CancelledError:
+            logger.info("Cancelling task as connection closed")
+        finally:
+            # Stop polling once the handler is done so the poller cannot outlive the request.
+            poller_task.cancel()
+
+
+app.add_middleware(RequestCancelledMiddleware)
+
+
 class DownloadMode(Enum):
+    # Meaning of download_id: a story ID ("story") or a part ID ("part").
    story = "story"
    part = "part"
-    collection = "collection"


@app.get("/")
@@ -68,8 +108,17 @@ async def handle_download(
    mode: DownloadMode = DownloadMode.story,
    username: Optional[str] = None,
    password: Optional[str] = None,
+):
+    with start_action(
+        action_type="download",
+        download_id=download_id,
+        download_images=download_images,
+        mode=mode,
    ):
        if username and not password or password and not username:
+            logger.error(
+                "Username with no Password or Password with no Username provided."
+            )
            return HTMLResponse(
                status_code=422,
                content='Include both the username <u>and</u> password, or neither. Support is available on the <a href="https://discord.gg/P9RHC4KCwd" target="_blank">Discord</a>',
@@ -80,6 +129,7 @@ async def handle_download(
            try:
                cookies = await wp_get_cookies(username=username, password=password)
            except ValueError:
+                logger.error("Invalid username or password.")
                return HTMLResponse(
                    status_code=403,
                    content='Incorrect Username and/or Password. Support is available on the <a href="https://discord.gg/P9RHC4KCwd" target="_blank">Discord</a>',
@@ -90,15 +140,15 @@ async def handle_download(
        match mode:
            case DownloadMode.story:
                story_id = download_id
+                metadata = await retrieve_story(story_id, cookies)
            case DownloadMode.part:
-            story_id = await fetch_story_id(download_id, cookies)
+                story_id, metadata = await fetch_story_from_partId(download_id, cookies)
+
+        logger.info(f"Retrieved story id ({story_id=})")

        book = epub.EpubBook()
-
-    metadata = await retrieve_story(story_id, cookies)
        set_metadata(book, metadata)
-
-    await set_cover(book, metadata, cookies=cookies)
+        await set_cover(book, metadata)

        async for title in add_chapters(
            book, metadata, download_images=download_images, cookies=cookies
@@ -120,7 +170,7 @@ async def handle_download(
            BytesIO(book_data),
            media_type="application/epub+zip",
            headers={
-            "Content-Disposition": f'attachment; filename="{slugify(metadata["title"])}_{story_id}_{"images" if download_images else ""}.epub"'  # Thanks https://stackoverflow.com/a/72729058
+                "Content-Disposition": f'attachment; filename="{slugify(metadata["title"])}_{story_id}{"_images" if download_images else ""}.epub"'  # Thanks https://stackoverflow.com/a/72729058
            },
        )

@@ -131,4 +181,4 @@ app.mount("/", StaticFiles(directory=BUILD_PATH), "static")
 if __name__ == "__main__":
    import uvicorn

-    uvicorn.run(app, host="0.0.0.0", port=80)
+    # The app is given as an import string ("main:app"), which uvicorn requires when workers is set.
+    uvicorn.run("main:app", host="0.0.0.0", port=80, workers=16)
@@ -31,12 +31,13 @@
      input_url = input_url.toLowerCase();

      invalid_url = false;
-      if (!input_url.includes("wattpad.com/")) {
-        invalid_url = true;
-      }

-      // Originally, I was going to call the Wattpad API (wattpad.com/api/v3/stories/${story_id}), but Wattpad kept blocking those requests. I suspect it has something to do with the Origin header, I wasn't able to remove it.
-      // In the future, if this is considered, it would be cool if we could derive the Story ID from a pasted Part URL. Refer to @AaronBenDaniel's https://github.com/AaronBenDaniel/WattpadDownloader/blob/49b29b245188149f2d24c0b1c59e4c7f90f289a9/src/api/src/create_book.py#L156 (https://www.wattpad.com/api/v3/story_parts/{part_id}?fields=url).
+      if (/^\d+$/.test(input_url)) {
+        // All numbers
+        download_id = input_url;
+        mode = "story";
+      } else if (input_url.includes("wattpad.com/")) {
+        // Is a string and contains wattpad.com/

        if (input_url.includes("/story/")) {
          // https://wattpad.com/story/237369078-wattpad-books-presents
@@ -61,6 +62,18 @@
            download_id = "";
          }
        }
+      } else {
+        invalid_url = true;
+      }
+
+      input_url = input_url.match(/\d+/g)?.join("") || "";
+      download_id = input_url;
+
+      // Originally, I was going to call the Wattpad API (wattpad.com/api/v3/stories/${story_id}), but Wattpad kept blocking those requests. I suspect it has something to do with the Origin header, I wasn't able to remove it.
+      // In the future, if this is considered, it would be cool if we could derive the Story ID from a pasted Part URL. Refer to @AaronBenDaniel's https://github.com/AaronBenDaniel/WattpadDownloader/blob/49b29b245188149f2d24c0b1c59e4c7f90f289a9/src/api/src/create_book.py#L156 (https://www.wattpad.com/api/v3/story_parts/{part_id}?fields=url).
+    } else {
+      invalid_url = false;
+      download_id = "";
    }
  }
 </script>