diff --git a/src/api/src/create_book/create_book.py b/src/api/src/create_book/create_book.py index d94a375..700a47d 100644 --- a/src/api/src/create_book/create_book.py +++ b/src/api/src/create_book/create_book.py @@ -3,42 +3,18 @@ from typing import Optional, Tuple from io import BytesIO import backoff from pydantic import TypeAdapter -from .config import Config, CacheTypes from .logs import logger from eliot import start_action -from dotenv import load_dotenv from aiohttp import ClientResponseError from aiohttp_client_cache.session import CachedSession -from aiohttp_client_cache import FileBackend, RedisBackend from .models import Story from .exceptions import PartNotFoundError, StoryNotFoundError +from .vars import headers, cache -load_dotenv(override=True) - -config = Config() story_ta = TypeAdapter(Story) # --- # -headers = { - "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36" -} - -if config.USE_CACHE: - match config.CACHE_TYPE: - case CacheTypes.file: - cache = FileBackend(use_temp=True, expire_after=43200) # 12 hours - case CacheTypes.redis: - cache = RedisBackend( - cache_name="wpd-aiohttp-cache", - address=config.REDIS_CONNECTION_URL, - expire_after=43200, # 12 hours - ) -else: - cache = None - -logger.info(f"Using {cache=}") - async def fetch_cookies(username: str, password: str) -> dict: # source: https://github.com/TheOnlyWayUp/WP-DM-Export/blob/dd4c7c51cb43f2108e0f63fc10a66cd24a740e4e/src/API/src/main.py#L25-L58 @@ -148,18 +124,3 @@ async def fetch_story_content_zip( bytes_stream = BytesIO(await response.read()) return bytes_stream - - -@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15) -async def fetch_image(url: str, should_cache: bool = False) -> bytes: - """Fetch image bytes.""" - with start_action(action_type="api_fetch_image", url=url): - async with CachedSession( - headers=headers, cache=cache if should_cache else None - ) as session: # Don't cache images. - async with session.get(url) as response: - response.raise_for_status() - - body = await response.read() - - return body diff --git a/src/api/src/create_book/parser.py b/src/api/src/create_book/parser.py index 9a14017..08533f4 100644 --- a/src/api/src/create_book/parser.py +++ b/src/api/src/create_book/parser.py @@ -1,4 +1,11 @@ +from typing import List, Tuple +from aiohttp import ClientSession from bs4 import BeautifulSoup, Tag +from itertools import batched, chain + +import asyncio +from .vars import headers +from eliot import start_action def clean_tree(title: str, id: int, body: str) -> BeautifulSoup: @@ -48,3 +55,27 @@ def clean_tree(title: str, id: int, body: str) -> BeautifulSoup: insert_at.append(br_tag) return new_soup + + +async def fetch_image(url: str) -> bytes | None: + """Fetch image bytes.""" + with start_action(action_type="api_fetch_image", url=url): + async with ClientSession(headers=headers) as session: # Don't cache images. + async with session.get(url) as response: + if not response.ok: + return None + + body = await response.read() + + return body + + +async def download_tree_images(tree: BeautifulSoup) -> Tuple[bytes]: + image_urls = [img["src"] for img in tree.find_all("img")] + downloaded_images: List[bytes] = list( + chain( + await asyncio.gather(*[fetch_image(url) for url in chunk]) + for chunk in batched(image_urls, 3) + ) + ) + return downloaded_images diff --git a/src/api/src/create_book/vars.py b/src/api/src/create_book/vars.py new file mode 100644 index 0000000..8cedbe6 --- /dev/null +++ b/src/api/src/create_book/vars.py @@ -0,0 +1,28 @@ +from .config import Config, CacheTypes +from aiohttp_client_cache.session import CachedSession +from aiohttp_client_cache import FileBackend, RedisBackend +from dotenv import load_dotenv +from .logs import logger + +headers = { + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36" +} + +load_dotenv(override=True) + +config = Config() + +if config.USE_CACHE: + match config.CACHE_TYPE: + case CacheTypes.file: + cache = FileBackend(use_temp=True, expire_after=43200) # 12 hours + case CacheTypes.redis: + cache = RedisBackend( + cache_name="wpd-aiohttp-cache", + address=config.REDIS_CONNECTION_URL, + expire_after=43200, # 12 hours + ) +else: + cache = None + +logger.info(f"Using {cache=}")