feat(api): Parallelize image downloads

This commit is contained in:
TheOnlyWayUp
2025-06-04 23:31:42 +00:00
parent 46319507e4
commit fa55e6e1e9
3 changed files with 60 additions and 40 deletions
+1 -40
View File
@@ -3,42 +3,18 @@ from typing import Optional, Tuple
from io import BytesIO
import backoff
from pydantic import TypeAdapter
from .config import Config, CacheTypes
from .logs import logger
from eliot import start_action
from dotenv import load_dotenv
from aiohttp import ClientResponseError
from aiohttp_client_cache.session import CachedSession
from aiohttp_client_cache import FileBackend, RedisBackend
from .models import Story
from .exceptions import PartNotFoundError, StoryNotFoundError
from .vars import headers, cache
# Load .env before Config() is instantiated.
load_dotenv(override=True)

config = Config()
story_ta = TypeAdapter(Story)

# --- #

# Browser-like user agent handed to the HTTP sessions created in this module.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

# Pick the response-cache backend. Entries expire after 43200 s (12 hours).
# NOTE: with USE_CACHE enabled, a CACHE_TYPE other than file/redis leaves
# `cache` unassigned.
if not config.USE_CACHE:
    cache = None
elif config.CACHE_TYPE == CacheTypes.file:
    cache = FileBackend(use_temp=True, expire_after=43200)
elif config.CACHE_TYPE == CacheTypes.redis:
    cache = RedisBackend(
        cache_name="wpd-aiohttp-cache",
        address=config.REDIS_CONNECTION_URL,
        expire_after=43200,
    )

logger.info(f"Using {cache=}")
async def fetch_cookies(username: str, password: str) -> dict:
# source: https://github.com/TheOnlyWayUp/WP-DM-Export/blob/dd4c7c51cb43f2108e0f63fc10a66cd24a740e4e/src/API/src/main.py#L25-L58
@@ -148,18 +124,3 @@ async def fetch_story_content_zip(
bytes_stream = BytesIO(await response.read())
return bytes_stream
@backoff.on_exception(backoff.expo, ClientResponseError, max_time=15)
async def fetch_image(url: str, should_cache: bool = False) -> bytes:
    """Download the image at ``url`` and return its raw bytes.

    The call is retried with exponential backoff, for at most 15 seconds,
    whenever it raises ``ClientResponseError``.

    Args:
        url: Address of the image to download.
        should_cache: When true, the module-level ``cache`` backend is
            attached to the session; by default images are not cached.

    Returns:
        The response body.

    Raises:
        ClientResponseError: If the response status signals an error and the
            retry budget is exhausted.
    """
    with start_action(action_type="api_fetch_image", url=url):
        # Images bypass the cache unless the caller explicitly opts in.
        session_cache = cache if should_cache else None
        async with CachedSession(headers=headers, cache=session_cache) as session:
            async with session.get(url) as response:
                # Turn 4xx/5xx into an exception so the backoff decorator
                # gets a chance to retry.
                response.raise_for_status()
                return await response.read()
+31
View File
@@ -1,4 +1,11 @@
from typing import List, Tuple
from aiohttp import ClientSession
from bs4 import BeautifulSoup, Tag
from itertools import batched, chain
import asyncio
from .vars import headers
from eliot import start_action
def clean_tree(title: str, id: int, body: str) -> BeautifulSoup:
@@ -48,3 +55,27 @@ def clean_tree(title: str, id: int, body: str) -> BeautifulSoup:
insert_at.append(br_tag)
return new_soup
async def fetch_image(url: str) -> bytes | None:
    """Download the image at ``url``.

    Args:
        url: Address of the image to download.

    Returns:
        The raw response body, or ``None`` when the server answers with a
        non-OK status.
    """
    with start_action(action_type="api_fetch_image", url=url):
        # A plain, uncached session is used on purpose: images are not cached.
        async with ClientSession(headers=headers) as session:
            async with session.get(url) as response:
                return (await response.read()) if response.ok else None
async def download_tree_images(tree: BeautifulSoup) -> Tuple[bytes]:
image_urls = [img["src"] for img in tree.find_all("img")]
downloaded_images: List[bytes] = list(
chain(
await asyncio.gather(*[fetch_image(url) for url in chunk])
for chunk in batched(image_urls, 3)
)
)
return downloaded_images
+28
View File
@@ -0,0 +1,28 @@
from .config import Config, CacheTypes
from aiohttp_client_cache.session import CachedSession
from aiohttp_client_cache import FileBackend, RedisBackend
from dotenv import load_dotenv
from .logs import logger
# Browser-like user agent shared by the HTTP sessions that import it.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

# Load .env before Config() is instantiated.
load_dotenv(override=True)

config = Config()

# Pick the response-cache backend. Entries expire after 43200 s (12 hours).
# NOTE: with USE_CACHE enabled, a CACHE_TYPE other than file/redis leaves
# `cache` unassigned.
if not config.USE_CACHE:
    cache = None
elif config.CACHE_TYPE == CacheTypes.file:
    cache = FileBackend(use_temp=True, expire_after=43200)
elif config.CACHE_TYPE == CacheTypes.redis:
    cache = RedisBackend(
        cache_name="wpd-aiohttp-cache",
        address=config.REDIS_CONNECTION_URL,
        expire_after=43200,
    )

logger.info(f"Using {cache=}")