refactor(mkbsd.py): switch from requests to aiohttp for non-blocking I/O and better download performance

feat(mkbsd.py): add asyncio for concurrent downloads, improving download speed
chore(requirements.txt): add aiohttp dependency for async HTTP requests
Andre Saddler 2024-09-27 01:35:09 -04:00
parent ebd7a47ebe
commit 1cd2c133eb
2 changed files with 46 additions and 27 deletions
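The heart of the change is a mechanical pattern: each blocking requests.get() call becomes an awaited request on an aiohttp.ClientSession, and the script's entry point moves under asyncio.run(). A minimal standalone sketch of that pattern (the URL is a placeholder, not part of this commit):

import asyncio

import aiohttp


async def fetch_text(url):
    # A ClientSession pools connections; the commit likewise reuses one
    # session for all downloads, which is the usage aiohttp recommends.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            response.raise_for_status()  # raise on 4xx/5xx
            return await response.text()


if __name__ == "__main__":
    # asyncio.run() starts the event loop, mirroring the new
    # asyncio.run(main()) at the bottom of mkbsd.py.
    print(asyncio.run(fetch_text("https://example.com")))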

mkbsd.py

@@ -1,28 +1,30 @@
 import argparse
+import asyncio
+import json
 import multiprocessing as mp
 import os
 import re
+import time
 import zipfile
 from collections import defaultdict
-from concurrent.futures import ThreadPoolExecutor
 from urllib.parse import unquote

+import aiohttp
 import imagehash
-import requests
 from PIL import Image


 # python mkbsd.py [--zip] [--zip-name CUSTOM_NAME] [--remove-duplicates]


-def fetch_json_data(url):
-    response = requests.get(url)
-    if response.status_code == 200:
-        return response.json()
-    else:
-        raise Exception(
-            f"Failed to fetch JSON data. Status code: {response.status_code}"
-        )
+async def fetch_json_data(url):
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url) as response:
+            if response.status == 200:
+                text = await response.text()
+                try:
+                    return json.loads(text)
+                except json.JSONDecodeError:
+                    raise Exception(f"Failed to parse JSON data from {url}")
+            else:
+                raise Exception(f"Failed to fetch data. Status code: {response.status}")


 def extract_urls(element):
@@ -39,19 +41,27 @@ def extract_urls(element):
     return urls


-def download_file(url):
+async def download_file(session, url):
     file_name = os.path.basename(unquote(url.split("?")[0]))
     file_name = clean_filename(file_name)
     file_path = os.path.join("downloads", file_name)
     if not os.path.exists(file_path):
-        print(f"Downloading {url}")
-        response = requests.get(url, stream=True)
-        with open(file_path, "wb") as f:
-            for chunk in response.iter_content(chunk_size=8192):
-                f.write(chunk)
+        try:
+            async with session.get(url) as response:
+                if response.status == 200:
+                    with open(file_path, "wb") as f:
+                        while True:
+                            chunk = await response.content.read(8192)
+                            if not chunk:
+                                break
+                            f.write(chunk)
+                    return f"Downloaded: {file_name}"
+                else:
+                    return f"Failed to download {file_name}: HTTP {response.status}"
+        except Exception as e:
+            return f"Error downloading {file_name}: {str(e)}"
     else:
-        print(f"Skipping {url}")
-    return file_path
+        return f"Skipped (already exists): {file_name}"


 def clean_filename(filename):
@@ -120,7 +130,7 @@ def remove_duplicates(duplicates):
         print(f"Error removing duplicate: {e}")


-def main():
+async def main():
     parser = argparse.ArgumentParser(
         description="Download images from JSON data and remove duplicates."
     )
@@ -141,7 +151,7 @@ def main():

     json_url = "https://storage.googleapis.com/panels-cdn/data/20240730/all.json"
     try:
-        json_data = fetch_json_data(json_url)
+        json_data = await fetch_json_data(json_url)
     except Exception as e:
         print(f"Error: {e}")
         return
@@ -152,8 +162,16 @@ def main():

     if not os.path.exists("downloads"):
         os.makedirs("downloads")
-    with ThreadPoolExecutor(max_workers=10) as executor:
-        executor.map(download_file, urls)
+    start_time = time.time()
+    async with aiohttp.ClientSession() as session:
+        tasks = [download_file(session, url) for url in urls]
+        for batch in [tasks[i : i + 50] for i in range(0, len(tasks), 50)]:
+            results = await asyncio.gather(*batch)
+            for result in results:
+                print(result)
+    end_time = time.time()
+    print(f"Download completed in {end_time - start_time:.2f} seconds")

     if args.remove_duplicates:
         print("Searching for duplicate images...")
@@ -172,4 +190,4 @@ def main():


 if __name__ == "__main__":
-    main()
+    asyncio.run(main())

requirements.txt

@@ -1 +1,2 @@
 imagehash
+aiohttp