mirror of
https://github.com/nadimkobeissi/mkbsd.git
synced 2025-01-08 19:15:36 +00:00
refactor(mkbsd.py): switch from requests to aiohttp for async operations to improve performance through non-blocking I/O
feat(mkbsd.py): add asyncio for concurrent downloads and improve download speed
chore(requirements.txt): add aiohttp dependency for async HTTP requests
This commit is contained in:
parent
ebd7a47ebe
commit
1cd2c133eb
70
mkbsd.py
70
mkbsd.py
|
@ -1,28 +1,30 @@
|
||||||
import argparse
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
import multiprocessing as mp
|
import multiprocessing as mp
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import time
|
||||||
import zipfile
|
import zipfile
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
|
||||||
from urllib.parse import unquote
|
from urllib.parse import unquote
|
||||||
|
|
||||||
|
import aiohttp
|
||||||
import imagehash
|
import imagehash
|
||||||
import requests
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
|
|
||||||
# python mkbsd.py [--zip] [--zip-name CUSTOM_NAME] [--remove-duplicates]
|
async def fetch_json_data(url):
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
async with session.get(url) as response:
|
||||||
def fetch_json_data(url):
|
if response.status == 200:
|
||||||
response = requests.get(url)
|
text = await response.text()
|
||||||
if response.status_code == 200:
|
try:
|
||||||
return response.json()
|
return json.loads(text)
|
||||||
else:
|
except json.JSONDecodeError:
|
||||||
raise Exception(
|
raise Exception(f"Failed to parse JSON data from {url}")
|
||||||
f"Failed to fetch JSON data. Status code: {response.status_code}"
|
else:
|
||||||
)
|
raise Exception(f"Failed to fetch data. Status code: {response.status}")
|
||||||
|
|
||||||
|
|
||||||
def extract_urls(element):
|
def extract_urls(element):
|
||||||
|
@ -39,19 +41,27 @@ def extract_urls(element):
|
||||||
return urls
|
return urls
|
||||||
|
|
||||||
|
|
||||||
def download_file(url):
|
async def download_file(session, url):
|
||||||
file_name = os.path.basename(unquote(url.split("?")[0]))
|
file_name = os.path.basename(unquote(url.split("?")[0]))
|
||||||
file_name = clean_filename(file_name)
|
file_name = clean_filename(file_name)
|
||||||
file_path = os.path.join("downloads", file_name)
|
file_path = os.path.join("downloads", file_name)
|
||||||
if not os.path.exists(file_path):
|
if not os.path.exists(file_path):
|
||||||
print(f"Downloading {url}")
|
try:
|
||||||
response = requests.get(url, stream=True)
|
async with session.get(url) as response:
|
||||||
with open(file_path, "wb") as f:
|
if response.status == 200:
|
||||||
for chunk in response.iter_content(chunk_size=8192):
|
with open(file_path, "wb") as f:
|
||||||
f.write(chunk)
|
while True:
|
||||||
|
chunk = await response.content.read(8192)
|
||||||
|
if not chunk:
|
||||||
|
break
|
||||||
|
f.write(chunk)
|
||||||
|
return f"Downloaded: {file_name}"
|
||||||
|
else:
|
||||||
|
return f"Failed to download {file_name}: HTTP {response.status}"
|
||||||
|
except Exception as e:
|
||||||
|
return f"Error downloading {file_name}: {str(e)}"
|
||||||
else:
|
else:
|
||||||
print(f"Skipping {url}")
|
return f"Skipped (already exists): {file_name}"
|
||||||
return file_path
|
|
||||||
|
|
||||||
|
|
||||||
def clean_filename(filename):
|
def clean_filename(filename):
|
||||||
|
@ -120,7 +130,7 @@ def remove_duplicates(duplicates):
|
||||||
print(f"Error removing duplicate: {e}")
|
print(f"Error removing duplicate: {e}")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
async def main():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Download images from JSON data and remove duplicates."
|
description="Download images from JSON data and remove duplicates."
|
||||||
)
|
)
|
||||||
|
@ -141,7 +151,7 @@ def main():
|
||||||
|
|
||||||
json_url = "https://storage.googleapis.com/panels-cdn/data/20240730/all.json"
|
json_url = "https://storage.googleapis.com/panels-cdn/data/20240730/all.json"
|
||||||
try:
|
try:
|
||||||
json_data = fetch_json_data(json_url)
|
json_data = await fetch_json_data(json_url)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error: {e}")
|
print(f"Error: {e}")
|
||||||
return
|
return
|
||||||
|
@ -152,8 +162,16 @@ def main():
|
||||||
if not os.path.exists("downloads"):
|
if not os.path.exists("downloads"):
|
||||||
os.makedirs("downloads")
|
os.makedirs("downloads")
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=10) as executor:
|
start_time = time.time()
|
||||||
executor.map(download_file, urls)
|
async with aiohttp.ClientSession() as session:
|
||||||
|
tasks = [download_file(session, url) for url in urls]
|
||||||
|
for batch in [tasks[i : i + 50] for i in range(0, len(tasks), 50)]:
|
||||||
|
results = await asyncio.gather(*batch)
|
||||||
|
for result in results:
|
||||||
|
print(result)
|
||||||
|
|
||||||
|
end_time = time.time()
|
||||||
|
print(f"Download completed in {end_time - start_time:.2f} seconds")
|
||||||
|
|
||||||
if args.remove_duplicates:
|
if args.remove_duplicates:
|
||||||
print("Searching for duplicate images...")
|
print("Searching for duplicate images...")
|
||||||
|
@ -172,4 +190,4 @@ def main():
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
asyncio.run(main())
|
||||||
|
|
|
@ -1 +1,2 @@
|
||||||
imagehash
|
imagehash
|
||||||
|
aiohttp
|
Loading…
Reference in a new issue