Keep track of previously downloaded files

Refactor image download script to track and skip previously downloaded files based on unique key and consistent naming

- Added logic to use unique key from JSON data for consistent filenames and tracking
- Improved file existence check to skip downloading files that already exist in the directory
- Updated JSON list to store keys of downloaded files for persistent tracking across runs
- Incorporated delay between downloads for smoother processing
- Cleaned up old redundant code and ensured consistency in file naming and tracking
- Added ASCII art to indicate the start of the download process
This commit is contained in:
Nabil Mohammed Nalakath 2024-09-29 02:11:45 +05:30 committed by GitHub
parent 82e50c64f0
commit 815a9eabdc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -1,14 +1,19 @@
// Copyright 2024 Nadim Kobeissi
// Licensed under the WTFPL License
const fs = require(`fs`);
const path = require(`path`);
const fs = require('fs');
const path = require('path');
const crypto = require('crypto');
async function main() {
const url = 'https://storage.googleapis.com/panels-api/data/20240916/media-1a-i-p~s';
const delay = (ms) => {
return new Promise(resolve => setTimeout(resolve, ms));
const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms));
const downloadedListPath = path.join(__dirname, 'downloadedList.json');
let downloadedList = [];
// Load existing downloaded list if it exists
if (fs.existsSync(downloadedListPath)) {
const downloadedData = await fs.promises.readFile(downloadedListPath, 'utf8');
downloadedList = JSON.parse(downloadedData);
}
try {
const response = await fetch(url);
if (!response.ok) {
@ -16,30 +21,63 @@ async function main() {
}
const jsonData = await response.json();
const data = jsonData.data;
if (!data) {
throw new Error('⛔ JSON does not have a "data" property at its root.');
}
const downloadDir = path.join(__dirname, 'downloads');
const downloadDir = path.join(__dirname, 'downloads-1');
if (!fs.existsSync(downloadDir)) {
fs.mkdirSync(downloadDir);
console.info(`📁 Created directory: ${downloadDir}`);
}
let fileIndex = 1;
let downloadedCount = 0;
let skippedCount = 0;
for (const key in data) {
const subproperty = data[key];
if (subproperty && subproperty.dhd) {
// Use the unique key to track downloads and in the file name
const imageUrl = subproperty.dhd;
console.info(`🔍 Found image URL!`);
await delay(100);
const imageName = `${extractNameFromUrl(imageUrl)}-${key}`;
const ext = path.extname(new URL(imageUrl).pathname) || '.jpg';
const filename = `${fileIndex}${ext}`;
const filePath = path.join(downloadDir, filename);
const filePath = path.join(downloadDir, `${imageName}${ext}`);
// Check if the file already exists
if (fs.existsSync(filePath)) {
// If the file exists but the key is missing in the JSON, add it to avoid re-downloading
if (!downloadedList.includes(key)) {
downloadedList.push(key);
console.info(`✅ Found existing file, added key to list: ${filePath}`);
await fs.promises.writeFile(downloadedListPath, JSON.stringify(downloadedList, null, 2));
}
skippedCount++;
} else {
// Download the image only if it doesn't exist
downloadedCount++;
console.info(`🔍 Found new image URL: ${imageUrl}`);
// Download the image
await downloadImage(imageUrl, filePath);
console.info(`🖼️ Saved image to ${filePath}`);
fileIndex++;
// Add the unique key to the downloaded list
downloadedList.push(key);
// Save the updated downloaded list to JSON file
await fs.promises.writeFile(downloadedListPath, JSON.stringify(downloadedList, null, 2));
console.info(`📄 Updated downloaded list with key: ${key}`);
// Delay for the next download
await delay(250);
}
}
}
console.log(`🚀 🚀 🚀 Downloaded ${downloadedCount} new images`);
console.info(`✅ Skipped ${skippedCount} images that already exist`);
} catch (error) {
console.error(`Error: ${error.message}`);
}
@ -55,6 +93,30 @@ async function downloadImage(url, filePath) {
await fs.promises.writeFile(filePath, buffer);
}
/**
 * Derive a stable, filesystem-friendly base name (without extension) from an image URL.
 *
 * The result has the form `<prefix>-<baseName>`:
 *  - `prefix` comes from the first path segment that starts with `a~`: the text
 *    after the first `~` up to the first `_`, lower-cased. When no such segment
 *    exists the prefix is `unknown`.
 *  - `baseName` comes from the last path segment: everything before the first
 *    `.` and before the first `~`, stripped of non-alphanumeric characters and
 *    lower-cased.
 *
 * If the URL cannot be parsed, the error is logged and a deterministic
 * `image-<md5 hex>` name derived from the input is returned instead.
 *
 * @param {string} url - Absolute URL of the image.
 * @returns {string} The derived name; never throws.
 */
function extractNameFromUrl(url) {
  try {
    // `pathname` already excludes the query string and fragment, so no extra
    // stripping of `?...` is required on the last segment.
    const segments = new URL(url).pathname.split('/');
    const lastSegment = segments[segments.length - 1];

    // Prefix part (e.g. 'hytha', 'outrunyouth') lives in a segment shaped like `a~<prefix>_...`.
    const prefixSegment = segments.find((segment) => segment.startsWith('a~'));
    const prefix = prefixSegment
      ? prefixSegment.split('~')[1].split('_')[0].toLowerCase()
      : 'unknown';

    // Keep only what precedes the first '.' and the first '~', then drop
    // every character that is not a letter or digit.
    const baseName = lastSegment
      .split('.')[0]
      .split('~')[0]
      .replace(/[^a-zA-Z0-9]+/g, '')
      .toLowerCase();

    return `${prefix}-${baseName}`;
  } catch (error) {
    console.error(`Error extracting name from URL: ${error.message}, ${url}`);
    // Fallback to a deterministic hash-based name. Coerce to string first:
    // Hash#update rejects non-string/non-buffer input, and a non-string `url`
    // is one of the ways we can end up in this branch.
    const hash = crypto.createHash('md5').update(String(url)).digest('hex');
    return `image-${hash}`;
  }
}
function asciiArt() {
console.info(`
/$$ /$$ /$$ /$$ /$$$$$$$ /$$$$$$ /$$$$$$$