mirror of
https://github.com/derrod/legendary.git
synced 2025-01-03 04:45:28 +00:00
[downloader] Rework order optimizer to be significantly faster
Also allow for a larger amount of files to be optimized.
This commit is contained in:
parent
9c87f8ab4f
commit
daeee2eb8d
|
@ -195,7 +195,7 @@ class DLManager(Process):
|
||||||
analysis_res.unchanged = len(mc.unchanged)
|
analysis_res.unchanged = len(mc.unchanged)
|
||||||
self.log.debug(f'{analysis_res.unchanged} unchanged files')
|
self.log.debug(f'{analysis_res.unchanged} unchanged files')
|
||||||
|
|
||||||
if processing_optimization and len(manifest.file_manifest_list.elements) > 20_000:
|
if processing_optimization and len(manifest.file_manifest_list.elements) > 100_000:
|
||||||
self.log.warning('Manifest contains too many files, processing optimizations will be disabled.')
|
self.log.warning('Manifest contains too many files, processing optimizations will be disabled.')
|
||||||
processing_optimization = False
|
processing_optimization = False
|
||||||
elif processing_optimization:
|
elif processing_optimization:
|
||||||
|
@ -203,7 +203,6 @@ class DLManager(Process):
|
||||||
|
|
||||||
# count references to chunks for determining runtime cache size later
|
# count references to chunks for determining runtime cache size later
|
||||||
references = Counter()
|
references = Counter()
|
||||||
file_to_chunks = defaultdict(set)
|
|
||||||
fmlist = sorted(manifest.file_manifest_list.elements,
|
fmlist = sorted(manifest.file_manifest_list.elements,
|
||||||
key=lambda a: a.filename.lower())
|
key=lambda a: a.filename.lower())
|
||||||
|
|
||||||
|
@ -217,53 +216,40 @@ class DLManager(Process):
|
||||||
|
|
||||||
for cp in fm.chunk_parts:
|
for cp in fm.chunk_parts:
|
||||||
references[cp.guid_num] += 1
|
references[cp.guid_num] += 1
|
||||||
if processing_optimization:
|
|
||||||
file_to_chunks[fm.filename].add(cp.guid_num)
|
|
||||||
|
|
||||||
if processing_optimization:
|
if processing_optimization:
|
||||||
s_time = time.time()
|
s_time = time.time()
|
||||||
# reorder the file manifest list to group files that share many chunks
|
# reorder the file manifest list to group files that share many chunks
|
||||||
# 5 is mostly arbitrary but has shown in testing to be a good choice
|
# 4 is mostly arbitrary but has shown in testing to be a good choice
|
||||||
min_overlap = 4
|
min_overlap = 4
|
||||||
# enumerate the file list to try and find a "partner" for
|
# ignore files with less than N chunk parts, this speeds things up dramatically
|
||||||
# each file that shares the most chunks with it.
|
cp_threshold = 5
|
||||||
partners = dict()
|
|
||||||
filenames = [fm.filename for fm in fmlist]
|
|
||||||
|
|
||||||
for num, filename in enumerate(filenames[:int((len(filenames) + 1) / 2)]):
|
remaining_files = {fm.filename: {cp.guid_num for cp in fm.chunk_parts}
|
||||||
chunks = file_to_chunks[filename]
|
for fm in fmlist if fm.filename not in mc.unchanged}
|
||||||
partnerlist = list()
|
|
||||||
|
|
||||||
for other_file in filenames[num + 1:]:
|
|
||||||
overlap = len(chunks & file_to_chunks[other_file])
|
|
||||||
if overlap > min_overlap:
|
|
||||||
partnerlist.append(other_file)
|
|
||||||
|
|
||||||
if not partnerlist:
|
|
||||||
continue
|
|
||||||
|
|
||||||
partners[filename] = partnerlist
|
|
||||||
|
|
||||||
# iterate over all the files again and this time around
|
|
||||||
_fmlist = []
|
_fmlist = []
|
||||||
processed = set()
|
|
||||||
for fm in fmlist:
|
|
||||||
if fm.filename in processed:
|
|
||||||
continue
|
|
||||||
_fmlist.append(fm)
|
|
||||||
processed.add(fm.filename)
|
|
||||||
# try to find the file's "partner"
|
|
||||||
f_partners = partners.get(fm.filename, None)
|
|
||||||
if not f_partners:
|
|
||||||
continue
|
|
||||||
# add each partner to list at this point
|
|
||||||
for partner in f_partners:
|
|
||||||
if partner in processed:
|
|
||||||
continue
|
|
||||||
|
|
||||||
partner_fm = manifest.file_manifest_list.get_file_by_path(partner)
|
# iterate over all files that will be downloaded and pair up those that share the most chunks
|
||||||
_fmlist.append(partner_fm)
|
for fm in fmlist:
|
||||||
processed.add(partner)
|
if fm.filename not in remaining_files:
|
||||||
|
continue
|
||||||
|
|
||||||
|
_fmlist.append(fm)
|
||||||
|
f_chunks = remaining_files.pop(fm.filename)
|
||||||
|
if len(f_chunks) < cp_threshold:
|
||||||
|
continue
|
||||||
|
|
||||||
|
best_overlap, match = 0, None
|
||||||
|
for fname, chunks in remaining_files.items():
|
||||||
|
if len(chunks) < cp_threshold:
|
||||||
|
continue
|
||||||
|
overlap = len(f_chunks & chunks)
|
||||||
|
if overlap > min_overlap and overlap > best_overlap:
|
||||||
|
best_overlap, match = overlap, fname
|
||||||
|
|
||||||
|
if match:
|
||||||
|
_fmlist.append(manifest.file_manifest_list.get_file_by_path(match))
|
||||||
|
remaining_files.pop(match)
|
||||||
|
|
||||||
fmlist = _fmlist
|
fmlist = _fmlist
|
||||||
opt_delta = time.time() - s_time
|
opt_delta = time.time() - s_time
|
||||||
|
@ -305,10 +291,7 @@ class DLManager(Process):
|
||||||
# runtime cache requirement by simulating adding/removing from cache during download.
|
# runtime cache requirement by simulating adding/removing from cache during download.
|
||||||
self.log.debug('Creating filetasks and chunktasks...')
|
self.log.debug('Creating filetasks and chunktasks...')
|
||||||
for current_file in fmlist:
|
for current_file in fmlist:
|
||||||
# skip unchanged and empty files
|
if not current_file.chunk_parts:
|
||||||
if current_file.filename in mc.unchanged:
|
|
||||||
continue
|
|
||||||
elif not current_file.chunk_parts:
|
|
||||||
self.tasks.append(FileTask(current_file.filename, empty=True))
|
self.tasks.append(FileTask(current_file.filename, empty=True))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue