From e744357119a2d305dcd98b389a4b68b6cbdd8804 Mon Sep 17 00:00:00 2001
From: Roman513
Date: Tue, 24 Dec 2024 03:50:14 +0400
Subject: [PATCH] Reuse chunks from the same download if duplicated

Drastically decrease memory footprint
---
 legendary/downloader/mp/manager.py | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/legendary/downloader/mp/manager.py b/legendary/downloader/mp/manager.py
index 90ab37a..2df74c1 100644
--- a/legendary/downloader/mp/manager.py
+++ b/legendary/downloader/mp/manager.py
@@ -318,6 +318,28 @@ class DLManager(Process):
                             analysis_res.reuse_size += cp.size
                             break
 
+        # determine whether a chunk part is currently in written files
+        reusable_written = defaultdict(dict)
+        cur_written_cps = defaultdict(list)
+        for cur_file in fmlist:
+            cur_file_cps = dict()
+            cur_file_offset = 0
+            for cp in cur_file.chunk_parts:
+                key = (cp.guid_num, cp.offset, cp.size)
+                for wr_file_name, wr_file_offset, wr_cp_offset, wr_cp_end_offset in cur_written_cps[cp.guid_num]:
+                    # check if new chunk part is wholly contained in a written chunk part
+                    cur_cp_end_offset = cp.offset + cp.size
+                    if wr_cp_offset <= cp.offset and wr_cp_end_offset >= cur_cp_end_offset:
+                        references[cp.guid_num] -= 1
+                        reuse_offset = wr_file_offset + (cp.offset - wr_cp_offset)
+                        reusable_written[cur_file.filename][key] = (wr_file_name, reuse_offset)
+                        break
+                cur_file_cps[cp.guid_num] = (cur_file.filename, cur_file_offset, cp.offset, cp.offset + cp.size)
+                cur_file_offset += cp.size
+
+            for guid, value in cur_file_cps.items():
+                cur_written_cps[guid].append(value)
+
         last_cache_size = current_cache_size = 0
         # set to determine whether a file is currently cached or not
         cached = set()
@@ -338,6 +360,7 @@ class DLManager(Process):
                 continue
 
             existing_chunks = re_usable.get(current_file.filename, None)
+            written_chunks = reusable_written.get(current_file.filename, None)
             chunk_tasks = []
             reused = 0
 
@@ -345,10 +368,13 @@ class DLManager(Process):
                 ct = ChunkTask(cp.guid_num, cp.offset, cp.size)
 
                 # re-use the chunk from the existing file if we can
-                if existing_chunks and (cp.guid_num, cp.offset, cp.size) in existing_chunks:
+                key = (cp.guid_num, cp.offset, cp.size)
+                if existing_chunks and key in existing_chunks:
                     reused += 1
                     ct.chunk_file = current_file.filename
-                    ct.chunk_offset = existing_chunks[(cp.guid_num, cp.offset, cp.size)]
+                    ct.chunk_offset = existing_chunks[key]
+                elif written_chunks and key in written_chunks:
+                    ct.chunk_file, ct.chunk_offset = written_chunks[key]
                 else:
                     # add to DL list if not already in it
                     if cp.guid_num not in chunks_in_dl_list:
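
Note (not part of the patch): below is a minimal, self-contained Python sketch of the
reuse pass the first hunk adds, for readers who want to see the containment test and
offset arithmetic in isolation. ChunkPart, FileManifest, and find_written_reuse are
illustrative stand-ins, not legendary's real classes; the sketch also omits the
references[cp.guid_num] decrement the patch performs so the shared chunk cache can
release a chunk sooner.

from collections import defaultdict, namedtuple

# Hypothetical stand-ins for legendary's manifest objects; only the fields
# the reuse pass touches are modelled here.
ChunkPart = namedtuple('ChunkPart', 'guid_num offset size')
FileManifest = namedtuple('FileManifest', 'filename chunk_parts')

def find_written_reuse(fmlist):
    """Map each chunk part to (earlier file, offset) when an earlier file in
    the install order already wrote a byte range that wholly contains it."""
    reusable_written = defaultdict(dict)   # filename -> {part key -> source}
    cur_written_cps = defaultdict(list)    # chunk guid -> ranges written so far
    for fm in fmlist:
        cur_file_cps = dict()
        file_offset = 0  # running write position inside this file
        for cp in fm.chunk_parts:
            key = (cp.guid_num, cp.offset, cp.size)
            for wr_name, wr_file_off, wr_cp_off, wr_cp_end in cur_written_cps[cp.guid_num]:
                # wholly contained in an already-written range of the same chunk?
                if wr_cp_off <= cp.offset and wr_cp_end >= cp.offset + cp.size:
                    reusable_written[fm.filename][key] = (
                        wr_name, wr_file_off + (cp.offset - wr_cp_off))
                    break
            # remember where this chunk part lands in the file being written
            cur_file_cps[cp.guid_num] = (fm.filename, file_offset, cp.offset,
                                         cp.offset + cp.size)
            file_offset += cp.size
        # merge only after the file loop, mirroring the patch: a file never
        # reuses ranges from itself via this table
        for guid, value in cur_file_cps.items():
            cur_written_cps[guid].append(value)
    return reusable_written

# b.bin's part [16, 48) of chunk 7 lies inside the [0, 64) range a.bin
# already wrote at file offset 0, so it can be re-read from a.bin at
# offset 16 instead of keeping chunk 7 in the shared in-memory cache.
files = [FileManifest('a.bin', [ChunkPart(7, 0, 64)]),
         FileManifest('b.bin', [ChunkPart(7, 16, 32)])]
print(dict(find_written_reuse(files)))
# -> {'b.bin': {(7, 16, 32): ('a.bin', 16)}}

This is also why the memory footprint drops: chunk parts satisfied from already-written
files become ChunkTasks that read from disk (third hunk), so those chunks need not stay
resident in the cache until every duplicate has been written.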