From 455b6ff5ecb23dcc45fe511e57385dbe8e2197af Mon Sep 17 00:00:00 2001
From: Roman513
Date: Wed, 25 Dec 2024 17:02:28 +0400
Subject: [PATCH] Make reading from existing files optional with fallback,
 provide cmd parameter and config option

---
 README.md                          |  4 +++
 legendary/cli.py                   |  3 +++
 legendary/core.py                  | 30 +++++++++++++++++-----
 legendary/downloader/mp/manager.py | 40 ++++++++++++++++--------------
 4 files changed, 53 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index 32fa4af..a5cedbd 100644
--- a/README.md
+++ b/README.md
@@ -459,6 +459,8 @@ optional arguments:
   --exclude <prefix>    Exclude files starting with <prefix> (case
                         insensitive)
   --install-tag <tag>   Only download files with the specified install tag
+  --read-files          Read duplicated parts from already saved files instead
+                        of keeping them in RAM
   --enable-reordering   Enable reordering optimization to reduce RAM
                         requirements during download (may have adverse
                         results for some titles)
@@ -670,6 +672,8 @@ log_level = debug
 max_memory = 2048
 ; maximum number of worker processes when downloading (fewer workers will be slower, but also use less system resources)
 max_workers = 8
+; enables reading duplicated data from already saved files during download (decreases RAM usage but increases disk I/O)
+read_files = false
 ; default install directory
 install_dir = /mnt/tank/games
 ; locale override, must be in RFC 1766 format (e.g. "en-US")
diff --git a/legendary/cli.py b/legendary/cli.py
index 9d8430d..0dbc33b 100644
--- a/legendary/cli.py
+++ b/legendary/cli.py
@@ -971,6 +971,7 @@ class LegendaryCLI:
                 file_prefix_filter=args.file_prefix,
                 file_exclude_filter=args.file_exclude_prefix,
                 file_install_tag=args.install_tag,
+                read_files=args.read_files,
                 dl_optimizations=args.order_opt,
                 dl_timeout=args.dl_timeout,
                 repair=args.repair_mode,
@@ -2768,6 +2769,8 @@ def main():
                                 type=str, help='Exclude files starting with <prefix> (case insensitive)')
     install_parser.add_argument('--install-tag', dest='install_tag', action='append', metavar='<tag>',
                                 type=str, help='Only download files with the specified install tag')
+    install_parser.add_argument('--read-files', dest='read_files', action='store_true',
+                                help='Read duplicated parts from already saved files instead of keeping them in memory')
     install_parser.add_argument('--enable-reordering', dest='order_opt', action='store_true',
                                 help='Enable reordering optimization to reduce RAM requirements '
                                      'during download (may have adverse results for some titles)')
diff --git a/legendary/core.py b/legendary/core.py
index 04344e3..4e57475 100644
--- a/legendary/core.py
+++ b/legendary/core.py
@@ -1327,6 +1327,7 @@ class LegendaryCore:
                          override_old_manifest: str = '', override_base_url: str = '',
                          platform: str = 'Windows', file_prefix_filter: list = None,
                          file_exclude_filter: list = None, file_install_tag: list = None,
+                         read_files: bool = False,
                          dl_optimizations: bool = False, dl_timeout: int = 10,
                          repair: bool = False, repair_use_latest: bool = False,
                          disable_delta: bool = False, override_delta_manifest: str = '',
@@ -1487,6 +1488,9 @@
         if not max_shm:
             max_shm = self.lgd.config.getint('Legendary', 'max_memory', fallback=2048)

+        if not read_files:
+            read_files = self.lgd.config.getboolean('Legendary', 'read_files', fallback=False)
+
         if dl_optimizations or is_opt_enabled(game.app_name, new_manifest.meta.build_version):
             self.log.info('Download order optimizations are enabled.')
             process_opt = True
@@ -1499,12 +1503,26 @@
         dlm = DLManager(install_path, base_url, resume_file=resume_file, status_q=status_q,
                         max_shared_memory=max_shm * 1024 * 1024, max_workers=max_workers,
                         dl_timeout=dl_timeout, bind_ip=bind_ip)
-        anlres = dlm.run_analysis(manifest=new_manifest, old_manifest=old_manifest,
-                                  patch=not disable_patching, resume=not force,
-                                  file_prefix_filter=file_prefix_filter,
-                                  file_exclude_filter=file_exclude_filter,
-                                  file_install_tag=file_install_tag,
-                                  processing_optimization=process_opt)
+
+        analysis_kwargs = dict(
+            old_manifest=old_manifest,
+            patch=not disable_patching, resume=not force,
+            file_prefix_filter=file_prefix_filter,
+            file_exclude_filter=file_exclude_filter,
+            file_install_tag=file_install_tag,
+            processing_optimization=process_opt
+        )
+
+        try:
+            anlres = dlm.run_analysis(manifest=new_manifest, **analysis_kwargs, read_files=read_files)
+        except MemoryError:
+            if read_files:
+                raise
+            self.log.warning('Memory error encountered, retrying with file read enabled...')
+            dlm = DLManager(install_path, base_url, resume_file=resume_file, status_q=status_q,
+                            max_shared_memory=max_shm * 1024 * 1024, max_workers=max_workers,
+                            dl_timeout=dl_timeout, bind_ip=bind_ip)
+            anlres = dlm.run_analysis(manifest=new_manifest, **analysis_kwargs, read_files=True)

         prereq = None
         if new_manifest.meta.prereq_ids:
diff --git a/legendary/downloader/mp/manager.py b/legendary/downloader/mp/manager.py
index 2df74c1..b360d45 100644
--- a/legendary/downloader/mp/manager.py
+++ b/legendary/downloader/mp/manager.py
@@ -82,6 +82,7 @@ class DLManager(Process):
     def run_analysis(self, manifest: Manifest, old_manifest: Manifest = None,
                      patch=True, resume=True, file_prefix_filter=None,
                      file_exclude_filter=None, file_install_tag=None,
+                     read_files=False,
                      processing_optimization=False) -> AnalysisResult:
         """
         Run analysis on manifest and old manifest (if not None) and return a result
@@ -94,6 +95,7 @@
         :param file_prefix_filter: Only download files that start with this prefix
         :param file_exclude_filter: Exclude files with this prefix from download
         :param file_install_tag: Only install files with the specified tag
+        :param read_files: Allow reading from already finished files
         :param processing_optimization: Attempt to optimize processing order and RAM usage
         :return: AnalysisResult
         """
@@ -320,25 +322,27 @@

         # determine whether a chunk part is currently in written files
         reusable_written = defaultdict(dict)
-        cur_written_cps = defaultdict(list)
-        for cur_file in fmlist:
-            cur_file_cps = dict()
-            cur_file_offset = 0
-            for cp in cur_file.chunk_parts:
-                key = (cp.guid_num, cp.offset, cp.size)
-                for wr_file_name, wr_file_offset, wr_cp_offset, wr_cp_end_offset in cur_written_cps[cp.guid_num]:
-                    # check if new chunk part is wholly contained in a written chunk part
-                    cur_cp_end_offset = cp.offset + cp.size
-                    if wr_cp_offset <= cp.offset and wr_cp_end_offset >= cur_cp_end_offset:
-                        references[cp.guid_num] -= 1
-                        reuse_offset = wr_file_offset + (cp.offset - wr_cp_offset)
-                        reusable_written[cur_file.filename][key] = (wr_file_name, reuse_offset)
-                        break
-                cur_file_cps[cp.guid_num] = (cur_file.filename, cur_file_offset, cp.offset, cp.offset + cp.size)
-                cur_file_offset += cp.size
+        if read_files:
+            self.log.debug('Analyzing manifest for re-usable chunks in saved files...')
+            cur_written_cps = defaultdict(list)
+            for cur_file in fmlist:
+                cur_file_cps = dict()
+                cur_file_offset = 0
+                for cp in cur_file.chunk_parts:
+                    key = (cp.guid_num, cp.offset, cp.size)
+                    for wr_file_name, wr_file_offset, wr_cp_offset, wr_cp_end_offset in cur_written_cps[cp.guid_num]:
+                        # check if new chunk part is wholly contained in a written chunk part
+                        cur_cp_end_offset = cp.offset + cp.size
+                        if wr_cp_offset <= cp.offset and wr_cp_end_offset >= cur_cp_end_offset:
+                            references[cp.guid_num] -= 1
+                            reuse_offset = wr_file_offset + (cp.offset - wr_cp_offset)
+                            reusable_written[cur_file.filename][key] = (wr_file_name, reuse_offset)
+                            break
+                    cur_file_cps[cp.guid_num] = (cur_file.filename, cur_file_offset, cp.offset, cp.offset + cp.size)
+                    cur_file_offset += cp.size

-            for guid, value in cur_file_cps.items():
-                cur_written_cps[guid].append(value)
+                for guid, value in cur_file_cps.items():
+                    cur_written_cps[guid].append(value)

         last_cache_size = current_cache_size = 0
         # set to determine whether a file is currently cached or not
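
For context, the reuse check this patch gates behind read_files is an interval-containment test: a chunk part may be read back from an already written file only if some previously written chunk part of the same chunk wholly contains it, in which case the chunk's reference count is decremented and a (file, offset) location is recorded. A minimal standalone sketch of that test follows; the names are hypothetical, and ChunkPart is a simplified stand-in for legendary's manifest class, not the real type:

    from collections import namedtuple

    # simplified stand-in for a manifest chunk part: source chunk id,
    # offset inside that chunk, and number of bytes covered
    ChunkPart = namedtuple('ChunkPart', 'guid_num offset size')

    def find_reuse(cp, written_parts):
        """Return (file_name, file_offset) if `cp` is wholly contained in an
        already written chunk part of the same chunk, else None.

        `written_parts` holds (file_name, file_offset, cp_offset, cp_end_offset)
        tuples, playing the role of cur_written_cps[cp.guid_num] in the patch.
        """
        cp_end = cp.offset + cp.size
        for file_name, file_offset, wr_offset, wr_end in written_parts:
            # wholly contained: the written part starts at or before `cp`
            # and ends at or after it
            if wr_offset <= cp.offset and wr_end >= cp_end:
                # translate the chunk-relative offset into a file offset
                return file_name, file_offset + (cp.offset - wr_offset)
        return None

    # bytes 0..100 of chunk 7 were already written to 'a.bin' at file offset 0
    written = [('a.bin', 0, 0, 100)]
    print(find_reuse(ChunkPart(7, 10, 30), written))  # ('a.bin', 10)
    print(find_reuse(ChunkPart(7, 90, 30), written))  # None, runs past the end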
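
The core.py change pairs the new option with a fallback: analysis first runs with the caller's read_files setting, and only if that raises MemoryError does it retry with read_files=True, trading RAM for disk I/O. Note that the patch also constructs a fresh DLManager for the retry, since the failed run may have left the first manager's state partially consumed. The shape of that fallback, reduced to a hedged sketch in which make_manager is a hypothetical factory standing in for the DLManager construction:

    import logging

    log = logging.getLogger('example')

    def analyze_with_fallback(make_manager, read_files=False):
        """Run analysis in-memory first; on MemoryError, retry once with
        read_files=True. Mirrors the fallback shape in core.py, not its API."""
        try:
            return make_manager().run_analysis(read_files=read_files)
        except MemoryError:
            if read_files:
                raise  # already reading from files, nothing left to trade off
            log.warning('Memory error encountered, retrying with file read enabled...')
            return make_manager().run_analysis(read_files=True)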