video_core/dma_pusher: Fetch the full list of headers at once.

Fetching every u32 from memory individually incurs a large overhead, so fetch all of them as a single block when possible. This reduces the number of Memory::* calls made by the dma_pusher by a factor of 10.
2025-09-25 19:37:10 +00:00 · 2019-02-19 09:44:33 +01:00 · 2019-02-19 09:44:33 +01:00 · 717394c980
parent 4bce08d497
commit 717394c980
2 changed files with 57 additions and 47 deletions
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@ -38,13 +38,13 @@ bool DmaPusher::Step() {
        const auto address = gpu.MemoryManager().GpuToCpuAddress(dma_get);
        ASSERT_MSG(address, "Invalid GPU address");

-        const CommandHeader command_header{Memory::Read32(*address)};
+        GPUVAddr size = dma_put - dma_get;
+        ASSERT_MSG(size % sizeof(CommandHeader) == 0, "Invalid aligned GPU addresses");
+        command_headers.resize(size / sizeof(CommandHeader));

-        dma_get += sizeof(u32);
+        Memory::ReadBlock(*address, command_headers.data(), size);

-        if (!non_main) {
-            dma_mget = dma_get;
-        }
+        for (const CommandHeader& command_header : command_headers) {

            // now, see if we're in the middle of a command
            if (dma_state.length_pending) {
@ -91,6 +91,14 @@ bool DmaPusher::Step() {
                    break;
                }
            }
+        }
+
+        dma_get = dma_put;
+
+        if (!non_main) {
+            // TODO (degasus): This is dead code, as dma_mget is never read.
+            dma_mget = dma_get;
+        }
    } else if (ib_enable && !dma_pushbuffer.empty()) {
        // Current pushbuffer empty, but we have more IB entries to read
        const CommandList& command_list{dma_pushbuffer.front()};
--- a/src/video_core/dma_pusher.h
+++ b/src/video_core/dma_pusher.h
@ -75,6 +75,8 @@ private:

    GPU& gpu;

+    std::vector<CommandHeader> command_headers; ///< Buffer for list of commands fetched at once
+
    std::queue<CommandList> dma_pushbuffer; ///< Queue of command lists to be processed
    std::size_t dma_pushbuffer_subindex{};  ///< Index within a command list within the pushbuffer