From f23b2878ccde4e570733e9d225f836c20183fb55 Mon Sep 17 00:00:00 2001 From: riperiperi Date: Tue, 6 Dec 2022 23:15:44 +0000 Subject: [PATCH] Shader: Add fallback for LDG from "ube" buffer ranges. (#4027) We have a conversion from LDG on the compute shader to a special constant buffer binding that's used to exceed hardware limits on compute, but it was only running if the byte offset could be identified. The fallback that checks all of the bindings at runtime only checks the storage buffers. This PR adds checking ube ranges to the LoadGlobal fallback. This extends the changes in #4011 to only check ube entries which are accessed by the shader. Fixes particles affected by the wind in The Legend of Zelda: Breath of the Wild. May fix other weird issues with compute shaders in some games. Try a bunch of games and drivers to make sure they don't blow up loading constants willynilly from searchable buffers. --- .../Shader/DiskCache/DiskCacheHostStorage.cs | 2 +- .../Translation/GlobalMemory.cs | 5 ++ .../Optimizations/GlobalToStorage.cs | 19 +++- .../Translation/Optimizations/Optimizer.cs | 5 +- .../Translation/Rewriter.cs | 88 +++++++++++++++---- .../Translation/ShaderConfig.cs | 14 ++- 6 files changed, 108 insertions(+), 25 deletions(-) diff --git a/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs b/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs index 2bdb85bf0..e0ad30fe0 100644 --- a/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs +++ b/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs @@ -22,7 +22,7 @@ namespace Ryujinx.Graphics.Gpu.Shader.DiskCache private const ushort FileFormatVersionMajor = 1; private const ushort FileFormatVersionMinor = 2; private const uint FileFormatVersionPacked = ((uint)FileFormatVersionMajor << 16) | FileFormatVersionMinor; - private const uint CodeGenVersion = 4037; + private const uint CodeGenVersion = 4028; private const string SharedTocFileName = "shared.toc"; private const string SharedDataFileName = "shared.data"; diff --git a/Ryujinx.Graphics.Shader/Translation/GlobalMemory.cs b/Ryujinx.Graphics.Shader/Translation/GlobalMemory.cs index 1be638684..3915c0d55 100644 --- a/Ryujinx.Graphics.Shader/Translation/GlobalMemory.cs +++ b/Ryujinx.Graphics.Shader/Translation/GlobalMemory.cs @@ -48,5 +48,10 @@ namespace Ryujinx.Graphics.Shader.Translation _ => 0 }; } + + public static int GetConstantUbeOffset(int slot) + { + return UbeBaseOffset + slot * StorageDescSize; + } } } \ No newline at end of file diff --git a/Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs b/Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs index ec8fca1da..c280a6d80 100644 --- a/Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs +++ b/Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs @@ -8,11 +8,14 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations { static class GlobalToStorage { - public static void RunPass(BasicBlock block, ShaderConfig config, ref int sbUseMask) + public static void RunPass(BasicBlock block, ShaderConfig config, ref int sbUseMask, ref int ubeUseMask) { int sbStart = GetStorageBaseCbOffset(config.Stage); int sbEnd = sbStart + StorageDescsSize; + int ubeStart = UbeBaseOffset; + int ubeEnd = UbeBaseOffset + UbeDescsSize; + for (LinkedListNode node = block.Operations.First; node != null; node = node.Next) { for (int index = 0; index < node.Value.SourcesCount; index++) @@ -25,6 +28,16 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations { sbUseMask |= 1 << storageIndex; } + + if (config.Stage == ShaderStage.Compute) + { + int constantIndex = GetStorageIndex(src, ubeStart, ubeEnd); + + if (constantIndex >= 0) + { + ubeUseMask |= 1 << constantIndex; + } + } } if (!(node.Value is Operation operation)) @@ -54,7 +67,7 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations // so NVN "emulates" more constant buffers using global memory access. // Here we try to replace the global access back to a constant buffer // load. - storageIndex = SearchForStorageBase(block, source, UbeBaseOffset, UbeBaseOffset + UbeDescsSize); + storageIndex = SearchForStorageBase(block, source, ubeStart, ubeStart + ubeEnd); if (storageIndex >= 0) { @@ -64,7 +77,7 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations } } - config.SetAccessibleStorageBuffersMask(sbUseMask); + config.SetAccessibleBufferMasks(sbUseMask, ubeUseMask); } private static LinkedListNode ReplaceGlobalWithStorage(BasicBlock block, LinkedListNode node, ShaderConfig config, int storageIndex) diff --git a/Ryujinx.Graphics.Shader/Translation/Optimizations/Optimizer.cs b/Ryujinx.Graphics.Shader/Translation/Optimizations/Optimizer.cs index a1a2054c0..a2219b36d 100644 --- a/Ryujinx.Graphics.Shader/Translation/Optimizations/Optimizer.cs +++ b/Ryujinx.Graphics.Shader/Translation/Optimizations/Optimizer.cs @@ -12,16 +12,17 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations RunOptimizationPasses(blocks); int sbUseMask = 0; + int ubeUseMask = 0; // Those passes are looking for specific patterns and only needs to run once. for (int blkIndex = 0; blkIndex < blocks.Length; blkIndex++) { - GlobalToStorage.RunPass(blocks[blkIndex], config, ref sbUseMask); + GlobalToStorage.RunPass(blocks[blkIndex], config, ref sbUseMask, ref ubeUseMask); BindlessToIndexed.RunPass(blocks[blkIndex], config); BindlessElimination.RunPass(blocks[blkIndex], config); } - config.SetAccessibleStorageBuffersMask(sbUseMask); + config.SetAccessibleBufferMasks(sbUseMask, ubeUseMask); // Run optimizations one last time to remove any code that is now optimizable after above passes. RunOptimizationPasses(blocks); diff --git a/Ryujinx.Graphics.Shader/Translation/Rewriter.cs b/Ryujinx.Graphics.Shader/Translation/Rewriter.cs index c4d2c5d90..0c3c4a57d 100644 --- a/Ryujinx.Graphics.Shader/Translation/Rewriter.cs +++ b/Ryujinx.Graphics.Shader/Translation/Rewriter.cs @@ -1,3 +1,4 @@ +using Ryujinx.Graphics.Shader.Decoders; using Ryujinx.Graphics.Shader.IntermediateRepresentation; using System.Collections.Generic; using System.Diagnostics; @@ -89,12 +90,42 @@ namespace Ryujinx.Graphics.Shader.Translation return local; } + Operand PrependExistingOperation(Operation operation) + { + Operand local = Local(); + + operation.Dest = local; + node.List.AddBefore(node, operation); + + return local; + } + Operand addrLow = operation.GetSource(0); Operand addrHigh = operation.GetSource(1); Operand sbBaseAddrLow = Const(0); Operand sbSlot = Const(0); + Operand alignMask = Const(-config.GpuAccessor.QueryHostStorageBufferOffsetAlignment()); + + Operand BindingRangeCheck(int cbOffset, out Operand baseAddrLow) + { + baseAddrLow = Cbuf(0, cbOffset); + Operand baseAddrHigh = Cbuf(0, cbOffset + 1); + Operand size = Cbuf(0, cbOffset + 2); + + Operand offset = PrependOperation(Instruction.Subtract, addrLow, baseAddrLow); + Operand borrow = PrependOperation(Instruction.CompareLessU32, addrLow, baseAddrLow); + + Operand inRangeLow = PrependOperation(Instruction.CompareLessU32, offset, size); + + Operand addrHighBorrowed = PrependOperation(Instruction.Add, addrHigh, borrow); + + Operand inRangeHigh = PrependOperation(Instruction.CompareEqual, addrHighBorrowed, baseAddrHigh); + + return PrependOperation(Instruction.BitwiseAnd, inRangeLow, inRangeHigh); + } + int sbUseMask = config.AccessibleStorageBuffersMask; while (sbUseMask != 0) @@ -107,20 +138,7 @@ namespace Ryujinx.Graphics.Shader.Translation int cbOffset = GetStorageCbOffset(config.Stage, slot); - Operand baseAddrLow = Cbuf(0, cbOffset); - Operand baseAddrHigh = Cbuf(0, cbOffset + 1); - Operand size = Cbuf(0, cbOffset + 2); - - Operand offset = PrependOperation(Instruction.Subtract, addrLow, baseAddrLow); - Operand borrow = PrependOperation(Instruction.CompareLessU32, addrLow, baseAddrLow); - - Operand inRangeLow = PrependOperation(Instruction.CompareLessU32, offset, size); - - Operand addrHighBorrowed = PrependOperation(Instruction.Add, addrHigh, borrow); - - Operand inRangeHigh = PrependOperation(Instruction.CompareEqual, addrHighBorrowed, baseAddrHigh); - - Operand inRange = PrependOperation(Instruction.BitwiseAnd, inRangeLow, inRangeHigh); + Operand inRange = BindingRangeCheck(cbOffset, out Operand baseAddrLow); sbBaseAddrLow = PrependOperation(Instruction.ConditionalSelect, inRange, baseAddrLow, sbBaseAddrLow); sbSlot = PrependOperation(Instruction.ConditionalSelect, inRange, Const(slot), sbSlot); @@ -128,8 +146,6 @@ namespace Ryujinx.Graphics.Shader.Translation if (config.AccessibleStorageBuffersMask != 0) { - Operand alignMask = Const(-config.GpuAccessor.QueryHostStorageBufferOffsetAlignment()); - Operand baseAddrTrunc = PrependOperation(Instruction.BitwiseAnd, sbBaseAddrLow, alignMask); Operand byteOffset = PrependOperation(Instruction.Subtract, addrLow, baseAddrTrunc); @@ -178,6 +194,46 @@ namespace Ryujinx.Graphics.Shader.Translation storageOp = new Operation(Instruction.Copy, operation.Dest, Const(0)); } + if (operation.Inst == Instruction.LoadGlobal) + { + int cbeUseMask = config.AccessibleConstantBuffersMask; + + while (cbeUseMask != 0) + { + int slot = BitOperations.TrailingZeroCount(cbeUseMask); + int cbSlot = UbeFirstCbuf + slot; + + cbeUseMask &= ~(1 << slot); + + config.SetUsedConstantBuffer(cbSlot); + + Operand previousResult = PrependExistingOperation(storageOp); + + int cbOffset = GetConstantUbeOffset(slot); + + Operand inRange = BindingRangeCheck(cbOffset, out Operand baseAddrLow); + + Operand baseAddrTruncConst = PrependOperation(Instruction.BitwiseAnd, baseAddrLow, alignMask); + Operand byteOffsetConst = PrependOperation(Instruction.Subtract, addrLow, baseAddrTruncConst); + + Operand cbIndex = PrependOperation(Instruction.ShiftRightU32, byteOffsetConst, Const(2)); + + Operand[] sourcesCb = new Operand[operation.SourcesCount]; + + sourcesCb[0] = Const(cbSlot); + sourcesCb[1] = cbIndex; + + for (int index = 2; index < operation.SourcesCount; index++) + { + sourcesCb[index] = operation.GetSource(index); + } + + Operand ldcResult = PrependOperation(Instruction.LoadConstant, sourcesCb); + + storageOp = new Operation(Instruction.ConditionalSelect, operation.Dest, inRange, ldcResult, previousResult); + } + } + for (int index = 0; index < operation.SourcesCount; index++) { operation.SetSource(index, null); diff --git a/Ryujinx.Graphics.Shader/Translation/ShaderConfig.cs b/Ryujinx.Graphics.Shader/Translation/ShaderConfig.cs index 85b56b51f..a79ef6f57 100644 --- a/Ryujinx.Graphics.Shader/Translation/ShaderConfig.cs +++ b/Ryujinx.Graphics.Shader/Translation/ShaderConfig.cs @@ -66,6 +66,7 @@ namespace Ryujinx.Graphics.Shader.Translation public UInt128 ThisInputAttributesComponents { get; private set; } public int AccessibleStorageBuffersMask { get; private set; } + public int AccessibleConstantBuffersMask { get; private set; } private int _usedConstantBuffers; private int _usedStorageBuffers; @@ -100,7 +101,8 @@ namespace Ryujinx.Graphics.Shader.Translation GpuAccessor = gpuAccessor; Options = options; - AccessibleStorageBuffersMask = (1 << GlobalMemory.StorageMaxCount) - 1; + AccessibleStorageBuffersMask = (1 << GlobalMemory.StorageMaxCount) - 1; + AccessibleConstantBuffersMask = (1 << GlobalMemory.UbeMaxCount) - 1; UsedInputAttributesPerPatch = new HashSet(); UsedOutputAttributesPerPatch = new HashSet(); @@ -121,6 +123,11 @@ namespace Ryujinx.Graphics.Shader.Translation OutputTopology = outputTopology; MaxOutputVertices = maxOutputVertices; TransformFeedbackEnabled = gpuAccessor.QueryTransformFeedbackEnabled(); + + if (Stage != ShaderStage.Compute) + { + AccessibleConstantBuffersMask = 0; + } } public ShaderConfig(ShaderHeader header, IGpuAccessor gpuAccessor, TranslationOptions options) : this(gpuAccessor, options) @@ -404,9 +411,10 @@ namespace Ryujinx.Graphics.Shader.Translation UsedFeatures |= flags; } - public void SetAccessibleStorageBuffersMask(int mask) + public void SetAccessibleBufferMasks(int sbMask, int ubeMask) { - AccessibleStorageBuffersMask = mask; + AccessibleStorageBuffersMask = sbMask; + AccessibleConstantBuffersMask = ubeMask; } public void SetUsedConstantBuffer(int slot)